Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
99f7d79c
Unverified
Commit
99f7d79c
authored
Sep 26, 2019
by
SparkSnail
Committed by
GitHub
Sep 26, 2019
Browse files
Support experiment view (#1524)
parent
0b7d6260
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
208 additions
and
145 deletions
+208
-145
docs/en_US/Tutorial/Nnictl.md
docs/en_US/Tutorial/Nnictl.md
+30
-0
src/nni_manager/common/experimentStartupInfo.ts
src/nni_manager/common/experimentStartupInfo.ts
+19
-5
src/nni_manager/common/log.ts
src/nni_manager/common/log.ts
+18
-8
src/nni_manager/common/manager.ts
src/nni_manager/common/manager.ts
+6
-2
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+23
-3
src/nni_manager/main.ts
src/nni_manager/main.ts
+16
-8
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+17
-17
tools/nni_cmd/launcher.py
tools/nni_cmd/launcher.py
+72
-101
tools/nni_cmd/nnictl.py
tools/nni_cmd/nnictl.py
+7
-1
No files found.
docs/en_US/Tutorial/Nnictl.md
View file @
99f7d79c
...
@@ -10,6 +10,7 @@ nnictl support commands:
...
@@ -10,6 +10,7 @@ nnictl support commands:
*
[
nnictl create
](
#create
)
*
[
nnictl create
](
#create
)
*
[
nnictl resume
](
#resume
)
*
[
nnictl resume
](
#resume
)
*
[
nnictl view
](
#view
)
*
[
nnictl stop
](
#stop
)
*
[
nnictl stop
](
#stop
)
*
[
nnictl update
](
#update
)
*
[
nnictl update
](
#update
)
*
[
nnictl trial
](
#trial
)
*
[
nnictl trial
](
#trial
)
...
@@ -104,6 +105,35 @@ Debug mode will disable version check function in Trialkeeper.
...
@@ -104,6 +105,35 @@ Debug mode will disable version check function in Trialkeeper.
nnictl resume
[
experiment_id]
--port
8088
nnictl resume
[
experiment_id]
--port
8088
```
```
<a
name=
"view"
></a>

`nnictl view`
*
Description
You can use this command to view a stopped experiment.
*
Usage
```
bash
nnictl view
[
OPTIONS]
```
*
Options
|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|id| True| |The id of the experiment you want to view|
|--port, -p| False| |Rest port of the experiment you want to view|
*
Example
> view an experiment with specified port 8088
```
bash
nnictl view
[
experiment_id]
--port
8088
```
<a
name=
"stop"
></a>
<a
name=
"stop"
></a>

`nnictl stop`

`nnictl stop`
...
...
src/nni_manager/common/experimentStartupInfo.ts
View file @
99f7d79c
...
@@ -33,11 +33,11 @@ class ExperimentStartupInfo {
...
@@ -33,11 +33,11 @@ class ExperimentStartupInfo {
private
initTrialSequenceID
:
number
=
0
;
private
initTrialSequenceID
:
number
=
0
;
private
logDir
:
string
=
''
;
private
logDir
:
string
=
''
;
private
logLevel
:
string
=
''
;
private
logLevel
:
string
=
''
;
private
readonly
:
boolean
=
false
;
public
setStartupInfo
(
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
):
void
{
public
setStartupInfo
(
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
,
readonly
?:
boolean
):
void
{
assert
(
!
this
.
initialized
);
assert
(
!
this
.
initialized
);
assert
(
experimentId
.
trim
().
length
>
0
);
assert
(
experimentId
.
trim
().
length
>
0
);
this
.
newExperiment
=
newExperiment
;
this
.
newExperiment
=
newExperiment
;
this
.
experimentId
=
experimentId
;
this
.
experimentId
=
experimentId
;
this
.
basePort
=
basePort
;
this
.
basePort
=
basePort
;
...
@@ -52,6 +52,10 @@ class ExperimentStartupInfo {
...
@@ -52,6 +52,10 @@ class ExperimentStartupInfo {
if
(
logLevel
!==
undefined
&&
logLevel
.
length
>
1
)
{
if
(
logLevel
!==
undefined
&&
logLevel
.
length
>
1
)
{
this
.
logLevel
=
logLevel
;
this
.
logLevel
=
logLevel
;
}
}
if
(
readonly
!==
undefined
)
{
this
.
readonly
=
readonly
;
}
}
}
public
getExperimentId
():
string
{
public
getExperimentId
():
string
{
...
@@ -84,6 +88,12 @@ class ExperimentStartupInfo {
...
@@ -84,6 +88,12 @@ class ExperimentStartupInfo {
return
this
.
logLevel
;
return
this
.
logLevel
;
}
}
public
isReadonly
():
boolean
{
assert
(
this
.
initialized
);
return
this
.
readonly
;
}
public
setInitTrialSequenceId
(
initSequenceId
:
number
):
void
{
public
setInitTrialSequenceId
(
initSequenceId
:
number
):
void
{
assert
(
this
.
initialized
);
assert
(
this
.
initialized
);
this
.
initTrialSequenceID
=
initSequenceId
;
this
.
initTrialSequenceID
=
initSequenceId
;
...
@@ -121,10 +131,14 @@ function getExperimentStartupInfo(): ExperimentStartupInfo {
...
@@ -121,10 +131,14 @@ function getExperimentStartupInfo(): ExperimentStartupInfo {
}
}
function
setExperimentStartupInfo
(
function
setExperimentStartupInfo
(
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
):
void
{
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
,
readonly
?:
boolean
):
void
{
component
.
get
<
ExperimentStartupInfo
>
(
ExperimentStartupInfo
)
component
.
get
<
ExperimentStartupInfo
>
(
ExperimentStartupInfo
)
.
setStartupInfo
(
newExperiment
,
experimentId
,
basePort
,
logDir
,
logLevel
);
.
setStartupInfo
(
newExperiment
,
experimentId
,
basePort
,
logDir
,
logLevel
,
readonly
);
}
function
isReadonly
():
boolean
{
return
component
.
get
<
ExperimentStartupInfo
>
(
ExperimentStartupInfo
).
isReadonly
();
}
}
export
{
ExperimentStartupInfo
,
getBasePort
,
getExperimentId
,
isNewExperiment
,
getExperimentStartupInfo
,
export
{
ExperimentStartupInfo
,
getBasePort
,
getExperimentId
,
isNewExperiment
,
getExperimentStartupInfo
,
setExperimentStartupInfo
,
setInitTrialSequenceId
,
getInitTrialSequenceId
};
setExperimentStartupInfo
,
setInitTrialSequenceId
,
getInitTrialSequenceId
,
isReadonly
};
src/nni_manager/common/log.ts
View file @
99f7d79c
...
@@ -26,7 +26,7 @@ import { Writable } from 'stream';
...
@@ -26,7 +26,7 @@ import { Writable } from 'stream';
import
{
WritableStreamBuffer
}
from
'
stream-buffers
'
;
import
{
WritableStreamBuffer
}
from
'
stream-buffers
'
;
import
{
format
}
from
'
util
'
;
import
{
format
}
from
'
util
'
;
import
*
as
component
from
'
../common/component
'
;
import
*
as
component
from
'
../common/component
'
;
import
{
getExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
getExperimentStartupInfo
,
isReadonly
}
from
'
./experimentStartupInfo
'
;
import
{
getLogDir
}
from
'
./utils
'
;
import
{
getLogDir
}
from
'
./utils
'
;
const
FATAL
:
number
=
1
;
const
FATAL
:
number
=
1
;
...
@@ -76,6 +76,7 @@ class Logger {
...
@@ -76,6 +76,7 @@ class Logger {
private
level
:
number
=
INFO
;
private
level
:
number
=
INFO
;
private
bufferSerialEmitter
:
BufferSerialEmitter
;
private
bufferSerialEmitter
:
BufferSerialEmitter
;
private
writable
:
Writable
;
private
writable
:
Writable
;
private
readonly
:
boolean
=
false
;
constructor
(
fileName
?:
string
)
{
constructor
(
fileName
?:
string
)
{
let
logFile
:
string
|
undefined
=
fileName
;
let
logFile
:
string
|
undefined
=
fileName
;
...
@@ -95,6 +96,8 @@ class Logger {
...
@@ -95,6 +96,8 @@ class Logger {
if
(
logLevel
!==
undefined
)
{
if
(
logLevel
!==
undefined
)
{
this
.
level
=
logLevel
;
this
.
level
=
logLevel
;
}
}
this
.
readonly
=
isReadonly
();
}
}
public
close
()
{
public
close
()
{
...
@@ -134,14 +137,21 @@ class Logger {
...
@@ -134,14 +137,21 @@ class Logger {
public
fatal
(...
param
:
any
[]):
void
{
public
fatal
(...
param
:
any
[]):
void
{
this
.
log
(
'
FATAL
'
,
param
);
this
.
log
(
'
FATAL
'
,
param
);
}
}
/**
* if the experiment is not in readonly mode, write log content to stream
* @param level log level
* @param param the params to be written
*/
private
log
(
level
:
string
,
param
:
any
[]):
void
{
private
log
(
level
:
string
,
param
:
any
[]):
void
{
const
buffer
:
WritableStreamBuffer
=
new
WritableStreamBuffer
();
if
(
!
this
.
readonly
)
{
buffer
.
write
(
`[
${(
new
Date
()).
toLocaleString
()}
]
${
level
}
`
);
const
buffer
:
WritableStreamBuffer
=
new
WritableStreamBuffer
();
buffer
.
write
(
format
(
param
));
buffer
.
write
(
`[
${(
new
Date
()).
toLocaleString
()}
]
${
level
}
`
);
buffer
.
write
(
'
\n
'
);
buffer
.
write
(
format
(
param
));
buffer
.
end
();
buffer
.
write
(
'
\n
'
);
this
.
bufferSerialEmitter
.
feed
(
buffer
.
getContents
());
buffer
.
end
();
this
.
bufferSerialEmitter
.
feed
(
buffer
.
getContents
());
}
}
}
}
}
...
...
src/nni_manager/common/manager.ts
View file @
99f7d79c
...
@@ -24,6 +24,10 @@ import { TrialJobStatus } from './trainingService';
...
@@ -24,6 +24,10 @@ import { TrialJobStatus } from './trainingService';
type
ProfileUpdateType
=
'
TRIAL_CONCURRENCY
'
|
'
MAX_EXEC_DURATION
'
|
'
SEARCH_SPACE
'
|
'
MAX_TRIAL_NUM
'
;
type
ProfileUpdateType
=
'
TRIAL_CONCURRENCY
'
|
'
MAX_EXEC_DURATION
'
|
'
SEARCH_SPACE
'
|
'
MAX_TRIAL_NUM
'
;
type
ExperimentStatus
=
'
INITIALIZED
'
|
'
RUNNING
'
|
'
ERROR
'
|
'
STOPPING
'
|
'
STOPPED
'
|
'
DONE
'
|
'
NO_MORE_TRIAL
'
|
'
TUNER_NO_MORE_TRIAL
'
;
type
ExperimentStatus
=
'
INITIALIZED
'
|
'
RUNNING
'
|
'
ERROR
'
|
'
STOPPING
'
|
'
STOPPED
'
|
'
DONE
'
|
'
NO_MORE_TRIAL
'
|
'
TUNER_NO_MORE_TRIAL
'
;
namespace
ExperimentStartUpMode
{
export
const
NEW
=
'
new
'
;
export
const
RESUME
=
'
resume
'
;
}
interface
ExperimentParams
{
interface
ExperimentParams
{
authorName
:
string
;
authorName
:
string
;
...
@@ -95,7 +99,7 @@ interface NNIManagerStatus {
...
@@ -95,7 +99,7 @@ interface NNIManagerStatus {
abstract
class
Manager
{
abstract
class
Manager
{
public
abstract
startExperiment
(
experimentParams
:
ExperimentParams
):
Promise
<
string
>
;
public
abstract
startExperiment
(
experimentParams
:
ExperimentParams
):
Promise
<
string
>
;
public
abstract
resumeExperiment
():
Promise
<
void
>
;
public
abstract
resumeExperiment
(
readonly
:
boolean
):
Promise
<
void
>
;
public
abstract
stopExperiment
():
Promise
<
void
>
;
public
abstract
stopExperiment
():
Promise
<
void
>
;
public
abstract
getExperimentProfile
():
Promise
<
ExperimentProfile
>
;
public
abstract
getExperimentProfile
():
Promise
<
ExperimentProfile
>
;
public
abstract
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
;
public
abstract
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
;
...
@@ -115,4 +119,4 @@ abstract class Manager {
...
@@ -115,4 +119,4 @@ abstract class Manager {
public
abstract
getStatus
():
NNIManagerStatus
;
public
abstract
getStatus
():
NNIManagerStatus
;
}
}
export
{
Manager
,
ExperimentParams
,
ExperimentProfile
,
TrialJobStatistics
,
ProfileUpdateType
,
NNIManagerStatus
,
ExperimentStatus
};
export
{
Manager
,
ExperimentParams
,
ExperimentProfile
,
TrialJobStatistics
,
ProfileUpdateType
,
NNIManagerStatus
,
ExperimentStatus
,
ExperimentStartUpMode
};
src/nni_manager/core/nnimanager.ts
View file @
99f7d79c
...
@@ -59,6 +59,7 @@ class NNIManager implements Manager {
...
@@ -59,6 +59,7 @@ class NNIManager implements Manager {
private
waitingTrials
:
string
[];
private
waitingTrials
:
string
[];
private
trialJobs
:
Map
<
string
,
TrialJobDetail
>
;
private
trialJobs
:
Map
<
string
,
TrialJobDetail
>
;
private
trialDataForTuner
:
string
;
private
trialDataForTuner
:
string
;
private
readonly
:
boolean
;
private
trialJobMetricListener
:
(
metric
:
TrialJobMetric
)
=>
void
;
private
trialJobMetricListener
:
(
metric
:
TrialJobMetric
)
=>
void
;
...
@@ -72,6 +73,7 @@ class NNIManager implements Manager {
...
@@ -72,6 +73,7 @@ class NNIManager implements Manager {
this
.
waitingTrials
=
[];
this
.
waitingTrials
=
[];
this
.
trialJobs
=
new
Map
<
string
,
TrialJobDetail
>
();
this
.
trialJobs
=
new
Map
<
string
,
TrialJobDetail
>
();
this
.
trialDataForTuner
=
''
;
this
.
trialDataForTuner
=
''
;
this
.
readonly
=
false
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
dataStore
=
component
.
get
(
DataStore
);
this
.
dataStore
=
component
.
get
(
DataStore
);
...
@@ -88,6 +90,9 @@ class NNIManager implements Manager {
...
@@ -88,6 +90,9 @@ class NNIManager implements Manager {
}
}
public
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
{
public
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not update experiment profile in readonly mode!
'
));
}
switch
(
updateType
)
{
switch
(
updateType
)
{
case
'
TRIAL_CONCURRENCY
'
:
case
'
TRIAL_CONCURRENCY
'
:
this
.
updateTrialConcurrency
(
experimentProfile
.
params
.
trialConcurrency
);
this
.
updateTrialConcurrency
(
experimentProfile
.
params
.
trialConcurrency
);
...
@@ -109,6 +114,9 @@ class NNIManager implements Manager {
...
@@ -109,6 +114,9 @@ class NNIManager implements Manager {
}
}
public
importData
(
data
:
string
):
Promise
<
void
>
{
public
importData
(
data
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not import data in readonly mode!
'
));
}
if
(
this
.
dispatcher
===
undefined
)
{
if
(
this
.
dispatcher
===
undefined
)
{
return
Promise
.
reject
(
return
Promise
.
reject
(
new
Error
(
'
tuner has not been setup
'
)
new
Error
(
'
tuner has not been setup
'
)
...
@@ -124,6 +132,9 @@ class NNIManager implements Manager {
...
@@ -124,6 +132,9 @@ class NNIManager implements Manager {
}
}
public
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
{
public
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not add customized trial job in readonly mode!
'
));
}
if
(
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
if
(
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
return
Promise
.
reject
(
return
Promise
.
reject
(
new
Error
(
'
reach maxTrialNum
'
)
new
Error
(
'
reach maxTrialNum
'
)
...
@@ -136,6 +147,9 @@ class NNIManager implements Manager {
...
@@ -136,6 +147,9 @@ class NNIManager implements Manager {
}
}
public
async
cancelTrialJobByUser
(
trialJobId
:
string
):
Promise
<
void
>
{
public
async
cancelTrialJobByUser
(
trialJobId
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not cancel trial job in readonly mode!
'
));
}
this
.
log
.
info
(
`User cancelTrialJob:
${
trialJobId
}
`
);
this
.
log
.
info
(
`User cancelTrialJob:
${
trialJobId
}
`
);
await
this
.
trainingService
.
cancelTrialJob
(
trialJobId
);
await
this
.
trainingService
.
cancelTrialJob
(
trialJobId
);
await
this
.
dataStore
.
storeTrialJobEvent
(
'
USER_TO_CANCEL
'
,
trialJobId
,
''
);
await
this
.
dataStore
.
storeTrialJobEvent
(
'
USER_TO_CANCEL
'
,
trialJobId
,
''
);
...
@@ -180,13 +194,16 @@ class NNIManager implements Manager {
...
@@ -180,13 +194,16 @@ class NNIManager implements Manager {
return
this
.
experimentProfile
.
id
;
return
this
.
experimentProfile
.
id
;
}
}
public
async
resumeExperiment
():
Promise
<
void
>
{
public
async
resumeExperiment
(
readonly
:
boolean
):
Promise
<
void
>
{
this
.
log
.
info
(
`Resuming experiment:
${
this
.
experimentProfile
.
id
}
`
);
this
.
log
.
info
(
`Resuming experiment:
${
this
.
experimentProfile
.
id
}
`
);
//Fetch back the experiment profile
//Fetch back the experiment profile
const
experimentId
:
string
=
getExperimentId
();
const
experimentId
:
string
=
getExperimentId
();
this
.
experimentProfile
=
await
this
.
dataStore
.
getExperimentProfile
(
experimentId
);
this
.
experimentProfile
=
await
this
.
dataStore
.
getExperimentProfile
(
experimentId
);
this
.
readonly
=
readonly
;
if
(
readonly
)
{
return
Promise
.
resolve
();
}
const
expParams
:
ExperimentParams
=
this
.
experimentProfile
.
params
;
const
expParams
:
ExperimentParams
=
this
.
experimentProfile
.
params
;
setInitTrialSequenceId
(
this
.
experimentProfile
.
maxSequenceId
+
1
);
setInitTrialSequenceId
(
this
.
experimentProfile
.
maxSequenceId
+
1
);
// Set up multiphase config
// Set up multiphase config
...
@@ -196,7 +213,7 @@ class NNIManager implements Manager {
...
@@ -196,7 +213,7 @@ class NNIManager implements Manager {
// Set up versionCheck config
// Set up versionCheck config
if
(
expParams
.
versionCheck
!==
undefined
)
{
if
(
expParams
.
versionCheck
!==
undefined
)
{
this
.
trainingService
.
setClusterMetadata
(
'
version
C
heck
'
,
expParams
.
versionCheck
.
toString
());
this
.
trainingService
.
setClusterMetadata
(
'
version
_c
heck
'
,
expParams
.
versionCheck
.
toString
());
}
}
const
dispatcherCommand
:
string
=
getMsgDispatcherCommand
(
expParams
.
tuner
,
expParams
.
assessor
,
expParams
.
advisor
,
const
dispatcherCommand
:
string
=
getMsgDispatcherCommand
(
expParams
.
tuner
,
expParams
.
assessor
,
expParams
.
advisor
,
...
@@ -247,6 +264,9 @@ class NNIManager implements Manager {
...
@@ -247,6 +264,9 @@ class NNIManager implements Manager {
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not set cluster metadata in readonly mode!
'
));
}
this
.
log
.
info
(
`NNIManager setClusterMetadata, key:
${
key
}
, value:
${
value
}
`
);
this
.
log
.
info
(
`NNIManager setClusterMetadata, key:
${
key
}
, value:
${
value
}
`
);
let
timeoutId
:
NodeJS
.
Timer
;
let
timeoutId
:
NodeJS
.
Timer
;
// TO DO: move timeout value to constants file
// TO DO: move timeout value to constants file
...
...
src/nni_manager/main.ts
View file @
99f7d79c
...
@@ -26,7 +26,7 @@ import * as component from './common/component';
...
@@ -26,7 +26,7 @@ import * as component from './common/component';
import
{
Database
,
DataStore
}
from
'
./common/datastore
'
;
import
{
Database
,
DataStore
}
from
'
./common/datastore
'
;
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
Manager
}
from
'
./common/manager
'
;
import
{
Manager
,
ExperimentStartUpMode
}
from
'
./common/manager
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
,
uniqueString
}
from
'
./common/utils
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
,
uniqueString
}
from
'
./common/utils
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
...
@@ -43,10 +43,10 @@ import {
...
@@ -43,10 +43,10 @@ import {
function
initStartupInfo
(
function
initStartupInfo
(
startExpMode
:
string
,
resumeExperimentId
:
string
,
basePort
:
number
,
startExpMode
:
string
,
resumeExperimentId
:
string
,
basePort
:
number
,
logDirectory
:
string
,
experimentLogLevel
:
string
):
void
{
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
):
void
{
const
createNew
:
boolean
=
(
startExpMode
===
'
new
'
);
const
createNew
:
boolean
=
(
startExpMode
===
ExperimentStartUpMode
.
NEW
);
const
expId
:
string
=
createNew
?
uniqueString
(
8
)
:
resumeExperimentId
;
const
expId
:
string
=
createNew
?
uniqueString
(
8
)
:
resumeExperimentId
;
setExperimentStartupInfo
(
createNew
,
expId
,
basePort
,
logDirectory
,
experimentLogLevel
);
setExperimentStartupInfo
(
createNew
,
expId
,
basePort
,
logDirectory
,
experimentLogLevel
,
readonly
);
}
}
async
function
initContainer
(
platformMode
:
string
):
Promise
<
void
>
{
async
function
initContainer
(
platformMode
:
string
):
Promise
<
void
>
{
...
@@ -108,15 +108,15 @@ if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode
...
@@ -108,15 +108,15 @@ if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode
}
}
const
startMode
:
string
=
parseArg
([
'
--start_mode
'
,
'
-s
'
]);
const
startMode
:
string
=
parseArg
([
'
--start_mode
'
,
'
-s
'
]);
if
(
!
[
'
new
'
,
'
resume
'
].
includes
(
startMode
))
{
if
(
!
[
ExperimentStartUpMode
.
NEW
,
ExperimentStartUpMode
.
RESUME
].
includes
(
startMode
))
{
console
.
log
(
`FATAL: unknown start_mode:
${
startMode
}
`
);
console
.
log
(
`FATAL: unknown start_mode:
${
startMode
}
`
);
usage
();
usage
();
process
.
exit
(
1
);
process
.
exit
(
1
);
}
}
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
if
(
startMode
===
'
resume
'
&&
experimentId
.
trim
().
length
<
1
)
{
if
(
(
startMode
===
ExperimentStartUpMode
.
RESUME
)
&&
experimentId
.
trim
().
length
<
1
)
{
console
.
log
(
`FATAL: cannot resume experiment, invalid experiment_id:
${
experimentId
}
`
);
console
.
log
(
`FATAL: cannot resume
the
experiment, invalid experiment_id:
${
experimentId
}
`
);
usage
();
usage
();
process
.
exit
(
1
);
process
.
exit
(
1
);
}
}
...
@@ -133,7 +133,15 @@ if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
...
@@ -133,7 +133,15 @@ if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
console
.
log
(
`FATAL: invalid log_level:
${
logLevel
}
`
);
console
.
log
(
`FATAL: invalid log_level:
${
logLevel
}
`
);
}
}
initStartupInfo
(
startMode
,
experimentId
,
port
,
logDir
,
logLevel
);
// Parse the optional --readonly / -r flag and convert it to a boolean.
// The previous check, `('true' || 'false').includes(arg)`, collapsed to
// `'true'.includes(arg)`: an explicit "false" was rejected as fatal, while
// arbitrary substrings of "true" (e.g. "t", "ru") were silently accepted.
// NOTE(review): the empty string is still accepted so that launches which omit
// the flag keep working — this assumes parseArg yields '' for a missing option; confirm.
const readonlyArg: string = parseArg(['--readonly', '-r']);
const normalizedReadonlyArg: string = readonlyArg.toLowerCase();
if (!['true', 'false', ''].includes(normalizedReadonlyArg)) {
    console.log(`FATAL: readonly property should only be true or false`);
    usage();
    process.exit(1);
}
// Only an explicit "true" (case-insensitive) enables readonly mode.
const readonly = normalizedReadonlyArg === 'true';
initStartupInfo
(
startMode
,
experimentId
,
port
,
logDir
,
logLevel
,
readonly
);
mkDirP
(
getLogDir
())
mkDirP
(
getLogDir
())
.
then
(
async
()
=>
{
.
then
(
async
()
=>
{
...
...
src/nni_manager/rest_server/restHandler.ts
View file @
99f7d79c
...
@@ -25,9 +25,9 @@ import * as path from 'path';
...
@@ -25,9 +25,9 @@ import * as path from 'path';
import
*
as
component
from
'
../common/component
'
;
import
*
as
component
from
'
../common/component
'
;
import
{
DataStore
,
MetricDataRecord
,
TrialJobInfo
}
from
'
../common/datastore
'
;
import
{
DataStore
,
MetricDataRecord
,
TrialJobInfo
}
from
'
../common/datastore
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../common/errors
'
;
import
{
isNewExperiment
}
from
'
../common/experimentStartupInfo
'
;
import
{
isNewExperiment
,
isReadonly
}
from
'
../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
,
ExperimentStartUpMode
}
from
'
../common/manager
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
...
@@ -86,11 +86,11 @@ class NNIRestHandler {
...
@@ -86,11 +86,11 @@ class NNIRestHandler {
return
router
;
return
router
;
}
}
private
handle_error
(
err
:
Error
,
res
:
Response
,
isFatal
:
boolean
=
false
):
void
{
private
handle_error
(
err
:
Error
,
res
:
Response
,
isFatal
:
boolean
=
false
,
errorCode
:
number
=
500
):
void
{
if
(
err
instanceof
NNIError
&&
err
.
name
===
NNIErrorNames
.
NOT_FOUND
)
{
if
(
err
instanceof
NNIError
&&
err
.
name
===
NNIErrorNames
.
NOT_FOUND
)
{
res
.
status
(
404
);
res
.
status
(
404
);
}
else
{
}
else
{
res
.
status
(
500
);
res
.
status
(
errorCode
);
}
}
res
.
send
({
res
.
send
({
error
:
err
.
message
error
:
err
.
message
...
@@ -169,13 +169,13 @@ class NNIRestHandler {
...
@@ -169,13 +169,13 @@ class NNIRestHandler {
this
.
handle_error
(
err
,
res
);
this
.
handle_error
(
err
,
res
);
});
});
}
else
{
}
else
{
this
.
nniManager
.
resumeExperiment
().
then
(()
=>
{
this
.
nniManager
.
resumeExperiment
(
isReadonly
()
).
then
(()
=>
{
res
.
send
();
res
.
send
();
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
// Resuming the experiment is a step of initialization, so any exception thrown is fatal
// Resuming the experiment is a step of initialization, so any exception thrown is fatal
this
.
handle_error
(
err
,
res
);
this
.
handle_error
(
err
,
res
);
});
});
}
}
});
});
}
}
...
@@ -193,18 +193,18 @@ class NNIRestHandler {
...
@@ -193,18 +193,18 @@ class NNIRestHandler {
router
.
put
(
router
.
put
(
'
/experiment/cluster-metadata
'
,
expressJoi
(
ValidationSchemas
.
SETCLUSTERMETADATA
),
'
/experiment/cluster-metadata
'
,
expressJoi
(
ValidationSchemas
.
SETCLUSTERMETADATA
),
async
(
req
:
Request
,
res
:
Response
)
=>
{
async
(
req
:
Request
,
res
:
Response
)
=>
{
// tslint:disable-next-line:no-any
// tslint:disable-next-line:no-any
const
metadata
:
any
=
req
.
body
;
const
metadata
:
any
=
req
.
body
;
const
keys
:
string
[]
=
Object
.
keys
(
metadata
);
const
keys
:
string
[]
=
Object
.
keys
(
metadata
);
try
{
try
{
for
(
const
key
of
keys
)
{
for
(
const
key
of
keys
)
{
await
this
.
nniManager
.
setClusterMetadata
(
key
,
JSON
.
stringify
(
metadata
[
key
]));
await
this
.
nniManager
.
setClusterMetadata
(
key
,
JSON
.
stringify
(
metadata
[
key
]));
}
res
.
send
();
}
catch
(
err
)
{
// setClusterMetadata is a step of initialization, so any exception thrown is fatal
this
.
handle_error
(
NNIError
.
FromError
(
err
),
res
,
true
);
}
}
res
.
send
();
}
catch
(
err
)
{
// setClusterMetadata is a step of initialization, so any exception thrown is fatal
this
.
handle_error
(
NNIError
.
FromError
(
err
),
res
,
true
);
}
});
});
}
}
...
...
tools/nni_cmd/launcher.py
View file @
99f7d79c
...
@@ -118,12 +118,17 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
...
@@ -118,12 +118,17 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
node_command
=
'node'
node_command
=
'node'
if
sys
.
platform
==
'win32'
:
if
sys
.
platform
==
'win32'
:
node_command
=
os
.
path
.
join
(
entry_dir
[:
-
3
],
'Scripts'
,
'node.exe'
)
node_command
=
os
.
path
.
join
(
entry_dir
[:
-
3
],
'Scripts'
,
'node.exe'
)
cmds
=
[
node_command
,
entry_file
,
'--port'
,
str
(
port
),
'--mode'
,
platform
,
'--start_mode'
,
mode
]
cmds
=
[
node_command
,
entry_file
,
'--port'
,
str
(
port
),
'--mode'
,
platform
]
if
mode
==
'view'
:
cmds
+=
[
'--start_mode'
,
'resume'
]
cmds
+=
[
'--readonly'
,
'true'
]
else
:
cmds
+=
[
'--start_mode'
,
mode
]
if
log_dir
is
not
None
:
if
log_dir
is
not
None
:
cmds
+=
[
'--log_dir'
,
log_dir
]
cmds
+=
[
'--log_dir'
,
log_dir
]
if
log_level
is
not
None
:
if
log_level
is
not
None
:
cmds
+=
[
'--log_level'
,
log_level
]
cmds
+=
[
'--log_level'
,
log_level
]
if
mode
==
'resume'
:
if
mode
in
[
'resume'
,
'view'
]
:
cmds
+=
[
'--experiment_id'
,
experiment_id
]
cmds
+=
[
'--experiment_id'
,
experiment_id
]
stdout_full_path
,
stderr_full_path
=
get_log_path
(
config_file_name
)
stdout_full_path
,
stderr_full_path
=
get_log_path
(
config_file_name
)
with
open
(
stdout_full_path
,
'a+'
)
as
stdout_file
,
open
(
stderr_full_path
,
'a+'
)
as
stderr_file
:
with
open
(
stdout_full_path
,
'a+'
)
as
stdout_file
,
open
(
stderr_full_path
,
'a+'
)
as
stderr_file
:
...
@@ -156,7 +161,6 @@ def set_trial_config(experiment_config, port, config_file_name):
...
@@ -156,7 +161,6 @@ def set_trial_config(experiment_config, port, config_file_name):
def
set_local_config
(
experiment_config
,
port
,
config_file_name
):
def
set_local_config
(
experiment_config
,
port
,
config_file_name
):
'''set local configuration'''
'''set local configuration'''
#set machine_list
request_data
=
dict
()
request_data
=
dict
()
if
experiment_config
.
get
(
'localConfig'
):
if
experiment_config
.
get
(
'localConfig'
):
request_data
[
'local_config'
]
=
experiment_config
[
'localConfig'
]
request_data
[
'local_config'
]
=
experiment_config
[
'localConfig'
]
...
@@ -177,7 +181,7 @@ def set_local_config(experiment_config, port, config_file_name):
...
@@ -177,7 +181,7 @@ def set_local_config(experiment_config, port, config_file_name):
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
return
False
,
err_message
return
False
,
err_message
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
)
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
)
,
None
def
set_remote_config
(
experiment_config
,
port
,
config_file_name
):
def
set_remote_config
(
experiment_config
,
port
,
config_file_name
):
'''Call setClusterMetadata to pass trial'''
'''Call setClusterMetadata to pass trial'''
...
@@ -345,7 +349,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
...
@@ -345,7 +349,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{
'key'
:
'frameworkcontroller_config'
,
'value'
:
experiment_config
[
'frameworkcontrollerConfig'
]})
{
'key'
:
'frameworkcontroller_config'
,
'value'
:
experiment_config
[
'frameworkcontrollerConfig'
]})
request_data
[
'clusterMetaData'
].
append
(
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
response
=
rest_post
(
experiment_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
,
show_error
=
True
)
response
=
rest_post
(
experiment_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
,
show_error
=
True
)
if
check_response
(
response
):
if
check_response
(
response
):
return
response
return
response
...
@@ -357,6 +360,33 @@ def set_experiment(experiment_config, mode, port, config_file_name):
...
@@ -357,6 +360,33 @@ def set_experiment(experiment_config, mode, port, config_file_name):
print_error
(
'Setting experiment error, error message is {}'
.
format
(
response
.
text
))
print_error
(
'Setting experiment error, error message is {}'
.
format
(
response
.
text
))
return
None
return
None
def set_platform_config(platform, experiment_config, port, config_file_name, rest_process):
    '''Send the cluster metadata for the given training-service platform.

    Dispatches to the matching set_*_config helper, each of which is unpacked
    here as a (config_result, err_msg) pair.

    platform: one of 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'
    experiment_config: experiment configuration passed through to the helper
    port: rest server port passed through to the helper
    config_file_name: config file name passed through to the helper
    rest_process: rest server process; its pid is killed when configuration fails

    Raises Exception for an unsupported platform, or when the rest server
    cannot be killed after a configuration failure. Exits the interpreter with
    status 1 when configuration fails and the rest server was killed.
    '''
    print_normal('Setting {0} config...'.format(platform))
    config_result, err_msg = None, None
    if platform == 'local':
        config_result, err_msg = set_local_config(experiment_config, port, config_file_name)
    elif platform == 'remote':
        config_result, err_msg = set_remote_config(experiment_config, port, config_file_name)
    elif platform == 'pai':
        config_result, err_msg = set_pai_config(experiment_config, port, config_file_name)
    elif platform == 'kubeflow':
        config_result, err_msg = set_kubeflow_config(experiment_config, port, config_file_name)
    elif platform == 'frameworkcontroller':
        config_result, err_msg = set_frameworkcontroller_config(experiment_config, port, config_file_name)
    else:
        # The raise already aborts the call; the exit(1) that used to follow it
        # could never run and has been removed.
        raise Exception(ERROR_INFO % 'Unsupported platform!')
    if config_result:
        print_normal('Successfully set {0} config!'.format(platform))
    else:
        print_error('Failed! Error is: {}'.format(err_msg))
        # Configuration failed: stop the rest server before exiting.
        try:
            kill_command(rest_process.pid)
        except Exception:
            raise Exception(ERROR_INFO % 'Rest server stopped!')
        exit(1)
def
launch_experiment
(
args
,
experiment_config
,
mode
,
config_file_name
,
experiment_id
=
None
):
def
launch_experiment
(
args
,
experiment_config
,
mode
,
config_file_name
,
experiment_id
=
None
):
'''follow steps to start rest server and start experiment'''
'''follow steps to start rest server and start experiment'''
nni_config
=
Config
(
config_file_name
)
nni_config
=
Config
(
config_file_name
)
...
@@ -381,8 +411,10 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
...
@@ -381,8 +411,10 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
exit
(
1
)
exit
(
1
)
log_dir
=
experiment_config
[
'logDir'
]
if
experiment_config
.
get
(
'logDir'
)
else
None
log_dir
=
experiment_config
[
'logDir'
]
if
experiment_config
.
get
(
'logDir'
)
else
None
log_level
=
experiment_config
[
'logLevel'
]
if
experiment_config
.
get
(
'logLevel'
)
else
None
log_level
=
experiment_config
[
'logLevel'
]
if
experiment_config
.
get
(
'logLevel'
)
else
None
if
log_level
not
in
[
'trace'
,
'debug'
]
and
(
args
.
debug
or
experiment_config
.
get
(
'debug'
)
is
True
):
# view mode does not need the debug option: viewing an experiment creates no new logs
log_level
=
'debug'
if
mode
!=
'view'
:
if
log_level
not
in
[
'trace'
,
'debug'
]
and
(
args
.
debug
or
experiment_config
.
get
(
'debug'
)
is
True
):
log_level
=
'debug'
# start rest server
# start rest server
rest_process
,
start_time
=
start_rest_server
(
args
.
port
,
experiment_config
[
'trainingServicePlatform'
],
mode
,
config_file_name
,
experiment_id
,
log_dir
,
log_level
)
rest_process
,
start_time
=
start_rest_server
(
args
.
port
,
experiment_config
[
'trainingServicePlatform'
],
mode
,
config_file_name
,
experiment_id
,
log_dir
,
log_level
)
nni_config
.
set_config
(
'restServerPid'
,
rest_process
.
pid
)
nni_config
.
set_config
(
'restServerPid'
,
rest_process
.
pid
)
...
@@ -416,83 +448,14 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
...
@@ -416,83 +448,14 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except
Exception
:
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
exit
(
1
)
if
mode
!=
'view'
:
# set remote config
# set platform configuration
if
experiment_config
[
'trainingServicePlatform'
]
==
'remote'
:
set_platform_config
(
experiment_config
[
'trainingServicePlatform'
],
experiment_config
,
args
.
port
,
config_file_name
,
rest_process
)
print_normal
(
'Setting remote config...'
)
config_result
,
err_msg
=
set_remote_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set remote config!'
)
else
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
# set local config
if
experiment_config
[
'trainingServicePlatform'
]
==
'local'
:
print_normal
(
'Setting local config...'
)
if
set_local_config
(
experiment_config
,
args
.
port
,
config_file_name
):
print_normal
(
'Successfully set local config!'
)
else
:
print_error
(
'Set local config failed!'
)
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
#set pai config
if
experiment_config
[
'trainingServicePlatform'
]
==
'pai'
:
print_normal
(
'Setting pai config...'
)
config_result
,
err_msg
=
set_pai_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set pai config!'
)
else
:
if
err_msg
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
#set kubeflow config
if
experiment_config
[
'trainingServicePlatform'
]
==
'kubeflow'
:
print_normal
(
'Setting kubeflow config...'
)
config_result
,
err_msg
=
set_kubeflow_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set kubeflow config!'
)
else
:
if
err_msg
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
#set frameworkcontroller config
if
experiment_config
[
'trainingServicePlatform'
]
==
'frameworkcontroller'
:
print_normal
(
'Setting frameworkcontroller config...'
)
config_result
,
err_msg
=
set_frameworkcontroller_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set frameworkcontroller config!'
)
else
:
if
err_msg
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
# start a new experiment
# start a new experiment
print_normal
(
'Starting experiment...'
)
print_normal
(
'Starting experiment...'
)
# set debug configuration
# set debug configuration
if
experiment_config
.
get
(
'debug'
)
is
None
:
if
mode
!=
'view'
and
experiment_config
.
get
(
'debug'
)
is
None
:
experiment_config
[
'debug'
]
=
args
.
debug
experiment_config
[
'debug'
]
=
args
.
debug
response
=
set_experiment
(
experiment_config
,
mode
,
args
.
port
,
config_file_name
)
response
=
set_experiment
(
experiment_config
,
mode
,
args
.
port
,
config_file_name
)
if
response
:
if
response
:
...
@@ -519,8 +482,23 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
...
@@ -519,8 +482,23 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
print_normal
(
EXPERIMENT_SUCCESS_INFO
%
(
experiment_id
,
' '
.
join
(
web_ui_url_list
)))
print_normal
(
EXPERIMENT_SUCCESS_INFO
%
(
experiment_id
,
' '
.
join
(
web_ui_url_list
)))
def
resume_experiment
(
args
):
def
create_experiment
(
args
):
'''resume an experiment'''
'''start a new experiment'''
config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
nni_config
=
Config
(
config_file_name
)
config_path
=
os
.
path
.
abspath
(
args
.
config
)
if
not
os
.
path
.
exists
(
config_path
):
print_error
(
'Please set correct config path!'
)
exit
(
1
)
experiment_config
=
get_yml_content
(
config_path
)
validate_all_content
(
experiment_config
,
config_path
)
nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
launch_experiment
(
args
,
experiment_config
,
'new'
,
config_file_name
)
nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
def
manage_stopped_experiment
(
args
,
mode
):
'''view or resume a stopped experiment, depending on mode'''
update_experiment
()
update_experiment
()
experiment_config
=
Experiments
()
experiment_config
=
Experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
...
@@ -528,38 +506,31 @@ def resume_experiment(args):
...
@@ -528,38 +506,31 @@ def resume_experiment(args):
experiment_endTime
=
None
experiment_endTime
=
None
#find the latest stopped experiment
#find the latest stopped experiment
if
not
args
.
id
:
if
not
args
.
id
:
print_error
(
'Please set experiment id!
\n
You could use
\'
nnictl
resume
{id}
\'
to
resume
a stopped experiment!
\n
'
\
print_error
(
'Please set experiment id!
\n
You could use
\'
nnictl
{0}
{id}
\'
to
{0}
a stopped experiment!
\n
'
\
'You could use
\'
nnictl experiment list --all
\'
to show all experiments!'
)
'You could use
\'
nnictl experiment list --all
\'
to show all experiments!'
.
format
(
mode
)
)
exit
(
1
)
exit
(
1
)
else
:
else
:
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
print_error
(
'Id %s not exist!'
%
args
.
id
)
print_error
(
'Id %s not exist!'
%
args
.
id
)
exit
(
1
)
exit
(
1
)
if
experiment_dict
[
args
.
id
][
'status'
]
!=
'STOPPED'
:
if
experiment_dict
[
args
.
id
][
'status'
]
!=
'STOPPED'
:
print_error
(
'Only stopped experiments can be
resumed!'
)
print_error
(
'Only stopped experiments can be
{0}ed!'
.
format
(
mode
)
)
exit
(
1
)
exit
(
1
)
experiment_id
=
args
.
id
experiment_id
=
args
.
id
print_normal
(
'
Resuming
experiment
%s
...'
%
experiment_id
)
print_normal
(
'
{0}
experiment
{1}
...'
.
format
(
mode
,
experiment_id
)
)
nni_config
=
Config
(
experiment_dict
[
experiment_id
][
'fileName'
])
nni_config
=
Config
(
experiment_dict
[
experiment_id
][
'fileName'
])
experiment_config
=
nni_config
.
get_config
(
'experimentConfig'
)
experiment_config
=
nni_config
.
get_config
(
'experimentConfig'
)
experiment_id
=
nni_config
.
get_config
(
'experimentId'
)
experiment_id
=
nni_config
.
get_config
(
'experimentId'
)
new_config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
new_config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
new_nni_config
=
Config
(
new_config_file_name
)
new_nni_config
=
Config
(
new_config_file_name
)
new_nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
new_nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
launch_experiment
(
args
,
experiment_config
,
'resume'
,
new_config_file_name
,
experiment_id
)
launch_experiment
(
args
,
experiment_config
,
mode
,
new_config_file_name
,
experiment_id
)
new_nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
new_nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
def
create_experiment
(
args
):
def
view_experiment
(
args
):
'''start a new experiment'''
'''view a stopped experiment'''
config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
manage_stopped_experiment
(
args
,
'view'
)
nni_config
=
Config
(
config_file_name
)
config_path
=
os
.
path
.
abspath
(
args
.
config
)
if
not
os
.
path
.
exists
(
config_path
):
print_error
(
'Please set correct config path!'
)
exit
(
1
)
experiment_config
=
get_yml_content
(
config_path
)
validate_all_content
(
experiment_config
,
config_path
)
nni_config
.
set_config
(
'experimentConfig'
,
experiment
_config
)
def
resume_
experiment
(
args
):
launch_experiment
(
args
,
experiment_config
,
'new'
,
config_file_name
)
'''resume an experiment'''
nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
manage_stopped_experiment
(
args
,
'resume'
)
\ No newline at end of file
tools/nni_cmd/nnictl.py
View file @
99f7d79c
...
@@ -21,7 +21,7 @@
...
@@ -21,7 +21,7 @@
import
argparse
import
argparse
import
pkg_resources
import
pkg_resources
from
.launcher
import
create_experiment
,
resume_experiment
from
.launcher
import
create_experiment
,
resume_experiment
,
view_experiment
from
.updater
import
update_searchspace
,
update_concurrency
,
update_duration
,
update_trialnum
,
import_data
from
.updater
import
update_searchspace
,
update_concurrency
,
update_duration
,
update_trialnum
,
import_data
from
.nnictl_utils
import
*
from
.nnictl_utils
import
*
from
.package_management
import
*
from
.package_management
import
*
...
@@ -66,6 +66,12 @@ def parse_args():
...
@@ -66,6 +66,12 @@ def parse_args():
parser_resume
.
add_argument
(
'--debug'
,
'-d'
,
action
=
'store_true'
,
help
=
' set debug mode'
)
parser_resume
.
add_argument
(
'--debug'
,
'-d'
,
action
=
'store_true'
,
help
=
' set debug mode'
)
parser_resume
.
set_defaults
(
func
=
resume_experiment
)
parser_resume
.
set_defaults
(
func
=
resume_experiment
)
# parse view command
parser_resume
=
subparsers
.
add_parser
(
'view'
,
help
=
'view a stopped experiment'
)
parser_resume
.
add_argument
(
'id'
,
nargs
=
'?'
,
help
=
'The id of the experiment you want to view'
)
parser_resume
.
add_argument
(
'--port'
,
'-p'
,
default
=
DEFAULT_REST_PORT
,
dest
=
'port'
,
help
=
'the port of restful server'
)
parser_resume
.
set_defaults
(
func
=
view_experiment
)
# parse update command
# parse update command
parser_updater
=
subparsers
.
add_parser
(
'update'
,
help
=
'update the experiment'
)
parser_updater
=
subparsers
.
add_parser
(
'update'
,
help
=
'update the experiment'
)
#add subparsers for parser_updater
#add subparsers for parser_updater
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment