Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
b40e3db7
Commit
b40e3db7
authored
Dec 01, 2020
by
quzha
Browse files
Merge branch 'master' of github.com:Microsoft/nni into dev-retiarii
parents
efa4e31c
95f731e4
Changes
226
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
668 additions
and
50 deletions
+668
-50
ts/nni_manager/common/experimentManager.ts
ts/nni_manager/common/experimentManager.ts
+13
-0
ts/nni_manager/common/manager.ts
ts/nni_manager/common/manager.ts
+1
-0
ts/nni_manager/common/trainingService.ts
ts/nni_manager/common/trainingService.ts
+1
-0
ts/nni_manager/common/utils.ts
ts/nni_manager/common/utils.ts
+32
-2
ts/nni_manager/config/adl/adaptdl-crd-v1.json
ts/nni_manager/config/adl/adaptdl-crd-v1.json
+17
-0
ts/nni_manager/config/adl/adaptdl-nni-configmap-template.json
...ni_manager/config/adl/adaptdl-nni-configmap-template.json
+19
-0
ts/nni_manager/config/adl/adaptdl-pvc-template.json
ts/nni_manager/config/adl/adaptdl-pvc-template.json
+27
-0
ts/nni_manager/config/adl/adaptdl-tensorboard-deployment-template.json
...r/config/adl/adaptdl-tensorboard-deployment-template.json
+55
-0
ts/nni_manager/config/adl/adaptdl-tensorboard-pvc-template.json
..._manager/config/adl/adaptdl-tensorboard-pvc-template.json
+27
-0
ts/nni_manager/config/adl/adaptdljob-template.json
ts/nni_manager/config/adl/adaptdljob-template.json
+109
-0
ts/nni_manager/core/nniDataStore.ts
ts/nni_manager/core/nniDataStore.ts
+6
-6
ts/nni_manager/core/nniExperimentsManager.ts
ts/nni_manager/core/nniExperimentsManager.ts
+171
-0
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+36
-8
ts/nni_manager/core/test/experimentManager.test.ts
ts/nni_manager/core/test/experimentManager.test.ts
+60
-0
ts/nni_manager/core/test/mockedDatastore.ts
ts/nni_manager/core/test/mockedDatastore.ts
+3
-3
ts/nni_manager/core/test/nnimanager.test.ts
ts/nni_manager/core/test/nnimanager.test.ts
+22
-2
ts/nni_manager/main.ts
ts/nni_manager/main.ts
+26
-29
ts/nni_manager/package.json
ts/nni_manager/package.json
+2
-0
ts/nni_manager/rest_server/restHandler.ts
ts/nni_manager/rest_server/restHandler.ts
+24
-0
ts/nni_manager/rest_server/restValidationSchemas.ts
ts/nni_manager/rest_server/restValidationSchemas.ts
+17
-0
No files found.
ts/nni_manager/common/experimentManager.ts
0 → 100644
View file @
b40e3db7
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
/**
 * Contract for the component that maintains the experiments-information
 * record. Concrete implementations are registered elsewhere and resolved
 * by consumers through this abstract type.
 */
abstract class ExperimentManager {
    /**
     * Resolve with the recorded information about experiments.
     */
    public abstract getExperimentsInfo(): Promise<JSON>;

    /**
     * Change the location of the experiments-information record.
     *
     * @param newPath new location of the record
     */
    public abstract setExperimentPath(newPath: string): void;

    /**
     * Set a single field on the record of one experiment.
     *
     * @param experimentId id of the experiment whose record is updated
     * @param key name of the field to set
     * @param value value to store under `key`
     */
    public abstract setExperimentInfo(experimentId: string, key: string, value: any): void;

    /**
     * Stop the manager; the returned promise settles when it has stopped.
     */
    public abstract stop(): Promise<void>;
}
export
{
ExperimentManager
};
ts/nni_manager/common/manager.ts
View file @
b40e3db7
...
@@ -105,6 +105,7 @@ abstract class Manager {
...
@@ -105,6 +105,7 @@ abstract class Manager {
public
abstract
getTrialLog
(
trialJobId
:
string
,
logType
:
LogType
):
Promise
<
string
>
;
public
abstract
getTrialLog
(
trialJobId
:
string
,
logType
:
LogType
):
Promise
<
string
>
;
public
abstract
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
;
public
abstract
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
;
public
abstract
getTrialJobMessage
(
trialJobId
:
string
):
string
|
undefined
;
public
abstract
getStatus
():
NNIManagerStatus
;
public
abstract
getStatus
():
NNIManagerStatus
;
}
}
...
...
ts/nni_manager/common/trainingService.ts
View file @
b40e3db7
...
@@ -42,6 +42,7 @@ interface TrialJobDetail {
...
@@ -42,6 +42,7 @@ interface TrialJobDetail {
readonly
workingDirectory
:
string
;
readonly
workingDirectory
:
string
;
readonly
form
:
TrialJobApplicationForm
;
readonly
form
:
TrialJobApplicationForm
;
isEarlyStopped
?:
boolean
;
isEarlyStopped
?:
boolean
;
message
?:
string
;
}
}
/**
/**
...
...
ts/nni_manager/common/utils.ts
View file @
b40e3db7
...
@@ -11,13 +11,16 @@ import { ChildProcess, spawn, StdioOptions } from 'child_process';
...
@@ -11,13 +11,16 @@ import { ChildProcess, spawn, StdioOptions } from 'child_process';
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
lockfile
from
'
lockfile
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Container
}
from
'
typescript-ioc
'
;
import
{
Container
}
from
'
typescript-ioc
'
;
import
*
as
util
from
'
util
'
;
import
*
as
util
from
'
util
'
;
import
*
as
glob
from
'
glob
'
;
import
{
Database
,
DataStore
}
from
'
./datastore
'
;
import
{
Database
,
DataStore
}
from
'
./datastore
'
;
import
{
ExperimentStartupInfo
,
getExperimentStartupInfo
,
setExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
ExperimentStartupInfo
,
getExperimentStartupInfo
,
setExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
ExperimentParams
,
Manager
}
from
'
./manager
'
;
import
{
ExperimentParams
,
Manager
}
from
'
./manager
'
;
import
{
ExperimentManager
}
from
'
./experimentManager
'
;
import
{
HyperParameters
,
TrainingService
,
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
HyperParameters
,
TrainingService
,
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
logLevelNameMap
}
from
'
./log
'
;
import
{
logLevelNameMap
}
from
'
./log
'
;
...
@@ -43,6 +46,10 @@ function getCheckpointDir(): string {
...
@@ -43,6 +46,10 @@ function getCheckpointDir(): string {
return
path
.
join
(
getExperimentRootDir
(),
'
checkpoint
'
);
return
path
.
join
(
getExperimentRootDir
(),
'
checkpoint
'
);
}
}
/**
 * Build the path of the experiments-information file:
 * `<home directory>/nni-experiments/.experiment`.
 *
 * @returns the joined path; nothing is created or checked on disk
 */
function getExperimentsInfoPath(): string {
    return path.join(os.homedir(), 'nni-experiments', '.experiment');
}
function
mkDirP
(
dirPath
:
string
):
Promise
<
void
>
{
function
mkDirP
(
dirPath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fs
.
exists
(
dirPath
,
(
exists
:
boolean
)
=>
{
fs
.
exists
(
dirPath
,
(
exists
:
boolean
)
=>
{
...
@@ -184,6 +191,7 @@ function prepareUnitTest(): void {
...
@@ -184,6 +191,7 @@ function prepareUnitTest(): void {
Container
.
snapshot
(
DataStore
);
Container
.
snapshot
(
DataStore
);
Container
.
snapshot
(
TrainingService
);
Container
.
snapshot
(
TrainingService
);
Container
.
snapshot
(
Manager
);
Container
.
snapshot
(
Manager
);
Container
.
snapshot
(
ExperimentManager
);
const
logLevel
:
string
=
parseArg
([
'
--log_level
'
,
'
-ll
'
]);
const
logLevel
:
string
=
parseArg
([
'
--log_level
'
,
'
-ll
'
]);
if
(
logLevel
.
length
>
0
&&
!
logLevelNameMap
.
has
(
logLevel
))
{
if
(
logLevel
.
length
>
0
&&
!
logLevelNameMap
.
has
(
logLevel
))
{
...
@@ -211,6 +219,7 @@ function cleanupUnitTest(): void {
...
@@ -211,6 +219,7 @@ function cleanupUnitTest(): void {
Container
.
restore
(
DataStore
);
Container
.
restore
(
DataStore
);
Container
.
restore
(
Database
);
Container
.
restore
(
Database
);
Container
.
restore
(
ExperimentStartupInfo
);
Container
.
restore
(
ExperimentStartupInfo
);
Container
.
restore
(
ExperimentManager
);
}
}
let
cachedipv4Address
:
string
=
''
;
let
cachedipv4Address
:
string
=
''
;
...
@@ -416,8 +425,29 @@ function unixPathJoin(...paths: any[]): string {
...
@@ -416,8 +425,29 @@ function unixPathJoin(...paths: any[]): string {
return
dir
;
return
dir
;
}
}
/**
 * Run `func(...args)` synchronously while holding a lock file for `filePath`.
 *
 * The lock file lives next to `filePath` and is named
 * `<basename>.lock.<pid of this process>`.
 *
 * When `lockOpts.stale` is a number, every existing `<basename>.lock.*` file
 * is inspected first. If any of them is not older than `stale` milliseconds
 * (by mtime), the call fails with `Error('File has been locked.')` without
 * taking the lock. A lock file that matched the glob but no longer exists is
 * also counted as "not expired", i.e. it blocks the call.
 *
 * @param func callback to run under the lock
 * @param filePath file being protected; only its directory and base name are used
 * @param lockOpts options forwarded to `lockfile.lockSync`; `stale` additionally
 *                 enables the pre-check described above
 * @param args arguments forwarded to `func`
 * @returns whatever `func` returns
 * @throws Error('File has been locked.') when a non-expired lock file exists;
 *         any error raised by `lockfile.lockSync`, `func` or `lockfile.unlockSync`
 */
function withLockSync(func: Function, filePath: string, lockOpts: {[key: string]: any}, ...args: any[]): any {
    const lockDir = path.dirname(filePath);
    const lockBase = path.basename(filePath);
    const lockName = path.join(lockDir, lockBase + `.lock.${process.pid}`);

    if (typeof lockOpts.stale === 'number') {
        const staleMs: number = lockOpts.stale;
        const lockFileNames: string[] = glob.sync(path.join(lockDir, lockBase + '.lock.*'));
        const blocked: boolean = lockFileNames.some((fileName) => {
            const isExpired = fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs > staleMs;
            return !isExpired;
        });
        if (blocked) {
            throw new Error('File has been locked.');
        }
    }

    lockfile.lockSync(lockName, lockOpts);
    try {
        return func(...args);
    } finally {
        // Release the lock even when `func` throws; otherwise the lock file
        // would stay behind and block other callers until it turns stale.
        lockfile.unlockSync(lockName);
    }
}
export
{
export
{
countFilesRecursively
,
validateFileNameRecursively
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
countFilesRecursively
,
validateFileNameRecursively
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
getExperimentsInfoPath
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
withLockSync
,
mkDirP
,
mkDirPSync
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomInt
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
mkDirP
,
mkDirPSync
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomInt
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
};
ts/nni_manager/config/adl/adaptdl-crd-v1.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"apiextensions.k8s.io/v1beta1"
,
"kind"
:
"CustomResourceDefinition"
,
"metadata"
:
{
"name"
:
"adaptdljobs.adaptdl.petuum.com"
},
"spec"
:
{
"group"
:
"adaptdl.petuum.com"
,
"version"
:
"v1"
,
"scope"
:
"Namespaced"
,
"names"
:
{
"plural"
:
"adaptdljobs"
,
"singular"
:
"adaptdljob"
,
"kind"
:
"AdaptDLJob"
}
}
}
ts/nni_manager/config/adl/adaptdl-nni-configmap-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"v1"
,
"kind"
:
"ConfigMap"
,
"metadata"
:
{
"name"
:
"<name>"
,
"ownerReferences"
:
[
{
"apiVersion"
:
"adaptdl.petuum.com/v1"
,
"kind"
:
"AdaptDLJob"
,
"name"
:
"<adaptdljob_name>"
,
"uid"
:
"<adaptdljob_uid>"
}
]
},
"data"
:
{
"run.sh"
:
"<run_script>"
,
"cleanup.sh"
:
"<clean_script>"
}
}
ts/nni_manager/config/adl/adaptdl-pvc-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"v1"
,
"kind"
:
"PersistentVolumeClaim"
,
"metadata"
:
{
"name"
:
"<name>"
,
"ownerReferences"
:
[
{
"apiVersion"
:
"adaptdl.petuum.com/v1"
,
"kind"
:
"AdaptDLJob"
,
"name"
:
"<adaptdljob_name>"
,
"uid"
:
"<adaptdljob_uid>"
}
]
},
"spec"
:
{
"accessModes"
:
[
"ReadWriteMany"
],
"resources"
:
{
"requests"
:
{
"storage"
:
"<storage_size>"
}
},
"storageClassName"
:
"<storage_class>"
,
"volumeMode"
:
"Filesystem"
}
}
ts/nni_manager/config/adl/adaptdl-tensorboard-deployment-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"apps/v1"
,
"kind"
:
"Deployment"
,
"metadata"
:
{
"name"
:
"<name>"
,
"labels"
:
{
"expId"
:
"<exp_id>"
}
},
"spec"
:
{
"selector"
:
{
"matchLabels"
:
{
"app"
:
"<name>"
}
},
"replicas"
:
1
,
"template"
:
{
"metadata"
:
{
"labels"
:
{
"app"
:
"<name>"
}
},
"spec"
:
{
"containers"
:
[
{
"command"
:
[
"tensorboard"
],
"args"
:
[
"--host=0.0.0.0"
,
"--logdir=/adaptdl/tensorboard"
,
"--port=6006"
],
"image"
:
"tensorflow/tensorflow"
,
"name"
:
"tensorboard"
,
"ports"
:
[
{
"containerPort"
:
6006
}
],
"volumeMounts"
:
[
{
"mountPath"
:
"/adaptdl/tensorboard"
,
"name"
:
"adaptdl-tensorboard-pvc"
,
"subPath"
:
"adaptdl/tensorboard"
}
]
}
],
"volumes"
:
[
{
"name"
:
"adaptdl-tensorboard-pvc"
,
"persistentVolumeClaim"
:
{
"claimName"
:
"<adaptdl_tensorflow_pvc_name>"
}
}
]
}
}
}
}
\ No newline at end of file
ts/nni_manager/config/adl/adaptdl-tensorboard-pvc-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"v1"
,
"kind"
:
"PersistentVolumeClaim"
,
"metadata"
:
{
"name"
:
"<name>"
,
"ownerReferences"
:
[
{
"apiVersion"
:
"apps/v1"
,
"kind"
:
"Deployment"
,
"name"
:
"<adaptdl_tensorboard_name>"
,
"uid"
:
"<adaptdl_tensorboard_uid>"
}
]
},
"spec"
:
{
"accessModes"
:
[
"ReadWriteMany"
],
"resources"
:
{
"requests"
:
{
"storage"
:
"<storage_size>"
}
},
"storageClassName"
:
"<storage_class>"
,
"volumeMode"
:
"Filesystem"
}
}
ts/nni_manager/config/adl/adaptdljob-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"adaptdl.petuum.com/v1"
,
"kind"
:
"AdaptDLJob"
,
"metadata"
:
{
"name"
:
"<name>"
,
"labels"
:
{
"app"
:
"<app_name>"
,
"expId"
:
"<exp_id>"
,
"trialId"
:
"<trial_id>"
}
},
"spec"
:
{
"preemptible"
:
false
,
"template"
:
{
"spec"
:
{
"containers"
:
[
{
"lifecycle"
:
{
"preStop"
:
{
"exec"
:
{
"command"
:
[
"/cleanup.sh"
]
}
}
},
"command"
:
[
"/run.sh"
],
"env"
:
[
{
"name"
:
"ADAPTDL_CHECKPOINT_PATH"
,
"value"
:
"/adaptdl/checkpoint"
},
{
"name"
:
"ADAPTDL_TENSORBOARD_LOGDIR"
,
"value"
:
"/adaptdl/tensorboard"
},
{
"name"
:
"ADAPTDL_SHARE_PATH"
,
"value"
:
"/adaptdl/share"
}
],
"image"
:
"<image>"
,
"imagePullPolicy"
:
"Always"
,
"name"
:
"main"
,
"resources"
:
{
"requests"
:
{
"memory"
:
"<memorySize>"
,
"cpu"
:
"<cpuNum>"
},
"limits"
:
{
"nvidia.com/gpu"
:
1
}
},
"volumeMounts"
:
[
{
"mountPath"
:
"/adaptdl/checkpoint"
,
"name"
:
"adaptdl-pvc"
,
"subPath"
:
"adaptdl/checkpoint"
},
{
"mountPath"
:
"/adaptdl/share"
,
"name"
:
"adaptdl-pvc"
,
"subPath"
:
"adaptdl/share"
},
{
"mountPath"
:
"/adaptdl/tensorboard"
,
"name"
:
"adaptdl-tensorboard-pvc"
,
"subPath"
:
"adaptdl/tensorboard"
},
{
"mountPath"
:
"/cleanup.sh"
,
"name"
:
"adaptdl-nni-configmap"
,
"subPath"
:
"cleanup.sh"
},
{
"mountPath"
:
"/run.sh"
,
"name"
:
"adaptdl-nni-configmap"
,
"subPath"
:
"run.sh"
}
]
}
],
"imagePullSecrets"
:
[],
"volumes"
:
[
{
"name"
:
"adaptdl-pvc"
,
"persistentVolumeClaim"
:
{
"claimName"
:
"<adaptdl_pvc_name>"
}
},
{
"name"
:
"adaptdl-tensorboard-pvc"
,
"persistentVolumeClaim"
:
{
"claimName"
:
"<adaptdl_tensorflow_pvc_name>"
}
},
{
"name"
:
"adaptdl-nni-configmap"
,
"configMap"
:
{
"name"
:
"<adaptdl_nni_configmap_name>"
,
"defaultMode"
:
511
}
}
]
}
}
}
}
ts/nni_manager/core/nniDataStore.ts
View file @
b40e3db7
...
@@ -168,7 +168,7 @@ class NNIDataStore implements DataStore {
...
@@ -168,7 +168,7 @@ class NNIDataStore implements DataStore {
const
oneEntry
:
ExportedDataFormat
=
{
const
oneEntry
:
ExportedDataFormat
=
{
parameter
:
parameters
.
parameters
,
parameter
:
parameters
.
parameters
,
value
:
JSON
.
parse
(
job
.
finalMetricData
[
0
].
data
),
value
:
JSON
.
parse
(
job
.
finalMetricData
[
0
].
data
),
id
:
job
.
i
d
trialJobId
:
job
.
trialJobI
d
};
};
exportedData
.
push
(
oneEntry
);
exportedData
.
push
(
oneEntry
);
}
else
{
}
else
{
...
@@ -188,7 +188,7 @@ class NNIDataStore implements DataStore {
...
@@ -188,7 +188,7 @@ class NNIDataStore implements DataStore {
const
oneEntry
:
ExportedDataFormat
=
{
const
oneEntry
:
ExportedDataFormat
=
{
parameter
:
value
,
parameter
:
value
,
value
:
metricValue
,
value
:
metricValue
,
id
:
job
.
i
d
trialJobId
:
job
.
trialJobI
d
};
};
exportedData
.
push
(
oneEntry
);
exportedData
.
push
(
oneEntry
);
}
}
...
@@ -229,7 +229,7 @@ class NNIDataStore implements DataStore {
...
@@ -229,7 +229,7 @@ class NNIDataStore implements DataStore {
}
}
if
(
!
(
status
!==
undefined
&&
jobInfo
.
status
!==
status
))
{
if
(
!
(
status
!==
undefined
&&
jobInfo
.
status
!==
status
))
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
jobInfo
.
finalMetricData
=
finalMetricsMap
.
get
(
jobInfo
.
i
d
);
jobInfo
.
finalMetricData
=
finalMetricsMap
.
get
(
jobInfo
.
trialJobI
d
);
}
}
result
.
push
(
jobInfo
);
result
.
push
(
jobInfo
);
}
}
...
@@ -320,7 +320,7 @@ class NNIDataStore implements DataStore {
...
@@ -320,7 +320,7 @@ class NNIDataStore implements DataStore {
jobInfo
=
map
.
get
(
record
.
trialJobId
);
jobInfo
=
map
.
get
(
record
.
trialJobId
);
}
else
{
}
else
{
jobInfo
=
{
jobInfo
=
{
i
d
:
record
.
trialJobId
,
trialJobI
d
:
record
.
trialJobId
,
status
:
this
.
getJobStatusByLatestEvent
(
'
UNKNOWN
'
,
record
.
event
),
status
:
this
.
getJobStatusByLatestEvent
(
'
UNKNOWN
'
,
record
.
event
),
hyperParameters
:
[]
hyperParameters
:
[]
};
};
...
@@ -364,14 +364,14 @@ class NNIDataStore implements DataStore {
...
@@ -364,14 +364,14 @@ class NNIDataStore implements DataStore {
const
newHParam
:
any
=
this
.
parseHyperParameter
(
record
.
data
);
const
newHParam
:
any
=
this
.
parseHyperParameter
(
record
.
data
);
if
(
newHParam
!==
undefined
)
{
if
(
newHParam
!==
undefined
)
{
if
(
jobInfo
.
hyperParameters
!==
undefined
)
{
if
(
jobInfo
.
hyperParameters
!==
undefined
)
{
let
hParamIds
:
Set
<
number
>
|
undefined
=
hParamIdMap
.
get
(
jobInfo
.
i
d
);
let
hParamIds
:
Set
<
number
>
|
undefined
=
hParamIdMap
.
get
(
jobInfo
.
trialJobI
d
);
if
(
hParamIds
===
undefined
)
{
if
(
hParamIds
===
undefined
)
{
hParamIds
=
new
Set
();
hParamIds
=
new
Set
();
}
}
if
(
!
hParamIds
.
has
(
newHParam
.
parameter_index
))
{
if
(
!
hParamIds
.
has
(
newHParam
.
parameter_index
))
{
jobInfo
.
hyperParameters
.
push
(
JSON
.
stringify
(
newHParam
));
jobInfo
.
hyperParameters
.
push
(
JSON
.
stringify
(
newHParam
));
hParamIds
.
add
(
newHParam
.
parameter_index
);
hParamIds
.
add
(
newHParam
.
parameter_index
);
hParamIdMap
.
set
(
jobInfo
.
i
d
,
hParamIds
);
hParamIdMap
.
set
(
jobInfo
.
trialJobI
d
,
hParamIds
);
}
}
}
else
{
}
else
{
assert
(
false
,
'
jobInfo.hyperParameters is undefined
'
);
assert
(
false
,
'
jobInfo.hyperParameters is undefined
'
);
...
...
ts/nni_manager/core/nniExperimentsManager.ts
0 → 100644
View file @
b40e3db7
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
isAlive
,
withLockSync
,
getExperimentsInfoPath
,
delay
}
from
'
../common/utils
'
;
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
/** Outcome of probing whether the process recorded for an experiment is alive. */
interface CrashedInfo {
    experimentId: string;
    isCrashed: boolean;
}

/** Raw content of the experiments-information file and the mtime seen when it was read. */
interface FileInfo {
    buffer: Buffer;
    mtime: number;
}

/**
 * Tell whether a caught value signals that the experiments file is currently
 * locked: either an error with code `EEXIST` or the
 * `'File has been locked.'` error message.
 */
function isLockContentionError(err: unknown): boolean {
    if (typeof err !== 'object' || err === null) {
        return false;
    }
    const candidate = err as { code?: unknown; message?: unknown };
    return candidate.code === 'EEXIST' || candidate.message === 'File has been locked.';
}

/**
 * File-backed implementation of ExperimentManager.
 *
 * All experiments are kept in one JSON file (an object keyed by experiment
 * id). Every read-modify-write goes through `withLockSync` so that several
 * processes can share the file.
 */
class NNIExperimentsManager implements ExperimentManager {
    private experimentsPath: string;
    private log: Logger;
    // Pending retry timers of setExperimentInfo, keyed by the field name being written.
    private profileUpdateTimer: {[key: string]: ReturnType<typeof setTimeout> | undefined};

    constructor() {
        this.experimentsPath = getExperimentsInfoPath();
        this.log = getLogger();
        this.profileUpdateTimer = {};
    }

    /**
     * Read all experiment records and return them as a list.
     *
     * Records whose status is not 'STOPPED' are probed with `isAlive` on their
     * recorded pid; those whose process is gone are rewritten as 'STOPPED'.
     * If the file changed between the read and the rewrite (mtime differs),
     * the whole operation is retried after 500ms.
     */
    public async getExperimentsInfo(): Promise<JSON> {
        const fileInfo: FileInfo = await this.withLockIterated(this.readExperimentsInfo, 100);
        const experimentsInformation = JSON.parse(fileInfo.buffer.toString());

        const expIdList: string[] = Object.keys(experimentsInformation)
            .filter((expId) => experimentsInformation[expId]['status'] !== 'STOPPED');
        const probes: CrashedInfo[] = await Promise.all(
            expIdList.map((expId) => this.checkCrashed(expId, experimentsInformation[expId]['pid']))
        );
        const crashedIds: string[] = probes
            .filter((probe) => probe.isCrashed)
            .map((probe) => probe.experimentId);

        if (crashedIds.length === 0) {
            return NNIExperimentsManager.toExperimentList(experimentsInformation);
        }

        const result = await this.withLockIterated(this.updateAllStatus, 100, crashedIds, fileInfo.mtime);
        if (result !== undefined) {
            return NNIExperimentsManager.toExperimentList(result);
        }
        // The file was modified by someone else in the meantime: start over.
        await delay(500);
        return await this.getExperimentsInfo();
    }

    /**
     * Point the manager at another experiments-information file.
     * A leading '~' is replaced by the home directory and relative paths are
     * resolved against the current working directory.
     */
    public setExperimentPath(newPath: string): void {
        if (newPath[0] === '~') {
            newPath = path.join(os.homedir(), newPath.slice(1));
        }
        if (!path.isAbsolute(newPath)) {
            newPath = path.resolve(newPath);
        }
        this.log.info(`Set new experiment information path: ${newPath}`);
        this.experimentsPath = newPath;
    }

    /**
     * Set `key` to `value` on the record of `experimentId` and write the file back.
     *
     * Errors are logged, never thrown. When the failure is lock contention the
     * update is re-attempted after 100ms; any other failure is not retried.
     */
    public setExperimentInfo(experimentId: string, key: string, value: any): void {
        try {
            if (this.profileUpdateTimer[key] !== undefined) {
                // A newer update for the same key supersedes a pending retry.
                clearTimeout(this.profileUpdateTimer[key]);
                this.profileUpdateTimer[key] = undefined;
            }
            this.withLockSync(() => {
                const experimentsInformation = JSON.parse(fs.readFileSync(this.experimentsPath).toString());
                assert(experimentId in experimentsInformation, `Experiment Manager: Experiment Id ${experimentId} not found, this should not happen`);
                experimentsInformation[experimentId][key] = value;
                fs.writeFileSync(this.experimentsPath, JSON.stringify(experimentsInformation, null, 4));
            });
        } catch (err) {
            this.log.error(err);
            if (isLockContentionError(err)) {
                // Only announce a retry when one is actually scheduled.
                this.log.debug(`Experiment Manager: Retry set key value: ${experimentId} {${key}: ${value}}`);
                this.profileUpdateTimer[key] = setTimeout(
                    () => this.setExperimentInfo(experimentId, key, value),
                    100
                );
            }
        }
    }

    /**
     * Run `func` under the file lock, retrying every 50ms while the lock is
     * contended.
     *
     * @param retry remaining retries; a negative value fails immediately
     * @throws Error('Lock file out of retries.') once retries are exhausted;
     *         any non-contention error raised by `func`
     */
    private async withLockIterated(func: Function, retry: number, ...args: any[]): Promise<any> {
        if (retry < 0) {
            throw new Error('Lock file out of retries.');
        }
        try {
            return this.withLockSync(func, ...args);
        } catch (err) {
            if (isLockContentionError(err)) {
                // retry wait is 50ms
                await delay(50);
                return await this.withLockIterated(func, retry - 1, ...args);
            }
            throw err;
        }
    }

    /** Run `func` (bound to this instance) under the lock of the experiments file, with a 2s stale threshold. */
    private withLockSync(func: Function, ...args: any[]): any {
        return withLockSync(func.bind(this), this.experimentsPath, {stale: 2 * 1000}, ...args);
    }

    /** Read the experiments file together with its current mtime. */
    private readExperimentsInfo(): FileInfo {
        const buffer: Buffer = fs.readFileSync(this.experimentsPath);
        const mtime: number = fs.statSync(this.experimentsPath).mtimeMs;
        return {buffer: buffer, mtime: mtime};
    }

    /** Report an experiment as crashed when `isAlive(pid)` resolves to false. */
    private async checkCrashed(expId: string, pid: number): Promise<CrashedInfo> {
        const alive: boolean = await isAlive(pid);
        return {experimentId: expId, isCrashed: !alive};
    }

    /**
     * Mark every experiment in `updateList` as 'STOPPED' and write the file back.
     *
     * @param timestamp mtime the caller observed when it read the file
     * @returns the updated records, or undefined (nothing written) when the
     *          file's mtime no longer equals `timestamp`
     */
    private updateAllStatus(updateList: Array<string>, timestamp: number): {[key: string]: any} | undefined {
        if (timestamp !== fs.statSync(this.experimentsPath).mtimeMs) {
            return undefined;
        }
        const experimentsInformation = JSON.parse(fs.readFileSync(this.experimentsPath).toString());
        for (const expId of updateList) {
            if (experimentsInformation[expId]) {
                experimentsInformation[expId]['status'] = 'STOPPED';
            } else {
                this.log.error(`Experiment Manager: Experiment Id ${expId} not found, this should not happen`);
            }
        }
        fs.writeFileSync(this.experimentsPath, JSON.stringify(experimentsInformation, null, 4));
        return experimentsInformation;
    }

    /** Stop the manager; a clean-up failure is logged instead of propagated. */
    public async stop(): Promise<void> {
        this.log.debug('Stopping experiment manager.');
        await this.cleanUp().catch(err => this.log.error(err.message));
        this.log.debug('Experiment manager stopped.');
    }

    /**
     * Resolve immediately when no retry is pending. Otherwise check again
     * after 5s and reject if a retry is still pending at that point.
     */
    private async cleanUp(): Promise<void> {
        const deferred = new Deferred<void>();
        if (this.isUndone()) {
            this.log.debug('Experiment manager: something undone');
            setTimeout((): void => {
                if (this.isUndone()) {
                    deferred.reject(new Error('Still has undone after 5s, forced stop.'));
                } else {
                    deferred.resolve();
                }
            }, 5 * 1000);
        } else {
            this.log.debug('Experiment manager: all clean up');
            deferred.resolve();
        }
        return deferred.promise;
    }

    /** True while at least one setExperimentInfo retry timer is registered. */
    private isUndone(): boolean {
        return Object.keys(this.profileUpdateTimer)
            .some((key: string) => this.profileUpdateTimer[key] !== undefined);
    }

    /** Turn the id-keyed record object into a detached list of its values. */
    private static toExperimentList(records: {[key: string]: any}): JSON {
        return JSON.parse(JSON.stringify(Object.keys(records).map(key => records[key])));
    }
}
export
{
NNIExperimentsManager
};
ts/nni_manager/core/nnimanager.ts
View file @
b40e3db7
...
@@ -15,6 +15,7 @@ import {
...
@@ -15,6 +15,7 @@ import {
ExperimentParams
,
ExperimentProfile
,
Manager
,
ExperimentStatus
,
ExperimentParams
,
ExperimentProfile
,
Manager
,
ExperimentStatus
,
NNIManagerStatus
,
ProfileUpdateType
,
TrialJobStatistics
NNIManagerStatus
,
ProfileUpdateType
,
TrialJobStatistics
}
from
'
../common/manager
'
;
}
from
'
../common/manager
'
;
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
LogType
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
LogType
}
from
'
../common/trainingService
'
;
}
from
'
../common/trainingService
'
;
...
@@ -31,6 +32,7 @@ import { createDispatcherInterface, IpcInterface } from './ipcInterface';
...
@@ -31,6 +32,7 @@ import { createDispatcherInterface, IpcInterface } from './ipcInterface';
class
NNIManager
implements
Manager
{
class
NNIManager
implements
Manager
{
private
trainingService
:
TrainingService
;
private
trainingService
:
TrainingService
;
private
dispatcher
:
IpcInterface
|
undefined
;
private
dispatcher
:
IpcInterface
|
undefined
;
private
experimentManager
:
ExperimentManager
;
private
currSubmittedTrialNum
:
number
;
// need to be recovered
private
currSubmittedTrialNum
:
number
;
// need to be recovered
private
trialConcurrencyChange
:
number
;
// >0: increase, <0: decrease
private
trialConcurrencyChange
:
number
;
// >0: increase, <0: decrease
private
log
:
Logger
;
private
log
:
Logger
;
...
@@ -49,6 +51,7 @@ class NNIManager implements Manager {
...
@@ -49,6 +51,7 @@ class NNIManager implements Manager {
this
.
currSubmittedTrialNum
=
0
;
this
.
currSubmittedTrialNum
=
0
;
this
.
trialConcurrencyChange
=
0
;
this
.
trialConcurrencyChange
=
0
;
this
.
trainingService
=
component
.
get
(
TrainingService
);
this
.
trainingService
=
component
.
get
(
TrainingService
);
this
.
experimentManager
=
component
.
get
(
ExperimentManager
);
assert
(
this
.
trainingService
);
assert
(
this
.
trainingService
);
this
.
dispatcherPid
=
0
;
this
.
dispatcherPid
=
0
;
this
.
waitingTrials
=
[];
this
.
waitingTrials
=
[];
...
@@ -231,7 +234,7 @@ class NNIManager implements Manager {
...
@@ -231,7 +234,7 @@ class NNIManager implements Manager {
// Check the final status for WAITING and RUNNING jobs
// Check the final status for WAITING and RUNNING jobs
await
Promise
.
all
(
allTrialJobs
await
Promise
.
all
(
allTrialJobs
.
filter
((
job
:
TrialJobInfo
)
=>
job
.
status
===
'
WAITING
'
||
job
.
status
===
'
RUNNING
'
)
.
filter
((
job
:
TrialJobInfo
)
=>
job
.
status
===
'
WAITING
'
||
job
.
status
===
'
RUNNING
'
)
.
map
((
job
:
TrialJobInfo
)
=>
this
.
dataStore
.
storeTrialJobEvent
(
'
FAILED
'
,
job
.
i
d
)));
.
map
((
job
:
TrialJobInfo
)
=>
this
.
dataStore
.
storeTrialJobEvent
(
'
FAILED
'
,
job
.
trialJobI
d
)));
// Collect generated trials and imported trials
// Collect generated trials and imported trials
const
finishedTrialData
:
string
=
await
this
.
exportData
();
const
finishedTrialData
:
string
=
await
this
.
exportData
();
...
@@ -304,7 +307,7 @@ class NNIManager implements Manager {
...
@@ -304,7 +307,7 @@ class NNIManager implements Manager {
// FIXME: can this be undefined?
// FIXME: can this be undefined?
trial
.
sequenceId
!==
undefined
&&
minSeqId
<=
trial
.
sequenceId
&&
trial
.
sequenceId
<=
maxSeqId
trial
.
sequenceId
!==
undefined
&&
minSeqId
<=
trial
.
sequenceId
&&
trial
.
sequenceId
<=
maxSeqId
));
));
const
targetTrialIds
=
new
Set
(
targetTrials
.
map
(
trial
=>
trial
.
i
d
));
const
targetTrialIds
=
new
Set
(
targetTrials
.
map
(
trial
=>
trial
.
trialJobI
d
));
const
allMetrics
=
await
this
.
dataStore
.
getMetricData
();
const
allMetrics
=
await
this
.
dataStore
.
getMetricData
();
return
allMetrics
.
filter
(
metric
=>
targetTrialIds
.
has
(
metric
.
trialJobId
));
return
allMetrics
.
filter
(
metric
=>
targetTrialIds
.
has
(
metric
.
trialJobId
));
...
@@ -345,6 +348,14 @@ class NNIManager implements Manager {
...
@@ -345,6 +348,14 @@ class NNIManager implements Manager {
return
this
.
status
;
return
this
.
status
;
}
}
/**
 * Look up the message attached to a trial job held in `this.trialJobs`.
 *
 * @param trialJobId id of the trial job
 * @returns the job's `message`, or undefined when the id is unknown
 */
public getTrialJobMessage(trialJobId: string): string | undefined {
    const detail = this.trialJobs.get(trialJobId);
    return detail === undefined ? undefined : detail.message;
}
public
async
listTrialJobs
(
status
?:
TrialJobStatus
):
Promise
<
TrialJobInfo
[]
>
{
public
async
listTrialJobs
(
status
?:
TrialJobStatus
):
Promise
<
TrialJobInfo
[]
>
{
return
this
.
dataStore
.
listTrialJobs
(
status
);
return
this
.
dataStore
.
listTrialJobs
(
status
);
}
}
...
@@ -459,7 +470,9 @@ class NNIManager implements Manager {
...
@@ -459,7 +470,9 @@ class NNIManager implements Manager {
}
}
}
}
await
this
.
trainingService
.
cleanUp
();
await
this
.
trainingService
.
cleanUp
();
this
.
experimentProfile
.
endTime
=
Date
.
now
();
if
(
this
.
experimentProfile
.
endTime
===
undefined
)
{
this
.
setEndtime
();
}
await
this
.
storeExperimentProfile
();
await
this
.
storeExperimentProfile
();
this
.
setStatus
(
'
STOPPED
'
);
this
.
setStatus
(
'
STOPPED
'
);
}
}
...
@@ -501,6 +514,10 @@ class NNIManager implements Manager {
...
@@ -501,6 +514,10 @@ class NNIManager implements Manager {
this
.
trialJobs
.
set
(
trialJobId
,
Object
.
assign
({},
trialJobDetail
));
this
.
trialJobs
.
set
(
trialJobId
,
Object
.
assign
({},
trialJobDetail
));
await
this
.
dataStore
.
storeTrialJobEvent
(
trialJobDetail
.
status
,
trialJobDetail
.
id
,
undefined
,
trialJobDetail
);
await
this
.
dataStore
.
storeTrialJobEvent
(
trialJobDetail
.
status
,
trialJobDetail
.
id
,
undefined
,
trialJobDetail
);
}
}
const
newTrialJobDetail
:
TrialJobDetail
|
undefined
=
this
.
trialJobs
.
get
(
trialJobId
);
if
(
newTrialJobDetail
!==
undefined
)
{
newTrialJobDetail
.
message
=
trialJobDetail
.
message
;
}
let
hyperParams
:
string
|
undefined
=
undefined
;
let
hyperParams
:
string
|
undefined
=
undefined
;
switch
(
trialJobDetail
.
status
)
{
switch
(
trialJobDetail
.
status
)
{
case
'
SUCCEEDED
'
:
case
'
SUCCEEDED
'
:
...
@@ -584,7 +601,7 @@ class NNIManager implements Manager {
...
@@ -584,7 +601,7 @@ class NNIManager implements Manager {
assert
(
allFinishedTrialJobNum
<=
waitSubmittedToFinish
);
assert
(
allFinishedTrialJobNum
<=
waitSubmittedToFinish
);
if
(
allFinishedTrialJobNum
>=
waitSubmittedToFinish
)
{
if
(
allFinishedTrialJobNum
>=
waitSubmittedToFinish
)
{
this
.
setStatus
(
'
DONE
'
);
this
.
setStatus
(
'
DONE
'
);
this
.
experimentProfile
.
e
nd
T
ime
=
Date
.
now
();
this
.
setE
nd
t
ime
();
await
this
.
storeExperimentProfile
();
await
this
.
storeExperimentProfile
();
// write this log for travis CI
// write this log for travis CI
this
.
log
.
info
(
'
Experiment done.
'
);
this
.
log
.
info
(
'
Experiment done.
'
);
...
@@ -678,11 +695,15 @@ class NNIManager implements Manager {
...
@@ -678,11 +695,15 @@ class NNIManager implements Manager {
private
async
onTrialJobMetrics
(
metric
:
TrialJobMetric
):
Promise
<
void
>
{
private
async
onTrialJobMetrics
(
metric
:
TrialJobMetric
):
Promise
<
void
>
{
this
.
log
.
debug
(
`NNIManager received trial job metrics:
${
metric
}
`
);
this
.
log
.
debug
(
`NNIManager received trial job metrics:
${
metric
}
`
);
await
this
.
dataStore
.
storeMetricData
(
metric
.
id
,
metric
.
data
);
if
(
this
.
trialJobs
.
has
(
metric
.
id
)){
if
(
this
.
dispatcher
===
undefined
)
{
await
this
.
dataStore
.
storeMetricData
(
metric
.
id
,
metric
.
data
);
throw
new
Error
(
'
Error: tuner has not been setup
'
);
if
(
this
.
dispatcher
===
undefined
)
{
throw
new
Error
(
'
Error: tuner has not been setup
'
);
}
this
.
dispatcher
.
sendCommand
(
REPORT_METRIC_DATA
,
metric
.
data
);
}
else
{
this
.
log
.
warning
(
`NNIManager received non-existent trial job metrics:
${
metric
}
`
);
}
}
this
.
dispatcher
.
sendCommand
(
REPORT_METRIC_DATA
,
metric
.
data
);
}
}
private
requestTrialJobs
(
jobNum
:
number
):
void
{
private
requestTrialJobs
(
jobNum
:
number
):
void
{
...
@@ -780,6 +801,7 @@ class NNIManager implements Manager {
...
@@ -780,6 +801,7 @@ class NNIManager implements Manager {
this
.
log
.
error
(
err
.
stack
);
this
.
log
.
error
(
err
.
stack
);
}
}
this
.
status
.
errors
.
push
(
err
.
message
);
this
.
status
.
errors
.
push
(
err
.
message
);
this
.
setEndtime
();
this
.
setStatus
(
'
ERROR
'
);
this
.
setStatus
(
'
ERROR
'
);
}
}
...
@@ -787,9 +809,15 @@ class NNIManager implements Manager {
...
@@ -787,9 +809,15 @@ class NNIManager implements Manager {
if
(
status
!==
this
.
status
.
status
)
{
if
(
status
!==
this
.
status
.
status
)
{
this
.
log
.
info
(
`Change NNIManager status from:
${
this
.
status
.
status
}
to:
${
status
}
`
);
this
.
log
.
info
(
`Change NNIManager status from:
${
this
.
status
.
status
}
to:
${
status
}
`
);
this
.
status
.
status
=
status
;
this
.
status
.
status
=
status
;
this
.
experimentManager
.
setExperimentInfo
(
this
.
experimentProfile
.
id
,
'
status
'
,
this
.
status
.
status
);
}
}
}
}
/**
 * Record "now" as the end time of the current experiment.
 *
 * Writes the current timestamp (`Date.now()`) into the experiment profile and
 * then forwards that same value to the experiment manager under the
 * 'endTime' key for this experiment's id.
 */
private setEndtime(): void {
    const profile = this.experimentProfile;
    profile.endTime = Date.now();
    this.experimentManager.setExperimentInfo(profile.id, 'endTime', profile.endTime);
}
private
createEmptyExperimentProfile
():
ExperimentProfile
{
private
createEmptyExperimentProfile
():
ExperimentProfile
{
return
{
return
{
id
:
getExperimentId
(),
id
:
getExperimentId
(),
...
...
ts/nni_manager/core/test/experimentManager.test.ts
0 → 100644
View file @
b40e3db7
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
*
as
fs
from
'
fs
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
ExperimentManager
}
from
'
../../common/experimentManager
'
;
import
{
NNIExperimentsManager
}
from
'
../nniExperimentsManager
'
;
/**
 * Mocha suite for NNIExperimentsManager.
 *
 * A one-entry experiments file is written to '.experiment.test' before the
 * suite runs and removed afterwards; the manager under test is pointed at
 * that file via setExperimentPath().
 */
describe('Unit test for experiment manager', function () {
    let experimentManager: NNIExperimentsManager;

    // Fixture: a single experiment keyed by 'test', recorded as INITIALIZED.
    const mockedInfo = {
        "test": {
            "port": 8080,
            "startTime": 1605246730756,
            "endTime": "N/A",
            "status": "INITIALIZED",
            "platform": "local",
            "experimentName": "testExp",
            "tag": [],
            "pid": 11111,
            "webuiUrl": [],
            "logDir": null
        }
    }

    before(() => {
        prepareUnitTest();
        fs.writeFileSync('.experiment.test', JSON.stringify(mockedInfo));
        Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);
        experimentManager = component.get(NNIExperimentsManager);
        experimentManager.setExperimentPath('.experiment.test');
    });

    after(() => {
        // Remove the fixture file only if it is still present.
        if (fs.existsSync('.experiment.test')) {
            fs.unlinkSync('.experiment.test');
        }
        cleanupUnitTest();
    });

    it('test getExperimentsInfo', () => {
        // The promise is returned so mocha waits for the assertions to run.
        return experimentManager.getExperimentsInfo().then(function (experimentsInfo: {[key: string]: any}) {
            // Only the entry whose 'id' is 'test' is checked; its status is expected
            // to have been moved to STOPPED or ERROR.
            // NOTE(review): presumably because pid 11111 is not a live experiment
            // process - confirm against NNIExperimentsManager.
            for (const idx in experimentsInfo) {
                if (experimentsInfo[idx]['id'] === 'test') {
                    expect(experimentsInfo[idx]['status']).to.be.oneOf(['STOPPED', 'ERROR']);
                    break;
                }
            }
        }).catch((error) => {
            assert.fail(error);
        })
    });
});
ts/nni_manager/core/test/mockedDatastore.ts
View file @
b40e3db7
...
@@ -161,7 +161,7 @@ class MockedDataStore implements DataStore {
...
@@ -161,7 +161,7 @@ class MockedDataStore implements DataStore {
}
}
if
(
!
(
status
&&
jobInfo
.
status
!==
status
))
{
if
(
!
(
status
&&
jobInfo
.
status
!==
status
))
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
jobInfo
.
finalMetricData
=
await
this
.
getFinalMetricData
(
jobInfo
.
i
d
);
jobInfo
.
finalMetricData
=
await
this
.
getFinalMetricData
(
jobInfo
.
trialJobI
d
);
}
}
result
.
push
(
jobInfo
);
result
.
push
(
jobInfo
);
}
}
...
@@ -206,7 +206,7 @@ class MockedDataStore implements DataStore {
...
@@ -206,7 +206,7 @@ class MockedDataStore implements DataStore {
public
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobInfo
>
{
public
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobInfo
>
{
return
Promise
.
resolve
({
return
Promise
.
resolve
({
i
d
:
'
1234
'
,
trialJobI
d
:
'
1234
'
,
status
:
'
SUCCEEDED
'
,
status
:
'
SUCCEEDED
'
,
startTime
:
Date
.
now
(),
startTime
:
Date
.
now
(),
endTime
:
Date
.
now
()
endTime
:
Date
.
now
()
...
@@ -242,7 +242,7 @@ class MockedDataStore implements DataStore {
...
@@ -242,7 +242,7 @@ class MockedDataStore implements DataStore {
jobInfo
=
map
.
get
(
record
.
trialJobId
);
jobInfo
=
map
.
get
(
record
.
trialJobId
);
}
else
{
}
else
{
jobInfo
=
{
jobInfo
=
{
i
d
:
record
.
trialJobId
,
trialJobI
d
:
record
.
trialJobId
,
status
:
this
.
getJobStatusByLatestEvent
(
record
.
event
),
status
:
this
.
getJobStatusByLatestEvent
(
record
.
event
),
};
};
}
}
...
...
ts/nni_manager/core/test/nnimanager.test.ts
View file @
b40e3db7
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
...
@@ -10,9 +11,10 @@ import { Container, Scope } from 'typescript-ioc';
...
@@ -10,9 +11,10 @@ import { Container, Scope } from 'typescript-ioc';
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Manager
,
ExperimentProfile
}
from
'
../../common/manager
'
;
import
{
Manager
,
ExperimentProfile
}
from
'
../../common/manager
'
;
import
{
ExperimentManager
}
from
'
../../common/experimentManager
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
NNI
DataStore
}
from
'
../nniDataStore
'
;
import
{
NNI
ExperimentsManager
}
from
'
../nniExperimentsManager
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
...
@@ -25,6 +27,7 @@ async function initContainer(): Promise<void> {
...
@@ -25,6 +27,7 @@ async function initContainer(): Promise<void> {
Container
.
bind
(
Manager
).
to
(
NNIManager
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Manager
).
to
(
NNIManager
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Database
).
to
(
SqlDB
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Database
).
to
(
SqlDB
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
DataStore
).
to
(
MockedDataStore
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
DataStore
).
to
(
MockedDataStore
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
ExperimentManager
).
to
(
NNIExperimentsManager
).
scope
(
Scope
.
Singleton
);
await
component
.
get
<
DataStore
>
(
DataStore
).
init
();
await
component
.
get
<
DataStore
>
(
DataStore
).
init
();
}
}
...
@@ -87,9 +90,26 @@ describe('Unit test for nnimanager', function () {
...
@@ -87,9 +90,26 @@ describe('Unit test for nnimanager', function () {
revision
:
0
revision
:
0
}
}
let
mockedInfo
=
{
"
unittest
"
:
{
"
port
"
:
8080
,
"
startTime
"
:
1605246730756
,
"
endTime
"
:
"
N/A
"
,
"
status
"
:
"
INITIALIZED
"
,
"
platform
"
:
"
local
"
,
"
experimentName
"
:
"
testExp
"
,
"
tag
"
:
[],
"
pid
"
:
11111
,
"
webuiUrl
"
:
[],
"
logDir
"
:
null
}
}
before
(
async
()
=>
{
before
(
async
()
=>
{
await
initContainer
();
await
initContainer
();
fs
.
writeFileSync
(
'
.experiment.test
'
,
JSON
.
stringify
(
mockedInfo
));
const
experimentsManager
:
ExperimentManager
=
component
.
get
(
ExperimentManager
);
experimentsManager
.
setExperimentPath
(
'
.experiment.test
'
);
nniManager
=
component
.
get
(
Manager
);
nniManager
=
component
.
get
(
Manager
);
const
expId
:
string
=
await
nniManager
.
startExperiment
(
experimentParams
);
const
expId
:
string
=
await
nniManager
.
startExperiment
(
experimentParams
);
assert
.
strictEqual
(
expId
,
'
unittest
'
);
assert
.
strictEqual
(
expId
,
'
unittest
'
);
...
@@ -122,7 +142,7 @@ describe('Unit test for nnimanager', function () {
...
@@ -122,7 +142,7 @@ describe('Unit test for nnimanager', function () {
it
(
'
test getTrialJob valid
'
,
()
=>
{
it
(
'
test getTrialJob valid
'
,
()
=>
{
//query a exist id
//query a exist id
return
nniManager
.
getTrialJob
(
'
1234
'
).
then
(
function
(
trialJobDetail
)
{
return
nniManager
.
getTrialJob
(
'
1234
'
).
then
(
function
(
trialJobDetail
)
{
expect
(
trialJobDetail
.
i
d
).
to
.
be
.
equal
(
'
1234
'
);
expect
(
trialJobDetail
.
trialJobI
d
).
to
.
be
.
equal
(
'
1234
'
);
}).
catch
((
error
)
=>
{
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
assert
.
fail
(
error
);
})
})
...
...
ts/nni_manager/main.ts
View file @
b40e3db7
...
@@ -12,13 +12,16 @@ import { Database, DataStore } from './common/datastore';
...
@@ -12,13 +12,16 @@ import { Database, DataStore } from './common/datastore';
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
Manager
,
ExperimentStartUpMode
}
from
'
./common/manager
'
;
import
{
Manager
,
ExperimentStartUpMode
}
from
'
./common/manager
'
;
import
{
ExperimentManager
}
from
'
./common/experimentManager
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
,
uniqueString
}
from
'
./common/utils
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
}
from
'
./common/utils
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
import
{
NNIManager
}
from
'
./core/nnimanager
'
;
import
{
NNIManager
}
from
'
./core/nnimanager
'
;
import
{
SqlDB
}
from
'
./core/sqlDatabase
'
;
import
{
SqlDB
}
from
'
./core/sqlDatabase
'
;
import
{
NNIExperimentsManager
}
from
'
./core/nniExperimentsManager
'
;
import
{
NNIRestServer
}
from
'
./rest_server/nniRestServer
'
;
import
{
NNIRestServer
}
from
'
./rest_server/nniRestServer
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService
'
;
import
{
AdlTrainingService
}
from
'
./training_service/kubernetes/adl/adlTrainingService
'
;
import
{
KubeflowTrainingService
}
from
'
./training_service/kubernetes/kubeflow/kubeflowTrainingService
'
;
import
{
KubeflowTrainingService
}
from
'
./training_service/kubernetes/kubeflow/kubeflowTrainingService
'
;
import
{
LocalTrainingService
}
from
'
./training_service/local/localTrainingService
'
;
import
{
LocalTrainingService
}
from
'
./training_service/local/localTrainingService
'
;
import
{
RouterTrainingService
}
from
'
./training_service/reusable/routerTrainingService
'
;
import
{
RouterTrainingService
}
from
'
./training_service/reusable/routerTrainingService
'
;
...
@@ -26,15 +29,18 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr
...
@@ -26,15 +29,18 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr
import
{
DLTSTrainingService
}
from
'
./training_service/dlts/dltsTrainingService
'
;
import
{
DLTSTrainingService
}
from
'
./training_service/dlts/dltsTrainingService
'
;
function
initStartupInfo
(
function
initStartupInfo
(
startExpMode
:
string
,
resumeE
xperimentId
:
string
,
basePort
:
number
,
platform
:
string
,
startExpMode
:
string
,
e
xperimentId
:
string
,
basePort
:
number
,
platform
:
string
,
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
):
void
{
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
):
void
{
const
createNew
:
boolean
=
(
startExpMode
===
ExperimentStartUpMode
.
NEW
);
const
createNew
:
boolean
=
(
startExpMode
===
ExperimentStartUpMode
.
NEW
);
const
expId
:
string
=
createNew
?
uniqueString
(
8
)
:
resumeExperimentId
;
setExperimentStartupInfo
(
createNew
,
experimentId
,
basePort
,
platform
,
logDirectory
,
experimentLogLevel
,
readonly
);
setExperimentStartupInfo
(
createNew
,
expId
,
basePort
,
platform
,
logDirectory
,
experimentLogLevel
,
readonly
);
}
}
async
function
initContainer
(
foreground
:
boolean
,
platformMode
:
string
,
logFileName
?:
string
):
Promise
<
void
>
{
async
function
initContainer
(
foreground
:
boolean
,
platformMode
:
string
,
logFileName
?:
string
):
Promise
<
void
>
{
if
(
platformMode
===
'
local
'
)
{
if
(
platformMode
===
'
adl
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
AdlTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
local
'
)
{
Container
.
bind
(
TrainingService
)
Container
.
bind
(
TrainingService
)
.
to
(
LocalTrainingService
)
.
to
(
LocalTrainingService
)
.
scope
(
Scope
.
Singleton
);
.
scope
(
Scope
.
Singleton
);
...
@@ -78,6 +84,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
...
@@ -78,6 +84,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container
.
bind
(
DataStore
)
Container
.
bind
(
DataStore
)
.
to
(
NNIDataStore
)
.
to
(
NNIDataStore
)
.
scope
(
Scope
.
Singleton
);
.
scope
(
Scope
.
Singleton
);
Container
.
bind
(
ExperimentManager
)
.
to
(
NNIExperimentsManager
)
.
scope
(
Scope
.
Singleton
);
const
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
const
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
if
(
foreground
)
{
if
(
foreground
)
{
logFileName
=
undefined
;
logFileName
=
undefined
;
...
@@ -94,7 +103,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
...
@@ -94,7 +103,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
function
usage
():
void
{
function
usage
():
void
{
console
.
info
(
'
usage: node main.js --port <port> --mode
\
console
.
info
(
'
usage: node main.js --port <port> --mode
\
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
<
adl/
local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
}
}
const
strPort
:
string
=
parseArg
([
'
--port
'
,
'
-p
'
]);
const
strPort
:
string
=
parseArg
([
'
--port
'
,
'
-p
'
]);
...
@@ -114,7 +123,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
...
@@ -114,7 +123,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const
port
:
number
=
parseInt
(
strPort
,
10
);
const
port
:
number
=
parseInt
(
strPort
,
10
);
const
mode
:
string
=
parseArg
([
'
--mode
'
,
'
-m
'
]);
const
mode
:
string
=
parseArg
([
'
--mode
'
,
'
-m
'
]);
if
(
!
[
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
].
includes
(
mode
))
{
if
(
!
[
'
adl
'
,
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
].
includes
(
mode
))
{
console
.
log
(
`FATAL: unknown mode:
${
mode
}
`
);
console
.
log
(
`FATAL: unknown mode:
${
mode
}
`
);
usage
();
usage
();
process
.
exit
(
1
);
process
.
exit
(
1
);
...
@@ -128,7 +137,7 @@ if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMod
...
@@ -128,7 +137,7 @@ if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMod
}
}
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
if
(
(
startMode
===
ExperimentStartUpMode
.
RESUME
)
&&
experimentId
.
trim
().
length
<
1
)
{
if
(
experimentId
.
trim
().
length
<
1
)
{
console
.
log
(
`FATAL: cannot resume the experiment, invalid experiment_id:
${
experimentId
}
`
);
console
.
log
(
`FATAL: cannot resume the experiment, invalid experiment_id:
${
experimentId
}
`
);
usage
();
usage
();
process
.
exit
(
1
);
process
.
exit
(
1
);
...
@@ -174,30 +183,14 @@ mkDirP(getLogDir())
...
@@ -174,30 +183,14 @@ mkDirP(getLogDir())
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
});
});
function
getStopSignal
():
any
{
async
function
cleanUp
():
Promise
<
void
>
{
if
(
process
.
platform
===
"
win32
"
)
{
return
'
SIGBREAK
'
;
}
else
{
return
'
SIGTERM
'
;
}
}
function
getCtrlCSignal
():
any
{
return
'
SIGINT
'
;
}
process
.
on
(
getCtrlCSignal
(),
async
()
=>
{
const
log
:
Logger
=
getLogger
();
log
.
info
(
`Get SIGINT signal!`
);
});
process
.
on
(
getStopSignal
(),
async
()
=>
{
const
log
:
Logger
=
getLogger
();
const
log
:
Logger
=
getLogger
();
let
hasError
:
boolean
=
false
;
let
hasError
:
boolean
=
false
;
try
{
try
{
const
nniManager
:
Manager
=
component
.
get
(
Manager
);
const
nniManager
:
Manager
=
component
.
get
(
Manager
);
await
nniManager
.
stopExperiment
();
await
nniManager
.
stopExperiment
();
const
experimentManager
:
ExperimentManager
=
component
.
get
(
ExperimentManager
);
await
experimentManager
.
stop
();
const
ds
:
DataStore
=
component
.
get
(
DataStore
);
const
ds
:
DataStore
=
component
.
get
(
DataStore
);
await
ds
.
close
();
await
ds
.
close
();
const
restServer
:
NNIRestServer
=
component
.
get
(
NNIRestServer
);
const
restServer
:
NNIRestServer
=
component
.
get
(
NNIRestServer
);
...
@@ -206,7 +199,11 @@ process.on(getStopSignal(), async () => {
...
@@ -206,7 +199,11 @@ process.on(getStopSignal(), async () => {
hasError
=
true
;
hasError
=
true
;
log
.
error
(
`
${
err
.
stack
}
`
);
log
.
error
(
`
${
err
.
stack
}
`
);
}
finally
{
}
finally
{
await
log
.
close
();
log
.
close
();
process
.
exit
(
hasError
?
1
:
0
);
process
.
exit
(
hasError
?
1
:
0
);
}
}
});
}
process
.
on
(
'
SIGTERM
'
,
cleanUp
);
process
.
on
(
'
SIGBREAK
'
,
cleanUp
);
process
.
on
(
'
SIGINT
'
,
cleanUp
);
ts/nni_manager/package.json
View file @
b40e3db7
...
@@ -18,6 +18,7 @@
...
@@ -18,6 +18,7 @@
"ignore"
:
"^5.1.4"
,
"ignore"
:
"^5.1.4"
,
"js-base64"
:
"^2.4.9"
,
"js-base64"
:
"^2.4.9"
,
"kubernetes-client"
:
"^6.5.0"
,
"kubernetes-client"
:
"^6.5.0"
,
"lockfile"
:
"^1.0.4"
,
"python-shell"
:
"^2.0.1"
,
"python-shell"
:
"^2.0.1"
,
"rx"
:
"^4.1.0"
,
"rx"
:
"^4.1.0"
,
"sqlite3"
:
"^5.0.0"
,
"sqlite3"
:
"^5.0.0"
,
...
@@ -39,6 +40,7 @@
...
@@ -39,6 +40,7 @@
"@types/glob"
:
"^7.1.1"
,
"@types/glob"
:
"^7.1.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-yaml"
:
"^3.12.5"
,
"@types/js-yaml"
:
"^3.12.5"
,
"@types/lockfile"
:
"^1.0.0"
,
"@types/mocha"
:
"^8.0.3"
,
"@types/mocha"
:
"^8.0.3"
,
"@types/node"
:
"10.12.18"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
"@types/request"
:
"^2.47.1"
,
...
...
ts/nni_manager/rest_server/restHandler.ts
View file @
b40e3db7
...
@@ -12,6 +12,7 @@ import { NNIError, NNIErrorNames } from '../common/errors';
...
@@ -12,6 +12,7 @@ import { NNIError, NNIErrorNames } from '../common/errors';
import
{
isNewExperiment
,
isReadonly
}
from
'
../common/experimentStartupInfo
'
;
import
{
isNewExperiment
,
isReadonly
}
from
'
../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
...
@@ -21,10 +22,12 @@ const expressJoi = require('express-joi-validator');
...
@@ -21,10 +22,12 @@ const expressJoi = require('express-joi-validator');
class
NNIRestHandler
{
class
NNIRestHandler
{
private
restServer
:
NNIRestServer
;
private
restServer
:
NNIRestServer
;
private
nniManager
:
Manager
;
private
nniManager
:
Manager
;
private
experimentsManager
:
ExperimentManager
;
private
log
:
Logger
;
private
log
:
Logger
;
constructor
(
rs
:
NNIRestServer
)
{
constructor
(
rs
:
NNIRestServer
)
{
this
.
nniManager
=
component
.
get
(
Manager
);
this
.
nniManager
=
component
.
get
(
Manager
);
this
.
experimentsManager
=
component
.
get
(
ExperimentManager
);
this
.
restServer
=
rs
;
this
.
restServer
=
rs
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
}
}
...
@@ -60,6 +63,7 @@ class NNIRestHandler {
...
@@ -60,6 +63,7 @@ class NNIRestHandler {
this
.
getLatestMetricData
(
router
);
this
.
getLatestMetricData
(
router
);
this
.
getTrialLog
(
router
);
this
.
getTrialLog
(
router
);
this
.
exportData
(
router
);
this
.
exportData
(
router
);
this
.
getExperimentsInfo
(
router
);
// Express-joi-validator configuration
// Express-joi-validator configuration
router
.
use
((
err
:
any
,
_req
:
Request
,
res
:
Response
,
_next
:
any
)
=>
{
router
.
use
((
err
:
any
,
_req
:
Request
,
res
:
Response
,
_next
:
any
)
=>
{
...
@@ -209,6 +213,7 @@ class NNIRestHandler {
...
@@ -209,6 +213,7 @@ class NNIRestHandler {
this
.
nniManager
.
listTrialJobs
(
req
.
query
.
status
).
then
((
jobInfos
:
TrialJobInfo
[])
=>
{
this
.
nniManager
.
listTrialJobs
(
req
.
query
.
status
).
then
((
jobInfos
:
TrialJobInfo
[])
=>
{
jobInfos
.
forEach
((
trialJob
:
TrialJobInfo
)
=>
{
jobInfos
.
forEach
((
trialJob
:
TrialJobInfo
)
=>
{
this
.
setErrorPathForFailedJob
(
trialJob
);
this
.
setErrorPathForFailedJob
(
trialJob
);
this
.
setMessageforJob
(
trialJob
);
});
});
res
.
send
(
jobInfos
);
res
.
send
(
jobInfos
);
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
...
@@ -221,6 +226,7 @@ class NNIRestHandler {
...
@@ -221,6 +226,7 @@ class NNIRestHandler {
router
.
get
(
'
/trial-jobs/:id
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
get
(
'
/trial-jobs/:id
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
getTrialJob
(
req
.
params
.
id
).
then
((
jobDetail
:
TrialJobInfo
)
=>
{
this
.
nniManager
.
getTrialJob
(
req
.
params
.
id
).
then
((
jobDetail
:
TrialJobInfo
)
=>
{
const
jobInfo
:
TrialJobInfo
=
this
.
setErrorPathForFailedJob
(
jobDetail
);
const
jobInfo
:
TrialJobInfo
=
this
.
setErrorPathForFailedJob
(
jobDetail
);
this
.
setMessageforJob
(
jobInfo
);
res
.
send
(
jobInfo
);
res
.
send
(
jobInfo
);
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
this
.
handleError
(
err
,
res
);
this
.
handleError
(
err
,
res
);
...
@@ -303,6 +309,16 @@ class NNIRestHandler {
...
@@ -303,6 +309,16 @@ class NNIRestHandler {
});
});
}
}
/**
 * Register the GET '/experiments-info' route on the given router.
 *
 * On success the value resolved by the experiments manager is serialized with
 * JSON.stringify and sent as the response; on failure the error is passed to
 * handleError together with the response object.
 */
private getExperimentsInfo(router: Router): void {
    router.get('/experiments-info', (_req: Request, res: Response) => {
        const sendInfo = (experimentInfo: JSON): void => {
            res.send(JSON.stringify(experimentInfo));
        };
        const reportFailure = (err: Error): void => {
            this.handleError(err, res);
        };
        this.experimentsManager.getExperimentsInfo().then(sendInfo).catch(reportFailure);
    });
}
private
setErrorPathForFailedJob
(
jobInfo
:
TrialJobInfo
):
TrialJobInfo
{
private
setErrorPathForFailedJob
(
jobInfo
:
TrialJobInfo
):
TrialJobInfo
{
if
(
jobInfo
===
undefined
||
jobInfo
.
status
!==
'
FAILED
'
||
jobInfo
.
logPath
===
undefined
)
{
if
(
jobInfo
===
undefined
||
jobInfo
.
status
!==
'
FAILED
'
||
jobInfo
.
logPath
===
undefined
)
{
return
jobInfo
;
return
jobInfo
;
...
@@ -311,6 +327,14 @@ class NNIRestHandler {
...
@@ -311,6 +327,14 @@ class NNIRestHandler {
return
jobInfo
;
return
jobInfo
;
}
}
/**
 * Attach the trial job message to a job info record.
 *
 * When jobInfo is defined, its `message` field is set to whatever
 * nniManager.getTrialJobMessage returns for jobInfo.trialJobId. The same
 * object (or undefined, when that is what was passed in) is returned.
 */
private setMessageforJob(jobInfo: TrialJobInfo): TrialJobInfo {
    if (jobInfo !== undefined) {
        jobInfo.message = this.nniManager.getTrialJobMessage(jobInfo.trialJobId);
    }
    return jobInfo;
}
}
}
export
function
createRestHandler
(
rs
:
NNIRestServer
):
Router
{
export
function
createRestHandler
(
rs
:
NNIRestServer
):
Router
{
...
...
ts/nni_manager/rest_server/restValidationSchemas.ts
View file @
b40e3db7
...
@@ -32,6 +32,9 @@ export namespace ValidationSchemas {
...
@@ -32,6 +32,9 @@ export namespace ValidationSchemas {
outputDir
:
joi
.
string
(),
outputDir
:
joi
.
string
(),
cpuNum
:
joi
.
number
().
min
(
1
),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
memoryMB
:
joi
.
number
().
min
(
100
),
// ############## adl cpu and memory config ###############
memorySize
:
joi
.
string
(),
// ########################################################
gpuNum
:
joi
.
number
().
min
(
0
),
gpuNum
:
joi
.
number
().
min
(
0
),
command
:
joi
.
string
().
min
(
1
),
command
:
joi
.
string
().
min
(
1
),
virtualCluster
:
joi
.
string
(),
virtualCluster
:
joi
.
string
(),
...
@@ -93,6 +96,20 @@ export namespace ValidationSchemas {
...
@@ -93,6 +96,20 @@ export namespace ValidationSchemas {
minFailedTaskCount
:
joi
.
number
(),
minFailedTaskCount
:
joi
.
number
(),
minSucceededTaskCount
:
joi
.
number
()
minSucceededTaskCount
:
joi
.
number
()
})
})
}),
imagePullSecrets
:
joi
.
array
({
name
:
joi
.
string
().
min
(
1
).
required
()
}),
// ############## adl ###############
adaptive
:
joi
.
boolean
(),
checkpoint
:
joi
.
object
({
storageClass
:
joi
.
string
().
min
(
1
).
required
(),
storageSize
:
joi
.
string
().
min
(
1
).
required
()
}),
nfs
:
joi
.
object
({
server
:
joi
.
string
().
min
(
1
).
required
(),
path
:
joi
.
string
().
min
(
1
).
required
(),
containerMountPath
:
joi
.
string
().
min
(
1
).
required
()
})
})
}),
}),
pai_yarn_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
pai_yarn_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
...
...
Prev
1
…
5
6
7
8
9
10
11
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment