Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
b40e3db7
Commit
b40e3db7
authored
Dec 01, 2020
by
quzha
Browse files
Merge branch 'master' of github.com:Microsoft/nni into dev-retiarii
parents
efa4e31c
95f731e4
Changes
226
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
668 additions
and
50 deletions
+668
-50
ts/nni_manager/common/experimentManager.ts
ts/nni_manager/common/experimentManager.ts
+13
-0
ts/nni_manager/common/manager.ts
ts/nni_manager/common/manager.ts
+1
-0
ts/nni_manager/common/trainingService.ts
ts/nni_manager/common/trainingService.ts
+1
-0
ts/nni_manager/common/utils.ts
ts/nni_manager/common/utils.ts
+32
-2
ts/nni_manager/config/adl/adaptdl-crd-v1.json
ts/nni_manager/config/adl/adaptdl-crd-v1.json
+17
-0
ts/nni_manager/config/adl/adaptdl-nni-configmap-template.json
...ni_manager/config/adl/adaptdl-nni-configmap-template.json
+19
-0
ts/nni_manager/config/adl/adaptdl-pvc-template.json
ts/nni_manager/config/adl/adaptdl-pvc-template.json
+27
-0
ts/nni_manager/config/adl/adaptdl-tensorboard-deployment-template.json
...r/config/adl/adaptdl-tensorboard-deployment-template.json
+55
-0
ts/nni_manager/config/adl/adaptdl-tensorboard-pvc-template.json
..._manager/config/adl/adaptdl-tensorboard-pvc-template.json
+27
-0
ts/nni_manager/config/adl/adaptdljob-template.json
ts/nni_manager/config/adl/adaptdljob-template.json
+109
-0
ts/nni_manager/core/nniDataStore.ts
ts/nni_manager/core/nniDataStore.ts
+6
-6
ts/nni_manager/core/nniExperimentsManager.ts
ts/nni_manager/core/nniExperimentsManager.ts
+171
-0
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+36
-8
ts/nni_manager/core/test/experimentManager.test.ts
ts/nni_manager/core/test/experimentManager.test.ts
+60
-0
ts/nni_manager/core/test/mockedDatastore.ts
ts/nni_manager/core/test/mockedDatastore.ts
+3
-3
ts/nni_manager/core/test/nnimanager.test.ts
ts/nni_manager/core/test/nnimanager.test.ts
+22
-2
ts/nni_manager/main.ts
ts/nni_manager/main.ts
+26
-29
ts/nni_manager/package.json
ts/nni_manager/package.json
+2
-0
ts/nni_manager/rest_server/restHandler.ts
ts/nni_manager/rest_server/restHandler.ts
+24
-0
ts/nni_manager/rest_server/restValidationSchemas.ts
ts/nni_manager/rest_server/restValidationSchemas.ts
+17
-0
No files found.
ts/nni_manager/common/experimentManager.ts
0 → 100644
View file @
b40e3db7
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
/**
 * Contract for the component that keeps the record of known experiments.
 *
 * Concrete implementations are looked up through the IoC container under this
 * abstract class, so it doubles as the injection token.
 */
abstract class ExperimentManager {
    /**
     * Resolves with the stored information about the known experiments.
     */
    public abstract getExperimentsInfo(): Promise<JSON>;

    /**
     * Points the manager at a different experiments-information location.
     *
     * @param newPath new location to use
     */
    public abstract setExperimentPath(newPath: string): void;

    /**
     * Stores `value` under `key` in the record of experiment `experimentId`.
     *
     * @param experimentId id of the experiment whose record is updated
     * @param key name of the field to set
     * @param value new value of the field
     */
    public abstract setExperimentInfo(experimentId: string, key: string, value: any): void;

    /**
     * Shuts the manager down; resolves once it has stopped.
     */
    public abstract stop(): Promise<void>;
}
export
{
ExperimentManager
};
ts/nni_manager/common/manager.ts
View file @
b40e3db7
...
@@ -105,6 +105,7 @@ abstract class Manager {
...
@@ -105,6 +105,7 @@ abstract class Manager {
public
abstract
getTrialLog
(
trialJobId
:
string
,
logType
:
LogType
):
Promise
<
string
>
;
public
abstract
getTrialLog
(
trialJobId
:
string
,
logType
:
LogType
):
Promise
<
string
>
;
public
abstract
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
;
public
abstract
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
;
public
abstract
getTrialJobMessage
(
trialJobId
:
string
):
string
|
undefined
;
public
abstract
getStatus
():
NNIManagerStatus
;
public
abstract
getStatus
():
NNIManagerStatus
;
}
}
...
...
ts/nni_manager/common/trainingService.ts
View file @
b40e3db7
...
@@ -42,6 +42,7 @@ interface TrialJobDetail {
...
@@ -42,6 +42,7 @@ interface TrialJobDetail {
readonly
workingDirectory
:
string
;
readonly
workingDirectory
:
string
;
readonly
form
:
TrialJobApplicationForm
;
readonly
form
:
TrialJobApplicationForm
;
isEarlyStopped
?:
boolean
;
isEarlyStopped
?:
boolean
;
message
?:
string
;
}
}
/**
/**
...
...
ts/nni_manager/common/utils.ts
View file @
b40e3db7
...
@@ -11,13 +11,16 @@ import { ChildProcess, spawn, StdioOptions } from 'child_process';
...
@@ -11,13 +11,16 @@ import { ChildProcess, spawn, StdioOptions } from 'child_process';
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
lockfile
from
'
lockfile
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Container
}
from
'
typescript-ioc
'
;
import
{
Container
}
from
'
typescript-ioc
'
;
import
*
as
util
from
'
util
'
;
import
*
as
util
from
'
util
'
;
import
*
as
glob
from
'
glob
'
;
import
{
Database
,
DataStore
}
from
'
./datastore
'
;
import
{
Database
,
DataStore
}
from
'
./datastore
'
;
import
{
ExperimentStartupInfo
,
getExperimentStartupInfo
,
setExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
ExperimentStartupInfo
,
getExperimentStartupInfo
,
setExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
ExperimentParams
,
Manager
}
from
'
./manager
'
;
import
{
ExperimentParams
,
Manager
}
from
'
./manager
'
;
import
{
ExperimentManager
}
from
'
./experimentManager
'
;
import
{
HyperParameters
,
TrainingService
,
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
HyperParameters
,
TrainingService
,
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
logLevelNameMap
}
from
'
./log
'
;
import
{
logLevelNameMap
}
from
'
./log
'
;
...
@@ -43,6 +46,10 @@ function getCheckpointDir(): string {
...
@@ -43,6 +46,10 @@ function getCheckpointDir(): string {
return
path
.
join
(
getExperimentRootDir
(),
'
checkpoint
'
);
return
path
.
join
(
getExperimentRootDir
(),
'
checkpoint
'
);
}
}
/**
 * Builds the path of the experiments-information file:
 * `<home directory>/nni-experiments/.experiment`.
 *
 * @returns the path, joined with the platform's separator
 */
function getExperimentsInfoPath(): string {
    const experimentsRoot: string = path.join(os.homedir(), 'nni-experiments');
    return path.join(experimentsRoot, '.experiment');
}
function
mkDirP
(
dirPath
:
string
):
Promise
<
void
>
{
function
mkDirP
(
dirPath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fs
.
exists
(
dirPath
,
(
exists
:
boolean
)
=>
{
fs
.
exists
(
dirPath
,
(
exists
:
boolean
)
=>
{
...
@@ -184,6 +191,7 @@ function prepareUnitTest(): void {
...
@@ -184,6 +191,7 @@ function prepareUnitTest(): void {
Container
.
snapshot
(
DataStore
);
Container
.
snapshot
(
DataStore
);
Container
.
snapshot
(
TrainingService
);
Container
.
snapshot
(
TrainingService
);
Container
.
snapshot
(
Manager
);
Container
.
snapshot
(
Manager
);
Container
.
snapshot
(
ExperimentManager
);
const
logLevel
:
string
=
parseArg
([
'
--log_level
'
,
'
-ll
'
]);
const
logLevel
:
string
=
parseArg
([
'
--log_level
'
,
'
-ll
'
]);
if
(
logLevel
.
length
>
0
&&
!
logLevelNameMap
.
has
(
logLevel
))
{
if
(
logLevel
.
length
>
0
&&
!
logLevelNameMap
.
has
(
logLevel
))
{
...
@@ -211,6 +219,7 @@ function cleanupUnitTest(): void {
...
@@ -211,6 +219,7 @@ function cleanupUnitTest(): void {
Container
.
restore
(
DataStore
);
Container
.
restore
(
DataStore
);
Container
.
restore
(
Database
);
Container
.
restore
(
Database
);
Container
.
restore
(
ExperimentStartupInfo
);
Container
.
restore
(
ExperimentStartupInfo
);
Container
.
restore
(
ExperimentManager
);
}
}
let
cachedipv4Address
:
string
=
''
;
let
cachedipv4Address
:
string
=
''
;
...
@@ -416,8 +425,29 @@ function unixPathJoin(...paths: any[]): string {
...
@@ -416,8 +425,29 @@ function unixPathJoin(...paths: any[]): string {
return
dir
;
return
dir
;
}
}
/**
 * Runs `func(...args)` synchronously while holding a lock file placed next to `filePath`.
 *
 * The lock file is named `<basename of filePath>.lock.<pid of this process>`.
 * When `lockOpts.stale` is a number, every existing `<basename>.lock.*` file in the
 * same directory is inspected first: unless each of them exists and has an mtime
 * older than `lockOpts.stale` milliseconds, the call fails without taking the lock.
 *
 * The lock is always released before this function returns or throws, including
 * when `func` itself throws.
 *
 * @param func callback executed while the lock is held
 * @param filePath file being protected; only its directory and base name are used to derive the lock name
 * @param lockOpts options forwarded unchanged to `lockfile.lockSync`
 * @param args arguments forwarded to `func`
 * @returns whatever `func` returns
 * @throws Error with message 'File has been locked.' when a non-expired lock file is found;
 *         otherwise any error raised by `lockfile.lockSync` or by `func`
 */
function withLockSync(func: Function, filePath: string, lockOpts: {[key: string]: any}, ...args: any): any {
    const lockDir: string = path.dirname(filePath);
    const lockBase: string = path.basename(filePath);
    const lockName = path.join(lockDir, lockBase + `.lock.${process.pid}`);

    if (typeof lockOpts.stale === 'number') {
        const staleMs: number = lockOpts.stale;
        const lockFileNames: string[] = glob.sync(path.join(lockDir, lockBase + '.lock.*'));
        // A lock file counts as expired only if it still exists and is older than staleMs.
        const hasLiveLock: boolean = lockFileNames.some((fileName) => {
            const isExpired: boolean = fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs > staleMs;
            return !isExpired;
        });
        if (hasLiveLock) {
            throw new Error('File has been locked.');
        }
    }

    lockfile.lockSync(lockName, lockOpts);
    try {
        return func(...args);
    } finally {
        // Release even when func throws, otherwise the lock lingers until it goes stale.
        lockfile.unlockSync(lockName);
    }
}
export
{
export
{
countFilesRecursively
,
validateFileNameRecursively
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
countFilesRecursively
,
validateFileNameRecursively
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
getExperimentsInfoPath
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
withLockSync
,
mkDirP
,
mkDirPSync
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomInt
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
mkDirP
,
mkDirPSync
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomInt
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
};
ts/nni_manager/config/adl/adaptdl-crd-v1.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"apiextensions.k8s.io/v1beta1"
,
"kind"
:
"CustomResourceDefinition"
,
"metadata"
:
{
"name"
:
"adaptdljobs.adaptdl.petuum.com"
},
"spec"
:
{
"group"
:
"adaptdl.petuum.com"
,
"version"
:
"v1"
,
"scope"
:
"Namespaced"
,
"names"
:
{
"plural"
:
"adaptdljobs"
,
"singular"
:
"adaptdljob"
,
"kind"
:
"AdaptDLJob"
}
}
}
ts/nni_manager/config/adl/adaptdl-nni-configmap-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"v1"
,
"kind"
:
"ConfigMap"
,
"metadata"
:
{
"name"
:
"<name>"
,
"ownerReferences"
:
[
{
"apiVersion"
:
"adaptdl.petuum.com/v1"
,
"kind"
:
"AdaptDLJob"
,
"name"
:
"<adaptdljob_name>"
,
"uid"
:
"<adaptdljob_uid>"
}
]
},
"data"
:
{
"run.sh"
:
"<run_script>"
,
"cleanup.sh"
:
"<clean_script>"
}
}
ts/nni_manager/config/adl/adaptdl-pvc-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"v1"
,
"kind"
:
"PersistentVolumeClaim"
,
"metadata"
:
{
"name"
:
"<name>"
,
"ownerReferences"
:
[
{
"apiVersion"
:
"adaptdl.petuum.com/v1"
,
"kind"
:
"AdaptDLJob"
,
"name"
:
"<adaptdljob_name>"
,
"uid"
:
"<adaptdljob_uid>"
}
]
},
"spec"
:
{
"accessModes"
:
[
"ReadWriteMany"
],
"resources"
:
{
"requests"
:
{
"storage"
:
"<storage_size>"
}
},
"storageClassName"
:
"<storage_class>"
,
"volumeMode"
:
"Filesystem"
}
}
ts/nni_manager/config/adl/adaptdl-tensorboard-deployment-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"apps/v1"
,
"kind"
:
"Deployment"
,
"metadata"
:
{
"name"
:
"<name>"
,
"labels"
:
{
"expId"
:
"<exp_id>"
}
},
"spec"
:
{
"selector"
:
{
"matchLabels"
:
{
"app"
:
"<name>"
}
},
"replicas"
:
1
,
"template"
:
{
"metadata"
:
{
"labels"
:
{
"app"
:
"<name>"
}
},
"spec"
:
{
"containers"
:
[
{
"command"
:
[
"tensorboard"
],
"args"
:
[
"--host=0.0.0.0"
,
"--logdir=/adaptdl/tensorboard"
,
"--port=6006"
],
"image"
:
"tensorflow/tensorflow"
,
"name"
:
"tensorboard"
,
"ports"
:
[
{
"containerPort"
:
6006
}
],
"volumeMounts"
:
[
{
"mountPath"
:
"/adaptdl/tensorboard"
,
"name"
:
"adaptdl-tensorboard-pvc"
,
"subPath"
:
"adaptdl/tensorboard"
}
]
}
],
"volumes"
:
[
{
"name"
:
"adaptdl-tensorboard-pvc"
,
"persistentVolumeClaim"
:
{
"claimName"
:
"<adaptdl_tensorflow_pvc_name>"
}
}
]
}
}
}
}
\ No newline at end of file
ts/nni_manager/config/adl/adaptdl-tensorboard-pvc-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"v1"
,
"kind"
:
"PersistentVolumeClaim"
,
"metadata"
:
{
"name"
:
"<name>"
,
"ownerReferences"
:
[
{
"apiVersion"
:
"apps/v1"
,
"kind"
:
"Deployment"
,
"name"
:
"<adaptdl_tensorboard_name>"
,
"uid"
:
"<adaptdl_tensorboard_uid>"
}
]
},
"spec"
:
{
"accessModes"
:
[
"ReadWriteMany"
],
"resources"
:
{
"requests"
:
{
"storage"
:
"<storage_size>"
}
},
"storageClassName"
:
"<storage_class>"
,
"volumeMode"
:
"Filesystem"
}
}
ts/nni_manager/config/adl/adaptdljob-template.json
0 → 100644
View file @
b40e3db7
{
"apiVersion"
:
"adaptdl.petuum.com/v1"
,
"kind"
:
"AdaptDLJob"
,
"metadata"
:
{
"name"
:
"<name>"
,
"labels"
:
{
"app"
:
"<app_name>"
,
"expId"
:
"<exp_id>"
,
"trialId"
:
"<trial_id>"
}
},
"spec"
:
{
"preemptible"
:
false
,
"template"
:
{
"spec"
:
{
"containers"
:
[
{
"lifecycle"
:
{
"preStop"
:
{
"exec"
:
{
"command"
:
[
"/cleanup.sh"
]
}
}
},
"command"
:
[
"/run.sh"
],
"env"
:
[
{
"name"
:
"ADAPTDL_CHECKPOINT_PATH"
,
"value"
:
"/adaptdl/checkpoint"
},
{
"name"
:
"ADAPTDL_TENSORBOARD_LOGDIR"
,
"value"
:
"/adaptdl/tensorboard"
},
{
"name"
:
"ADAPTDL_SHARE_PATH"
,
"value"
:
"/adaptdl/share"
}
],
"image"
:
"<image>"
,
"imagePullPolicy"
:
"Always"
,
"name"
:
"main"
,
"resources"
:
{
"requests"
:
{
"memory"
:
"<memorySize>"
,
"cpu"
:
"<cpuNum>"
},
"limits"
:
{
"nvidia.com/gpu"
:
1
}
},
"volumeMounts"
:
[
{
"mountPath"
:
"/adaptdl/checkpoint"
,
"name"
:
"adaptdl-pvc"
,
"subPath"
:
"adaptdl/checkpoint"
},
{
"mountPath"
:
"/adaptdl/share"
,
"name"
:
"adaptdl-pvc"
,
"subPath"
:
"adaptdl/share"
},
{
"mountPath"
:
"/adaptdl/tensorboard"
,
"name"
:
"adaptdl-tensorboard-pvc"
,
"subPath"
:
"adaptdl/tensorboard"
},
{
"mountPath"
:
"/cleanup.sh"
,
"name"
:
"adaptdl-nni-configmap"
,
"subPath"
:
"cleanup.sh"
},
{
"mountPath"
:
"/run.sh"
,
"name"
:
"adaptdl-nni-configmap"
,
"subPath"
:
"run.sh"
}
]
}
],
"imagePullSecrets"
:
[],
"volumes"
:
[
{
"name"
:
"adaptdl-pvc"
,
"persistentVolumeClaim"
:
{
"claimName"
:
"<adaptdl_pvc_name>"
}
},
{
"name"
:
"adaptdl-tensorboard-pvc"
,
"persistentVolumeClaim"
:
{
"claimName"
:
"<adaptdl_tensorflow_pvc_name>"
}
},
{
"name"
:
"adaptdl-nni-configmap"
,
"configMap"
:
{
"name"
:
"<adaptdl_nni_configmap_name>"
,
"defaultMode"
:
511
}
}
]
}
}
}
}
ts/nni_manager/core/nniDataStore.ts
View file @
b40e3db7
...
@@ -168,7 +168,7 @@ class NNIDataStore implements DataStore {
...
@@ -168,7 +168,7 @@ class NNIDataStore implements DataStore {
const
oneEntry
:
ExportedDataFormat
=
{
const
oneEntry
:
ExportedDataFormat
=
{
parameter
:
parameters
.
parameters
,
parameter
:
parameters
.
parameters
,
value
:
JSON
.
parse
(
job
.
finalMetricData
[
0
].
data
),
value
:
JSON
.
parse
(
job
.
finalMetricData
[
0
].
data
),
id
:
job
.
i
d
trialJobId
:
job
.
trialJobI
d
};
};
exportedData
.
push
(
oneEntry
);
exportedData
.
push
(
oneEntry
);
}
else
{
}
else
{
...
@@ -188,7 +188,7 @@ class NNIDataStore implements DataStore {
...
@@ -188,7 +188,7 @@ class NNIDataStore implements DataStore {
const
oneEntry
:
ExportedDataFormat
=
{
const
oneEntry
:
ExportedDataFormat
=
{
parameter
:
value
,
parameter
:
value
,
value
:
metricValue
,
value
:
metricValue
,
id
:
job
.
i
d
trialJobId
:
job
.
trialJobI
d
};
};
exportedData
.
push
(
oneEntry
);
exportedData
.
push
(
oneEntry
);
}
}
...
@@ -229,7 +229,7 @@ class NNIDataStore implements DataStore {
...
@@ -229,7 +229,7 @@ class NNIDataStore implements DataStore {
}
}
if
(
!
(
status
!==
undefined
&&
jobInfo
.
status
!==
status
))
{
if
(
!
(
status
!==
undefined
&&
jobInfo
.
status
!==
status
))
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
jobInfo
.
finalMetricData
=
finalMetricsMap
.
get
(
jobInfo
.
i
d
);
jobInfo
.
finalMetricData
=
finalMetricsMap
.
get
(
jobInfo
.
trialJobI
d
);
}
}
result
.
push
(
jobInfo
);
result
.
push
(
jobInfo
);
}
}
...
@@ -320,7 +320,7 @@ class NNIDataStore implements DataStore {
...
@@ -320,7 +320,7 @@ class NNIDataStore implements DataStore {
jobInfo
=
map
.
get
(
record
.
trialJobId
);
jobInfo
=
map
.
get
(
record
.
trialJobId
);
}
else
{
}
else
{
jobInfo
=
{
jobInfo
=
{
i
d
:
record
.
trialJobId
,
trialJobI
d
:
record
.
trialJobId
,
status
:
this
.
getJobStatusByLatestEvent
(
'
UNKNOWN
'
,
record
.
event
),
status
:
this
.
getJobStatusByLatestEvent
(
'
UNKNOWN
'
,
record
.
event
),
hyperParameters
:
[]
hyperParameters
:
[]
};
};
...
@@ -364,14 +364,14 @@ class NNIDataStore implements DataStore {
...
@@ -364,14 +364,14 @@ class NNIDataStore implements DataStore {
const
newHParam
:
any
=
this
.
parseHyperParameter
(
record
.
data
);
const
newHParam
:
any
=
this
.
parseHyperParameter
(
record
.
data
);
if
(
newHParam
!==
undefined
)
{
if
(
newHParam
!==
undefined
)
{
if
(
jobInfo
.
hyperParameters
!==
undefined
)
{
if
(
jobInfo
.
hyperParameters
!==
undefined
)
{
let
hParamIds
:
Set
<
number
>
|
undefined
=
hParamIdMap
.
get
(
jobInfo
.
i
d
);
let
hParamIds
:
Set
<
number
>
|
undefined
=
hParamIdMap
.
get
(
jobInfo
.
trialJobI
d
);
if
(
hParamIds
===
undefined
)
{
if
(
hParamIds
===
undefined
)
{
hParamIds
=
new
Set
();
hParamIds
=
new
Set
();
}
}
if
(
!
hParamIds
.
has
(
newHParam
.
parameter_index
))
{
if
(
!
hParamIds
.
has
(
newHParam
.
parameter_index
))
{
jobInfo
.
hyperParameters
.
push
(
JSON
.
stringify
(
newHParam
));
jobInfo
.
hyperParameters
.
push
(
JSON
.
stringify
(
newHParam
));
hParamIds
.
add
(
newHParam
.
parameter_index
);
hParamIds
.
add
(
newHParam
.
parameter_index
);
hParamIdMap
.
set
(
jobInfo
.
i
d
,
hParamIds
);
hParamIdMap
.
set
(
jobInfo
.
trialJobI
d
,
hParamIds
);
}
}
}
else
{
}
else
{
assert
(
false
,
'
jobInfo.hyperParameters is undefined
'
);
assert
(
false
,
'
jobInfo.hyperParameters is undefined
'
);
...
...
ts/nni_manager/core/nniExperimentsManager.ts
0 → 100644
View file @
b40e3db7
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
isAlive
,
withLockSync
,
getExperimentsInfoPath
,
delay
}
from
'
../common/utils
'
;
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
/**
 * Outcome of a liveness probe for one experiment, as produced by
 * `NNIExperimentsManager.checkCrashed`.
 */
interface CrashedInfo {
    /** Id of the experiment that was probed. */
    experimentId: string;
    /** True when the experiment's process was found not to be alive. */
    isCrashed: boolean;
}
/**
 * Snapshot of the experiments-information file, as produced by
 * `NNIExperimentsManager.readExperimentsInfo`.
 */
interface FileInfo {
    /** Raw file content. */
    buffer: Buffer;
    /** Modification time of the file (`fs.Stats.mtimeMs`) taken when it was read. */
    mtime: number;
}
/**
 * File-backed implementation of ExperimentManager.
 *
 * All experiment records live in one JSON file (`experimentsPath`), keyed by
 * experiment id. Every access to that file is wrapped in `withLockSync`, using a
 * stale time of 2 seconds for the lock.
 */
class NNIExperimentsManager implements ExperimentManager {
    private experimentsPath: string;
    private log: Logger;
    // Pending retry timers of setExperimentInfo, indexed by the record key being written.
    private profileUpdateTimer: {[key: string]: any};

    constructor() {
        this.experimentsPath = getExperimentsInfoPath();
        this.log = getLogger();
        this.profileUpdateTimer = {};
    }

    /**
     * Reads all experiment records and returns them as a list.
     *
     * Experiments whose status is not 'STOPPED' are probed with `isAlive(pid)`; those whose
     * process is gone are rewritten as 'STOPPED' in the file. If the file was modified
     * between the read and the rewrite, the whole operation is retried after 500 ms.
     */
    public async getExperimentsInfo(): Promise<JSON> {
        const fileInfo: FileInfo = await this.withLockIterated(this.readExperimentsInfo, 100);
        const experimentsInformation = JSON.parse(fileInfo.buffer.toString());

        const activeIds: string[] = Object.keys(experimentsInformation)
            .filter((expId) => experimentsInformation[expId]['status'] !== 'STOPPED');
        const probes: CrashedInfo[] = await Promise.all(
            activeIds.map((expId) => this.checkCrashed(expId, experimentsInformation[expId]['pid'])));
        const crashedIds: string[] = probes
            .filter((probe) => probe.isCrashed)
            .map((probe) => probe.experimentId);

        if (crashedIds.length === 0) {
            return this.toExperimentList(experimentsInformation);
        }

        const updated = await this.withLockIterated(this.updateAllStatus, 100, crashedIds, fileInfo.mtime);
        if (updated === undefined) {
            // The file changed under us; start over with a fresh read.
            await delay(500);
            return await this.getExperimentsInfo();
        }
        return this.toExperimentList(updated);
    }

    /**
     * Replaces the path of the experiments-information file.
     * A leading '~' is expanded to the home directory and a relative path is resolved
     * against the current working directory.
     */
    public setExperimentPath(newPath: string): void {
        let resolvedPath: string = newPath;
        if (resolvedPath.charAt(0) === '~') {
            resolvedPath = path.join(os.homedir(), resolvedPath.slice(1));
        }
        if (!path.isAbsolute(resolvedPath)) {
            resolvedPath = path.resolve(resolvedPath);
        }
        this.log.info(`Set new experiment information path: ${resolvedPath}`);
        this.experimentsPath = resolvedPath;
    }

    /**
     * Sets `key` to `value` in the record of `experimentId` and writes the file back.
     *
     * Errors are logged, never thrown. When the failure is lock contention, the same
     * update is rescheduled after 100 ms; a newer call for the same `key` cancels a
     * still-pending retry.
     */
    public setExperimentInfo(experimentId: string, key: string, value: any): void {
        try {
            if (this.profileUpdateTimer[key] !== undefined) {
                // A newer update for the same key supersedes the pending retry.
                clearTimeout(this.profileUpdateTimer[key]);
                this.profileUpdateTimer[key] = undefined;
            }
            this.withLockSync(() => {
                const experimentsInformation = JSON.parse(fs.readFileSync(this.experimentsPath).toString());
                assert(experimentId in experimentsInformation, `Experiment Manager: Experiment Id ${experimentId} not found, this should not happen`);
                experimentsInformation[experimentId][key] = value;
                fs.writeFileSync(this.experimentsPath, JSON.stringify(experimentsInformation, null, 4));
            });
        } catch (err) {
            this.log.error(err);
            if (this.isLockBusy(err)) {
                // Only announce a retry when one is actually scheduled.
                this.log.debug(`Experiment Manager: Retry set key value: ${experimentId} {${key}: ${value}}`);
                this.profileUpdateTimer[key] = setTimeout(
                    () => this.setExperimentInfo(experimentId, key, value), 100);
            }
        }
    }

    /**
     * Calls `func` under the file lock, retrying every 50 ms while the lock is busy.
     * Makes at most `retry + 1` attempts, then throws 'Lock file out of retries.'.
     * Any error other than lock contention is rethrown immediately.
     */
    private async withLockIterated(func: Function, retry: number, ...args: any): Promise<any> {
        for (let remaining = retry; remaining >= 0; remaining--) {
            try {
                return this.withLockSync(func, ...args);
            } catch (err) {
                if (!this.isLockBusy(err)) {
                    throw err;
                }
                // retry wait is 50ms
                await delay(50);
            }
        }
        throw new Error('Lock file out of retries.');
    }

    /** Runs `func` (bound to this instance) while holding the lock on the experiments file. */
    private withLockSync(func: Function, ...args: any): any {
        const boundFunc = func.bind(this);
        const lockOptions = {stale: 2 * 1000};
        return withLockSync(boundFunc, this.experimentsPath, lockOptions, ...args);
    }

    /** True for the two errors that mean "somebody else holds the lock". */
    private isLockBusy(err: any): boolean {
        return err.code === 'EEXIST' || err.message === 'File has been locked.';
    }

    /** Turns the id-keyed record object into a detached list of records. */
    private toExperimentList(experimentsInformation: {[key: string]: any}): any {
        const records = Object.keys(experimentsInformation).map((expId) => experimentsInformation[expId]);
        return JSON.parse(JSON.stringify(records));
    }

    /** Reads the experiments file together with its modification time. */
    private readExperimentsInfo(): FileInfo {
        const buffer: Buffer = fs.readFileSync(this.experimentsPath);
        const mtime: number = fs.statSync(this.experimentsPath).mtimeMs;
        return {buffer: buffer, mtime: mtime};
    }

    /** Reports an experiment as crashed when `isAlive(pid)` resolves to false. */
    private async checkCrashed(expId: string, pid: number): Promise<CrashedInfo> {
        const alive: boolean = await isAlive(pid);
        return {experimentId: expId, isCrashed: !alive};
    }

    /**
     * Marks every experiment in `updateList` as 'STOPPED' and writes the file back.
     *
     * @param updateList ids of the experiments to mark
     * @param timestamp mtime the file had when the caller read it
     * @returns the updated records, or undefined (nothing written) when the file's
     *          current mtime differs from `timestamp`
     */
    private updateAllStatus(updateList: Array<string>, timestamp: number): {[key: string]: any} | undefined {
        if (timestamp !== fs.statSync(this.experimentsPath).mtimeMs) {
            return undefined;
        }
        const experimentsInformation = JSON.parse(fs.readFileSync(this.experimentsPath).toString());
        for (const expId of updateList) {
            if (experimentsInformation[expId]) {
                experimentsInformation[expId]['status'] = 'STOPPED';
            } else {
                this.log.error(`Experiment Manager: Experiment Id ${expId} not found, this should not happen`);
            }
        }
        fs.writeFileSync(this.experimentsPath, JSON.stringify(experimentsInformation, null, 4));
        return experimentsInformation;
    }

    /** Stops the manager; a failed clean-up is logged instead of being propagated. */
    public async stop(): Promise<void> {
        this.log.debug('Stopping experiment manager.');
        try {
            await this.cleanUp();
        } catch (err) {
            this.log.error(err.message);
        }
        this.log.debug('Experiment manager stopped.');
    }

    /**
     * Resolves at once when no retry is pending. Otherwise waits 5 seconds and then
     * resolves if the retries have drained, or rejects if one is still pending.
     */
    private async cleanUp(): Promise<void> {
        const deferred = new Deferred<void>();
        if (!this.isUndone()) {
            this.log.debug('Experiment manager: all clean up');
            deferred.resolve();
            return deferred.promise;
        }
        this.log.debug('Experiment manager: something undone');
        setTimeout((): void => {
            if (this.isUndone()) {
                deferred.reject(new Error('Still has undone after 5s, forced stop.'));
            } else {
                deferred.resolve();
            }
        }, 5 * 1000);
        return deferred.promise;
    }

    /** True while at least one setExperimentInfo retry timer is recorded. */
    private isUndone(): boolean {
        return Object.keys(this.profileUpdateTimer)
            .some((key: string) => this.profileUpdateTimer[key] !== undefined);
    }
}
export
{
NNIExperimentsManager
};
ts/nni_manager/core/nnimanager.ts
View file @
b40e3db7
...
@@ -15,6 +15,7 @@ import {
...
@@ -15,6 +15,7 @@ import {
ExperimentParams
,
ExperimentProfile
,
Manager
,
ExperimentStatus
,
ExperimentParams
,
ExperimentProfile
,
Manager
,
ExperimentStatus
,
NNIManagerStatus
,
ProfileUpdateType
,
TrialJobStatistics
NNIManagerStatus
,
ProfileUpdateType
,
TrialJobStatistics
}
from
'
../common/manager
'
;
}
from
'
../common/manager
'
;
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
LogType
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
LogType
}
from
'
../common/trainingService
'
;
}
from
'
../common/trainingService
'
;
...
@@ -31,6 +32,7 @@ import { createDispatcherInterface, IpcInterface } from './ipcInterface';
...
@@ -31,6 +32,7 @@ import { createDispatcherInterface, IpcInterface } from './ipcInterface';
class
NNIManager
implements
Manager
{
class
NNIManager
implements
Manager
{
private
trainingService
:
TrainingService
;
private
trainingService
:
TrainingService
;
private
dispatcher
:
IpcInterface
|
undefined
;
private
dispatcher
:
IpcInterface
|
undefined
;
private
experimentManager
:
ExperimentManager
;
private
currSubmittedTrialNum
:
number
;
// need to be recovered
private
currSubmittedTrialNum
:
number
;
// need to be recovered
private
trialConcurrencyChange
:
number
;
// >0: increase, <0: decrease
private
trialConcurrencyChange
:
number
;
// >0: increase, <0: decrease
private
log
:
Logger
;
private
log
:
Logger
;
...
@@ -49,6 +51,7 @@ class NNIManager implements Manager {
...
@@ -49,6 +51,7 @@ class NNIManager implements Manager {
this
.
currSubmittedTrialNum
=
0
;
this
.
currSubmittedTrialNum
=
0
;
this
.
trialConcurrencyChange
=
0
;
this
.
trialConcurrencyChange
=
0
;
this
.
trainingService
=
component
.
get
(
TrainingService
);
this
.
trainingService
=
component
.
get
(
TrainingService
);
this
.
experimentManager
=
component
.
get
(
ExperimentManager
);
assert
(
this
.
trainingService
);
assert
(
this
.
trainingService
);
this
.
dispatcherPid
=
0
;
this
.
dispatcherPid
=
0
;
this
.
waitingTrials
=
[];
this
.
waitingTrials
=
[];
...
@@ -231,7 +234,7 @@ class NNIManager implements Manager {
...
@@ -231,7 +234,7 @@ class NNIManager implements Manager {
// Check the final status for WAITING and RUNNING jobs
// Check the final status for WAITING and RUNNING jobs
await
Promise
.
all
(
allTrialJobs
await
Promise
.
all
(
allTrialJobs
.
filter
((
job
:
TrialJobInfo
)
=>
job
.
status
===
'
WAITING
'
||
job
.
status
===
'
RUNNING
'
)
.
filter
((
job
:
TrialJobInfo
)
=>
job
.
status
===
'
WAITING
'
||
job
.
status
===
'
RUNNING
'
)
.
map
((
job
:
TrialJobInfo
)
=>
this
.
dataStore
.
storeTrialJobEvent
(
'
FAILED
'
,
job
.
i
d
)));
.
map
((
job
:
TrialJobInfo
)
=>
this
.
dataStore
.
storeTrialJobEvent
(
'
FAILED
'
,
job
.
trialJobI
d
)));
// Collect generated trials and imported trials
// Collect generated trials and imported trials
const
finishedTrialData
:
string
=
await
this
.
exportData
();
const
finishedTrialData
:
string
=
await
this
.
exportData
();
...
@@ -304,7 +307,7 @@ class NNIManager implements Manager {
...
@@ -304,7 +307,7 @@ class NNIManager implements Manager {
// FIXME: can this be undefined?
// FIXME: can this be undefined?
trial
.
sequenceId
!==
undefined
&&
minSeqId
<=
trial
.
sequenceId
&&
trial
.
sequenceId
<=
maxSeqId
trial
.
sequenceId
!==
undefined
&&
minSeqId
<=
trial
.
sequenceId
&&
trial
.
sequenceId
<=
maxSeqId
));
));
const
targetTrialIds
=
new
Set
(
targetTrials
.
map
(
trial
=>
trial
.
i
d
));
const
targetTrialIds
=
new
Set
(
targetTrials
.
map
(
trial
=>
trial
.
trialJobI
d
));
const
allMetrics
=
await
this
.
dataStore
.
getMetricData
();
const
allMetrics
=
await
this
.
dataStore
.
getMetricData
();
return
allMetrics
.
filter
(
metric
=>
targetTrialIds
.
has
(
metric
.
trialJobId
));
return
allMetrics
.
filter
(
metric
=>
targetTrialIds
.
has
(
metric
.
trialJobId
));
...
@@ -345,6 +348,14 @@ class NNIManager implements Manager {
...
@@ -345,6 +348,14 @@ class NNIManager implements Manager {
return
this
.
status
;
return
this
.
status
;
}
}
/**
 * Returns the `message` of the trial job stored in `this.trialJobs` under
 * `trialJobId`, or undefined when no such job is stored.
 */
public getTrialJobMessage(trialJobId: string): string | undefined {
    const trialJob = this.trialJobs.get(trialJobId);
    return trialJob === undefined ? undefined : trialJob.message;
}
public
async
listTrialJobs
(
status
?:
TrialJobStatus
):
Promise
<
TrialJobInfo
[]
>
{
public
async
listTrialJobs
(
status
?:
TrialJobStatus
):
Promise
<
TrialJobInfo
[]
>
{
return
this
.
dataStore
.
listTrialJobs
(
status
);
return
this
.
dataStore
.
listTrialJobs
(
status
);
}
}
...
@@ -459,7 +470,9 @@ class NNIManager implements Manager {
...
@@ -459,7 +470,9 @@ class NNIManager implements Manager {
}
}
}
}
await
this
.
trainingService
.
cleanUp
();
await
this
.
trainingService
.
cleanUp
();
this
.
experimentProfile
.
endTime
=
Date
.
now
();
if
(
this
.
experimentProfile
.
endTime
===
undefined
)
{
this
.
setEndtime
();
}
await
this
.
storeExperimentProfile
();
await
this
.
storeExperimentProfile
();
this
.
setStatus
(
'
STOPPED
'
);
this
.
setStatus
(
'
STOPPED
'
);
}
}
...
@@ -501,6 +514,10 @@ class NNIManager implements Manager {
...
@@ -501,6 +514,10 @@ class NNIManager implements Manager {
this
.
trialJobs
.
set
(
trialJobId
,
Object
.
assign
({},
trialJobDetail
));
this
.
trialJobs
.
set
(
trialJobId
,
Object
.
assign
({},
trialJobDetail
));
await
this
.
dataStore
.
storeTrialJobEvent
(
trialJobDetail
.
status
,
trialJobDetail
.
id
,
undefined
,
trialJobDetail
);
await
this
.
dataStore
.
storeTrialJobEvent
(
trialJobDetail
.
status
,
trialJobDetail
.
id
,
undefined
,
trialJobDetail
);
}
}
const
newTrialJobDetail
:
TrialJobDetail
|
undefined
=
this
.
trialJobs
.
get
(
trialJobId
);
if
(
newTrialJobDetail
!==
undefined
)
{
newTrialJobDetail
.
message
=
trialJobDetail
.
message
;
}
let
hyperParams
:
string
|
undefined
=
undefined
;
let
hyperParams
:
string
|
undefined
=
undefined
;
switch
(
trialJobDetail
.
status
)
{
switch
(
trialJobDetail
.
status
)
{
case
'
SUCCEEDED
'
:
case
'
SUCCEEDED
'
:
...
@@ -584,7 +601,7 @@ class NNIManager implements Manager {
...
@@ -584,7 +601,7 @@ class NNIManager implements Manager {
assert
(
allFinishedTrialJobNum
<=
waitSubmittedToFinish
);
assert
(
allFinishedTrialJobNum
<=
waitSubmittedToFinish
);
if
(
allFinishedTrialJobNum
>=
waitSubmittedToFinish
)
{
if
(
allFinishedTrialJobNum
>=
waitSubmittedToFinish
)
{
this
.
setStatus
(
'
DONE
'
);
this
.
setStatus
(
'
DONE
'
);
this
.
experimentProfile
.
e
nd
T
ime
=
Date
.
now
();
this
.
setE
nd
t
ime
();
await
this
.
storeExperimentProfile
();
await
this
.
storeExperimentProfile
();
// write this log for travis CI
// write this log for travis CI
this
.
log
.
info
(
'
Experiment done.
'
);
this
.
log
.
info
(
'
Experiment done.
'
);
...
@@ -678,11 +695,15 @@ class NNIManager implements Manager {
...
@@ -678,11 +695,15 @@ class NNIManager implements Manager {
private
async
onTrialJobMetrics
(
metric
:
TrialJobMetric
):
Promise
<
void
>
{
private
async
onTrialJobMetrics
(
metric
:
TrialJobMetric
):
Promise
<
void
>
{
this
.
log
.
debug
(
`NNIManager received trial job metrics:
${
metric
}
`
);
this
.
log
.
debug
(
`NNIManager received trial job metrics:
${
metric
}
`
);
await
this
.
dataStore
.
storeMetricData
(
metric
.
id
,
metric
.
data
);
if
(
this
.
trialJobs
.
has
(
metric
.
id
)){
if
(
this
.
dispatcher
===
undefined
)
{
await
this
.
dataStore
.
storeMetricData
(
metric
.
id
,
metric
.
data
);
throw
new
Error
(
'
Error: tuner has not been setup
'
);
if
(
this
.
dispatcher
===
undefined
)
{
throw
new
Error
(
'
Error: tuner has not been setup
'
);
}
this
.
dispatcher
.
sendCommand
(
REPORT_METRIC_DATA
,
metric
.
data
);
}
else
{
this
.
log
.
warning
(
`NNIManager received non-existent trial job metrics:
${
metric
}
`
);
}
}
this
.
dispatcher
.
sendCommand
(
REPORT_METRIC_DATA
,
metric
.
data
);
}
}
private
requestTrialJobs
(
jobNum
:
number
):
void
{
private
requestTrialJobs
(
jobNum
:
number
):
void
{
...
@@ -780,6 +801,7 @@ class NNIManager implements Manager {
...
@@ -780,6 +801,7 @@ class NNIManager implements Manager {
this
.
log
.
error
(
err
.
stack
);
this
.
log
.
error
(
err
.
stack
);
}
}
this
.
status
.
errors
.
push
(
err
.
message
);
this
.
status
.
errors
.
push
(
err
.
message
);
this
.
setEndtime
();
this
.
setStatus
(
'
ERROR
'
);
this
.
setStatus
(
'
ERROR
'
);
}
}
...
@@ -787,9 +809,15 @@ class NNIManager implements Manager {
...
@@ -787,9 +809,15 @@ class NNIManager implements Manager {
if
(
status
!==
this
.
status
.
status
)
{
if
(
status
!==
this
.
status
.
status
)
{
this
.
log
.
info
(
`Change NNIManager status from:
${
this
.
status
.
status
}
to:
${
status
}
`
);
this
.
log
.
info
(
`Change NNIManager status from:
${
this
.
status
.
status
}
to:
${
status
}
`
);
this
.
status
.
status
=
status
;
this
.
status
.
status
=
status
;
this
.
experimentManager
.
setExperimentInfo
(
this
.
experimentProfile
.
id
,
'
status
'
,
this
.
status
.
status
);
}
}
}
}
/**
 * Record "now" as the experiment's end time.
 *
 * The timestamp is written to the in-memory experiment profile and the
 * same value is then passed to the experiment manager under the
 * 'endTime' key for this experiment's id.
 */
private setEndtime(): void {
    const profile = this.experimentProfile;
    profile.endTime = Date.now();
    // Forward the value just stored so both records carry the same timestamp.
    this.experimentManager.setExperimentInfo(profile.id, 'endTime', profile.endTime);
}
private
createEmptyExperimentProfile
():
ExperimentProfile
{
private
createEmptyExperimentProfile
():
ExperimentProfile
{
return
{
return
{
id
:
getExperimentId
(),
id
:
getExperimentId
(),
...
...
ts/nni_manager/core/test/experimentManager.test.ts
0 → 100644
View file @
b40e3db7
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
*
as
fs
from
'
fs
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
ExperimentManager
}
from
'
../../common/experimentManager
'
;
import
{
NNIExperimentsManager
}
from
'
../nniExperimentsManager
'
;
/**
 * Unit tests for NNIExperimentsManager.
 *
 * A throw-away experiments file is written before the suite runs, the
 * manager is pointed at it through setExperimentPath(), and the file is
 * removed again once the suite finishes.
 */
describe('Unit test for experiment manager', function () {
    // Temporary experiments file used only by this suite.
    const experimentPath = '.experiment.test';

    let experimentManager: NNIExperimentsManager;

    // Single fake experiment entry, keyed by experiment id.
    const mockedInfo = {
        "test": {
            "port": 8080,
            "startTime": 1605246730756,
            "endTime": "N/A",
            "status": "INITIALIZED",
            "platform": "local",
            "experimentName": "testExp",
            "tag": [],
            "pid": 11111,
            "webuiUrl": [],
            "logDir": null
        }
    };

    before(() => {
        prepareUnitTest();
        fs.writeFileSync(experimentPath, JSON.stringify(mockedInfo));
        Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);
        experimentManager = component.get(NNIExperimentsManager);
        experimentManager.setExperimentPath(experimentPath);
    });

    after(() => {
        if (fs.existsSync(experimentPath)) {
            fs.unlinkSync(experimentPath);
        }
        cleanupUnitTest();
    });

    // The promise is handed straight to mocha (async test) instead of being
    // re-wrapped with `.catch(assert.fail)`, so a rejection or a failed
    // expectation is reported with its original message and stack.
    it('test getExperimentsInfo', async () => {
        const experimentsInfo: {[key: string]: any} = await experimentManager.getExperimentsInfo();
        // NOTE(review): mockedInfo carries no 'id' field; if getExperimentsInfo()
        // does not add one, the expectation below is never reached and the test
        // passes vacuously - confirm against the manager implementation.
        for (const idx in experimentsInfo) {
            if (experimentsInfo[idx]['id'] === 'test') {
                expect(experimentsInfo[idx]['status']).to.be.oneOf(['STOPPED', 'ERROR']);
                break;
            }
        }
    });
});
ts/nni_manager/core/test/mockedDatastore.ts
View file @
b40e3db7
...
@@ -161,7 +161,7 @@ class MockedDataStore implements DataStore {
...
@@ -161,7 +161,7 @@ class MockedDataStore implements DataStore {
}
}
if
(
!
(
status
&&
jobInfo
.
status
!==
status
))
{
if
(
!
(
status
&&
jobInfo
.
status
!==
status
))
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
if
(
jobInfo
.
status
===
'
SUCCEEDED
'
)
{
jobInfo
.
finalMetricData
=
await
this
.
getFinalMetricData
(
jobInfo
.
i
d
);
jobInfo
.
finalMetricData
=
await
this
.
getFinalMetricData
(
jobInfo
.
trialJobI
d
);
}
}
result
.
push
(
jobInfo
);
result
.
push
(
jobInfo
);
}
}
...
@@ -206,7 +206,7 @@ class MockedDataStore implements DataStore {
...
@@ -206,7 +206,7 @@ class MockedDataStore implements DataStore {
public
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobInfo
>
{
public
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobInfo
>
{
return
Promise
.
resolve
({
return
Promise
.
resolve
({
i
d
:
'
1234
'
,
trialJobI
d
:
'
1234
'
,
status
:
'
SUCCEEDED
'
,
status
:
'
SUCCEEDED
'
,
startTime
:
Date
.
now
(),
startTime
:
Date
.
now
(),
endTime
:
Date
.
now
()
endTime
:
Date
.
now
()
...
@@ -242,7 +242,7 @@ class MockedDataStore implements DataStore {
...
@@ -242,7 +242,7 @@ class MockedDataStore implements DataStore {
jobInfo
=
map
.
get
(
record
.
trialJobId
);
jobInfo
=
map
.
get
(
record
.
trialJobId
);
}
else
{
}
else
{
jobInfo
=
{
jobInfo
=
{
i
d
:
record
.
trialJobId
,
trialJobI
d
:
record
.
trialJobId
,
status
:
this
.
getJobStatusByLatestEvent
(
record
.
event
),
status
:
this
.
getJobStatusByLatestEvent
(
record
.
event
),
};
};
}
}
...
...
ts/nni_manager/core/test/nnimanager.test.ts
View file @
b40e3db7
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
...
@@ -10,9 +11,10 @@ import { Container, Scope } from 'typescript-ioc';
...
@@ -10,9 +11,10 @@ import { Container, Scope } from 'typescript-ioc';
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Manager
,
ExperimentProfile
}
from
'
../../common/manager
'
;
import
{
Manager
,
ExperimentProfile
}
from
'
../../common/manager
'
;
import
{
ExperimentManager
}
from
'
../../common/experimentManager
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
NNI
DataStore
}
from
'
../nniDataStore
'
;
import
{
NNI
ExperimentsManager
}
from
'
../nniExperimentsManager
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
...
@@ -25,6 +27,7 @@ async function initContainer(): Promise<void> {
...
@@ -25,6 +27,7 @@ async function initContainer(): Promise<void> {
Container
.
bind
(
Manager
).
to
(
NNIManager
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Manager
).
to
(
NNIManager
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Database
).
to
(
SqlDB
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Database
).
to
(
SqlDB
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
DataStore
).
to
(
MockedDataStore
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
DataStore
).
to
(
MockedDataStore
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
ExperimentManager
).
to
(
NNIExperimentsManager
).
scope
(
Scope
.
Singleton
);
await
component
.
get
<
DataStore
>
(
DataStore
).
init
();
await
component
.
get
<
DataStore
>
(
DataStore
).
init
();
}
}
...
@@ -87,9 +90,26 @@ describe('Unit test for nnimanager', function () {
...
@@ -87,9 +90,26 @@ describe('Unit test for nnimanager', function () {
revision
:
0
revision
:
0
}
}
let
mockedInfo
=
{
"
unittest
"
:
{
"
port
"
:
8080
,
"
startTime
"
:
1605246730756
,
"
endTime
"
:
"
N/A
"
,
"
status
"
:
"
INITIALIZED
"
,
"
platform
"
:
"
local
"
,
"
experimentName
"
:
"
testExp
"
,
"
tag
"
:
[],
"
pid
"
:
11111
,
"
webuiUrl
"
:
[],
"
logDir
"
:
null
}
}
before
(
async
()
=>
{
before
(
async
()
=>
{
await
initContainer
();
await
initContainer
();
fs
.
writeFileSync
(
'
.experiment.test
'
,
JSON
.
stringify
(
mockedInfo
));
const
experimentsManager
:
ExperimentManager
=
component
.
get
(
ExperimentManager
);
experimentsManager
.
setExperimentPath
(
'
.experiment.test
'
);
nniManager
=
component
.
get
(
Manager
);
nniManager
=
component
.
get
(
Manager
);
const
expId
:
string
=
await
nniManager
.
startExperiment
(
experimentParams
);
const
expId
:
string
=
await
nniManager
.
startExperiment
(
experimentParams
);
assert
.
strictEqual
(
expId
,
'
unittest
'
);
assert
.
strictEqual
(
expId
,
'
unittest
'
);
...
@@ -122,7 +142,7 @@ describe('Unit test for nnimanager', function () {
...
@@ -122,7 +142,7 @@ describe('Unit test for nnimanager', function () {
it
(
'
test getTrialJob valid
'
,
()
=>
{
it
(
'
test getTrialJob valid
'
,
()
=>
{
//query a exist id
//query a exist id
return
nniManager
.
getTrialJob
(
'
1234
'
).
then
(
function
(
trialJobDetail
)
{
return
nniManager
.
getTrialJob
(
'
1234
'
).
then
(
function
(
trialJobDetail
)
{
expect
(
trialJobDetail
.
i
d
).
to
.
be
.
equal
(
'
1234
'
);
expect
(
trialJobDetail
.
trialJobI
d
).
to
.
be
.
equal
(
'
1234
'
);
}).
catch
((
error
)
=>
{
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
assert
.
fail
(
error
);
})
})
...
...
ts/nni_manager/main.ts
View file @
b40e3db7
...
@@ -12,13 +12,16 @@ import { Database, DataStore } from './common/datastore';
...
@@ -12,13 +12,16 @@ import { Database, DataStore } from './common/datastore';
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
Manager
,
ExperimentStartUpMode
}
from
'
./common/manager
'
;
import
{
Manager
,
ExperimentStartUpMode
}
from
'
./common/manager
'
;
import
{
ExperimentManager
}
from
'
./common/experimentManager
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
,
uniqueString
}
from
'
./common/utils
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
}
from
'
./common/utils
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
import
{
NNIManager
}
from
'
./core/nnimanager
'
;
import
{
NNIManager
}
from
'
./core/nnimanager
'
;
import
{
SqlDB
}
from
'
./core/sqlDatabase
'
;
import
{
SqlDB
}
from
'
./core/sqlDatabase
'
;
import
{
NNIExperimentsManager
}
from
'
./core/nniExperimentsManager
'
;
import
{
NNIRestServer
}
from
'
./rest_server/nniRestServer
'
;
import
{
NNIRestServer
}
from
'
./rest_server/nniRestServer
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService
'
;
import
{
AdlTrainingService
}
from
'
./training_service/kubernetes/adl/adlTrainingService
'
;
import
{
KubeflowTrainingService
}
from
'
./training_service/kubernetes/kubeflow/kubeflowTrainingService
'
;
import
{
KubeflowTrainingService
}
from
'
./training_service/kubernetes/kubeflow/kubeflowTrainingService
'
;
import
{
LocalTrainingService
}
from
'
./training_service/local/localTrainingService
'
;
import
{
LocalTrainingService
}
from
'
./training_service/local/localTrainingService
'
;
import
{
RouterTrainingService
}
from
'
./training_service/reusable/routerTrainingService
'
;
import
{
RouterTrainingService
}
from
'
./training_service/reusable/routerTrainingService
'
;
...
@@ -26,15 +29,18 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr
...
@@ -26,15 +29,18 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr
import
{
DLTSTrainingService
}
from
'
./training_service/dlts/dltsTrainingService
'
;
import
{
DLTSTrainingService
}
from
'
./training_service/dlts/dltsTrainingService
'
;
function
initStartupInfo
(
function
initStartupInfo
(
startExpMode
:
string
,
resumeE
xperimentId
:
string
,
basePort
:
number
,
platform
:
string
,
startExpMode
:
string
,
e
xperimentId
:
string
,
basePort
:
number
,
platform
:
string
,
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
):
void
{
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
):
void
{
const
createNew
:
boolean
=
(
startExpMode
===
ExperimentStartUpMode
.
NEW
);
const
createNew
:
boolean
=
(
startExpMode
===
ExperimentStartUpMode
.
NEW
);
const
expId
:
string
=
createNew
?
uniqueString
(
8
)
:
resumeExperimentId
;
setExperimentStartupInfo
(
createNew
,
experimentId
,
basePort
,
platform
,
logDirectory
,
experimentLogLevel
,
readonly
);
setExperimentStartupInfo
(
createNew
,
expId
,
basePort
,
platform
,
logDirectory
,
experimentLogLevel
,
readonly
);
}
}
async
function
initContainer
(
foreground
:
boolean
,
platformMode
:
string
,
logFileName
?:
string
):
Promise
<
void
>
{
async
function
initContainer
(
foreground
:
boolean
,
platformMode
:
string
,
logFileName
?:
string
):
Promise
<
void
>
{
if
(
platformMode
===
'
local
'
)
{
if
(
platformMode
===
'
adl
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
AdlTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
local
'
)
{
Container
.
bind
(
TrainingService
)
Container
.
bind
(
TrainingService
)
.
to
(
LocalTrainingService
)
.
to
(
LocalTrainingService
)
.
scope
(
Scope
.
Singleton
);
.
scope
(
Scope
.
Singleton
);
...
@@ -78,6 +84,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
...
@@ -78,6 +84,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container
.
bind
(
DataStore
)
Container
.
bind
(
DataStore
)
.
to
(
NNIDataStore
)
.
to
(
NNIDataStore
)
.
scope
(
Scope
.
Singleton
);
.
scope
(
Scope
.
Singleton
);
Container
.
bind
(
ExperimentManager
)
.
to
(
NNIExperimentsManager
)
.
scope
(
Scope
.
Singleton
);
const
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
const
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
if
(
foreground
)
{
if
(
foreground
)
{
logFileName
=
undefined
;
logFileName
=
undefined
;
...
@@ -94,7 +103,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
...
@@ -94,7 +103,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
function
usage
():
void
{
function
usage
():
void
{
console
.
info
(
'
usage: node main.js --port <port> --mode
\
console
.
info
(
'
usage: node main.js --port <port> --mode
\
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
<
adl/
local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
}
}
const
strPort
:
string
=
parseArg
([
'
--port
'
,
'
-p
'
]);
const
strPort
:
string
=
parseArg
([
'
--port
'
,
'
-p
'
]);
...
@@ -114,7 +123,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
...
@@ -114,7 +123,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const
port
:
number
=
parseInt
(
strPort
,
10
);
const
port
:
number
=
parseInt
(
strPort
,
10
);
const
mode
:
string
=
parseArg
([
'
--mode
'
,
'
-m
'
]);
const
mode
:
string
=
parseArg
([
'
--mode
'
,
'
-m
'
]);
if
(
!
[
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
].
includes
(
mode
))
{
if
(
!
[
'
adl
'
,
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
].
includes
(
mode
))
{
console
.
log
(
`FATAL: unknown mode:
${
mode
}
`
);
console
.
log
(
`FATAL: unknown mode:
${
mode
}
`
);
usage
();
usage
();
process
.
exit
(
1
);
process
.
exit
(
1
);
...
@@ -128,7 +137,7 @@ if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMod
...
@@ -128,7 +137,7 @@ if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMod
}
}
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
if
(
(
startMode
===
ExperimentStartUpMode
.
RESUME
)
&&
experimentId
.
trim
().
length
<
1
)
{
if
(
experimentId
.
trim
().
length
<
1
)
{
console
.
log
(
`FATAL: cannot resume the experiment, invalid experiment_id:
${
experimentId
}
`
);
console
.
log
(
`FATAL: cannot resume the experiment, invalid experiment_id:
${
experimentId
}
`
);
usage
();
usage
();
process
.
exit
(
1
);
process
.
exit
(
1
);
...
@@ -174,30 +183,14 @@ mkDirP(getLogDir())
...
@@ -174,30 +183,14 @@ mkDirP(getLogDir())
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
});
});
function
getStopSignal
():
any
{
async
function
cleanUp
():
Promise
<
void
>
{
if
(
process
.
platform
===
"
win32
"
)
{
return
'
SIGBREAK
'
;
}
else
{
return
'
SIGTERM
'
;
}
}
function
getCtrlCSignal
():
any
{
return
'
SIGINT
'
;
}
process
.
on
(
getCtrlCSignal
(),
async
()
=>
{
const
log
:
Logger
=
getLogger
();
log
.
info
(
`Get SIGINT signal!`
);
});
process
.
on
(
getStopSignal
(),
async
()
=>
{
const
log
:
Logger
=
getLogger
();
const
log
:
Logger
=
getLogger
();
let
hasError
:
boolean
=
false
;
let
hasError
:
boolean
=
false
;
try
{
try
{
const
nniManager
:
Manager
=
component
.
get
(
Manager
);
const
nniManager
:
Manager
=
component
.
get
(
Manager
);
await
nniManager
.
stopExperiment
();
await
nniManager
.
stopExperiment
();
const
experimentManager
:
ExperimentManager
=
component
.
get
(
ExperimentManager
);
await
experimentManager
.
stop
();
const
ds
:
DataStore
=
component
.
get
(
DataStore
);
const
ds
:
DataStore
=
component
.
get
(
DataStore
);
await
ds
.
close
();
await
ds
.
close
();
const
restServer
:
NNIRestServer
=
component
.
get
(
NNIRestServer
);
const
restServer
:
NNIRestServer
=
component
.
get
(
NNIRestServer
);
...
@@ -206,7 +199,11 @@ process.on(getStopSignal(), async () => {
...
@@ -206,7 +199,11 @@ process.on(getStopSignal(), async () => {
hasError
=
true
;
hasError
=
true
;
log
.
error
(
`
${
err
.
stack
}
`
);
log
.
error
(
`
${
err
.
stack
}
`
);
}
finally
{
}
finally
{
await
log
.
close
();
log
.
close
();
process
.
exit
(
hasError
?
1
:
0
);
process
.
exit
(
hasError
?
1
:
0
);
}
}
});
}
process
.
on
(
'
SIGTERM
'
,
cleanUp
);
process
.
on
(
'
SIGBREAK
'
,
cleanUp
);
process
.
on
(
'
SIGINT
'
,
cleanUp
);
ts/nni_manager/package.json
View file @
b40e3db7
...
@@ -18,6 +18,7 @@
...
@@ -18,6 +18,7 @@
"ignore"
:
"^5.1.4"
,
"ignore"
:
"^5.1.4"
,
"js-base64"
:
"^2.4.9"
,
"js-base64"
:
"^2.4.9"
,
"kubernetes-client"
:
"^6.5.0"
,
"kubernetes-client"
:
"^6.5.0"
,
"lockfile"
:
"^1.0.4"
,
"python-shell"
:
"^2.0.1"
,
"python-shell"
:
"^2.0.1"
,
"rx"
:
"^4.1.0"
,
"rx"
:
"^4.1.0"
,
"sqlite3"
:
"^5.0.0"
,
"sqlite3"
:
"^5.0.0"
,
...
@@ -39,6 +40,7 @@
...
@@ -39,6 +40,7 @@
"@types/glob"
:
"^7.1.1"
,
"@types/glob"
:
"^7.1.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-yaml"
:
"^3.12.5"
,
"@types/js-yaml"
:
"^3.12.5"
,
"@types/lockfile"
:
"^1.0.0"
,
"@types/mocha"
:
"^8.0.3"
,
"@types/mocha"
:
"^8.0.3"
,
"@types/node"
:
"10.12.18"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
"@types/request"
:
"^2.47.1"
,
...
...
ts/nni_manager/rest_server/restHandler.ts
View file @
b40e3db7
...
@@ -12,6 +12,7 @@ import { NNIError, NNIErrorNames } from '../common/errors';
...
@@ -12,6 +12,7 @@ import { NNIError, NNIErrorNames } from '../common/errors';
import
{
isNewExperiment
,
isReadonly
}
from
'
../common/experimentStartupInfo
'
;
import
{
isNewExperiment
,
isReadonly
}
from
'
../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
...
@@ -21,10 +22,12 @@ const expressJoi = require('express-joi-validator');
...
@@ -21,10 +22,12 @@ const expressJoi = require('express-joi-validator');
class
NNIRestHandler
{
class
NNIRestHandler
{
private
restServer
:
NNIRestServer
;
private
restServer
:
NNIRestServer
;
private
nniManager
:
Manager
;
private
nniManager
:
Manager
;
private
experimentsManager
:
ExperimentManager
;
private
log
:
Logger
;
private
log
:
Logger
;
constructor
(
rs
:
NNIRestServer
)
{
constructor
(
rs
:
NNIRestServer
)
{
this
.
nniManager
=
component
.
get
(
Manager
);
this
.
nniManager
=
component
.
get
(
Manager
);
this
.
experimentsManager
=
component
.
get
(
ExperimentManager
);
this
.
restServer
=
rs
;
this
.
restServer
=
rs
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
}
}
...
@@ -60,6 +63,7 @@ class NNIRestHandler {
...
@@ -60,6 +63,7 @@ class NNIRestHandler {
this
.
getLatestMetricData
(
router
);
this
.
getLatestMetricData
(
router
);
this
.
getTrialLog
(
router
);
this
.
getTrialLog
(
router
);
this
.
exportData
(
router
);
this
.
exportData
(
router
);
this
.
getExperimentsInfo
(
router
);
// Express-joi-validator configuration
// Express-joi-validator configuration
router
.
use
((
err
:
any
,
_req
:
Request
,
res
:
Response
,
_next
:
any
)
=>
{
router
.
use
((
err
:
any
,
_req
:
Request
,
res
:
Response
,
_next
:
any
)
=>
{
...
@@ -209,6 +213,7 @@ class NNIRestHandler {
...
@@ -209,6 +213,7 @@ class NNIRestHandler {
this
.
nniManager
.
listTrialJobs
(
req
.
query
.
status
).
then
((
jobInfos
:
TrialJobInfo
[])
=>
{
this
.
nniManager
.
listTrialJobs
(
req
.
query
.
status
).
then
((
jobInfos
:
TrialJobInfo
[])
=>
{
jobInfos
.
forEach
((
trialJob
:
TrialJobInfo
)
=>
{
jobInfos
.
forEach
((
trialJob
:
TrialJobInfo
)
=>
{
this
.
setErrorPathForFailedJob
(
trialJob
);
this
.
setErrorPathForFailedJob
(
trialJob
);
this
.
setMessageforJob
(
trialJob
);
});
});
res
.
send
(
jobInfos
);
res
.
send
(
jobInfos
);
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
...
@@ -221,6 +226,7 @@ class NNIRestHandler {
...
@@ -221,6 +226,7 @@ class NNIRestHandler {
router
.
get
(
'
/trial-jobs/:id
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
get
(
'
/trial-jobs/:id
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
getTrialJob
(
req
.
params
.
id
).
then
((
jobDetail
:
TrialJobInfo
)
=>
{
this
.
nniManager
.
getTrialJob
(
req
.
params
.
id
).
then
((
jobDetail
:
TrialJobInfo
)
=>
{
const
jobInfo
:
TrialJobInfo
=
this
.
setErrorPathForFailedJob
(
jobDetail
);
const
jobInfo
:
TrialJobInfo
=
this
.
setErrorPathForFailedJob
(
jobDetail
);
this
.
setMessageforJob
(
jobInfo
);
res
.
send
(
jobInfo
);
res
.
send
(
jobInfo
);
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
this
.
handleError
(
err
,
res
);
this
.
handleError
(
err
,
res
);
...
@@ -303,6 +309,16 @@ class NNIRestHandler {
...
@@ -303,6 +309,16 @@ class NNIRestHandler {
});
});
}
}
/**
 * Register the GET '/experiments-info' route on the given router.
 *
 * The handler asks the experiments manager for its experiments info and
 * sends the result back JSON-stringified; any failure is passed to
 * handleError together with the response object.
 */
private getExperimentsInfo(router: Router): void {
    router.get('/experiments-info', (req: Request, res: Response) => {
        // Success path: serialize whatever the manager returned.
        const sendInfo = (experimentInfo: JSON): void => {
            res.send(JSON.stringify(experimentInfo));
        };
        // Failure path: covers both a rejected lookup and an error thrown while sending.
        const reportFailure = (err: Error): void => {
            this.handleError(err, res);
        };
        this.experimentsManager.getExperimentsInfo().then(sendInfo).catch(reportFailure);
    });
}
private
setErrorPathForFailedJob
(
jobInfo
:
TrialJobInfo
):
TrialJobInfo
{
private
setErrorPathForFailedJob
(
jobInfo
:
TrialJobInfo
):
TrialJobInfo
{
if
(
jobInfo
===
undefined
||
jobInfo
.
status
!==
'
FAILED
'
||
jobInfo
.
logPath
===
undefined
)
{
if
(
jobInfo
===
undefined
||
jobInfo
.
status
!==
'
FAILED
'
||
jobInfo
.
logPath
===
undefined
)
{
return
jobInfo
;
return
jobInfo
;
...
@@ -311,6 +327,14 @@ class NNIRestHandler {
...
@@ -311,6 +327,14 @@ class NNIRestHandler {
return
jobInfo
;
return
jobInfo
;
}
}
/**
 * Attach the trial job message to a job info record.
 *
 * When jobInfo is defined, its `message` field is set to the value
 * nniManager.getTrialJobMessage() returns for the job's trialJobId.
 * An undefined jobInfo is returned untouched.
 *
 * @param jobInfo job record to annotate (mutated in place)
 * @returns the same jobInfo reference that was passed in
 */
private setMessageforJob(jobInfo: TrialJobInfo): TrialJobInfo {
    if (jobInfo !== undefined) {
        jobInfo.message = this.nniManager.getTrialJobMessage(jobInfo.trialJobId);
    }
    return jobInfo;
}
}
}
export
function
createRestHandler
(
rs
:
NNIRestServer
):
Router
{
export
function
createRestHandler
(
rs
:
NNIRestServer
):
Router
{
...
...
ts/nni_manager/rest_server/restValidationSchemas.ts
View file @
b40e3db7
...
@@ -32,6 +32,9 @@ export namespace ValidationSchemas {
...
@@ -32,6 +32,9 @@ export namespace ValidationSchemas {
outputDir
:
joi
.
string
(),
outputDir
:
joi
.
string
(),
cpuNum
:
joi
.
number
().
min
(
1
),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
memoryMB
:
joi
.
number
().
min
(
100
),
// ############## adl cpu and memory config ###############
memorySize
:
joi
.
string
(),
// ########################################################
gpuNum
:
joi
.
number
().
min
(
0
),
gpuNum
:
joi
.
number
().
min
(
0
),
command
:
joi
.
string
().
min
(
1
),
command
:
joi
.
string
().
min
(
1
),
virtualCluster
:
joi
.
string
(),
virtualCluster
:
joi
.
string
(),
...
@@ -93,6 +96,20 @@ export namespace ValidationSchemas {
...
@@ -93,6 +96,20 @@ export namespace ValidationSchemas {
minFailedTaskCount
:
joi
.
number
(),
minFailedTaskCount
:
joi
.
number
(),
minSucceededTaskCount
:
joi
.
number
()
minSucceededTaskCount
:
joi
.
number
()
})
})
}),
imagePullSecrets
:
joi
.
array
({
name
:
joi
.
string
().
min
(
1
).
required
()
}),
// ############## adl ###############
adaptive
:
joi
.
boolean
(),
checkpoint
:
joi
.
object
({
storageClass
:
joi
.
string
().
min
(
1
).
required
(),
storageSize
:
joi
.
string
().
min
(
1
).
required
()
}),
nfs
:
joi
.
object
({
server
:
joi
.
string
().
min
(
1
).
required
(),
path
:
joi
.
string
().
min
(
1
).
required
(),
containerMountPath
:
joi
.
string
().
min
(
1
).
required
()
})
})
}),
}),
pai_yarn_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
pai_yarn_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
...
...
Prev
1
…
5
6
7
8
9
10
11
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment