Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ef15fc81
"...composable_kernel_onnxruntime.git" did not exist on "1f543bfa79de0687f9b6144b5dea10f4190c8892"
Unverified
Commit
ef15fc81
authored
Jun 21, 2021
by
liuzhe-lz
Committed by
GitHub
Jun 21, 2021
Browse files
Bump node.js version to v16 (#3828)
parent
b2225436
Changes
45
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
100 additions
and
852 deletions
+100
-852
pipelines/fast-test.yml
pipelines/fast-test.yml
+24
-5
setup_ts.py
setup_ts.py
+1
-1
ts/nni_manager/.eslintrc
ts/nni_manager/.eslintrc
+5
-3
ts/nni_manager/common/utils.ts
ts/nni_manager/common/utils.ts
+1
-1
ts/nni_manager/core/nniTensorboardManager.ts
ts/nni_manager/core/nniTensorboardManager.ts
+1
-1
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+1
-1
ts/nni_manager/core/test/ipcInterface.test.ts
ts/nni_manager/core/test/ipcInterface.test.ts
+1
-1
ts/nni_manager/package.json
ts/nni_manager/package.json
+45
-53
ts/nni_manager/rest_server/restHandler.ts
ts/nni_manager/rest_server/restHandler.ts
+7
-4
ts/nni_manager/rest_server/restValidationSchemas.ts
ts/nni_manager/rest_server/restValidationSchemas.ts
+2
-2
ts/nni_manager/training_service/dlts/dltsClusterConfig.ts
ts/nni_manager/training_service/dlts/dltsClusterConfig.ts
+0
-14
ts/nni_manager/training_service/dlts/dltsData.ts
ts/nni_manager/training_service/dlts/dltsData.ts
+0
-8
ts/nni_manager/training_service/dlts/dltsJobConfig.ts
ts/nni_manager/training_service/dlts/dltsJobConfig.ts
+0
-45
ts/nni_manager/training_service/dlts/dltsJobRestServer.ts
ts/nni_manager/training_service/dlts/dltsJobRestServer.ts
+0
-77
ts/nni_manager/training_service/dlts/dltsTrainingService.ts
ts/nni_manager/training_service/dlts/dltsTrainingService.ts
+0
-578
ts/nni_manager/training_service/dlts/dltsTrialConfig.ts
ts/nni_manager/training_service/dlts/dltsTrialConfig.ts
+0
-15
ts/nni_manager/training_service/dlts/dltsTrialJobDetail.ts
ts/nni_manager/training_service/dlts/dltsTrialJobDetail.ts
+0
-31
ts/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+3
-3
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
...ervice/reusable/environments/openPaiEnvironmentService.ts
+1
-1
ts/nni_manager/training_service/reusable/storageService.ts
ts/nni_manager/training_service/reusable/storageService.ts
+8
-8
No files found.
pipelines/fast-test.yml
View file @
ef15fc81
...
@@ -87,6 +87,10 @@ stages:
...
@@ -87,6 +87,10 @@ stages:
variables
:
variables
:
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
steps
:
steps
:
-
task
:
NodeTool@0
inputs
:
versionSpec
:
16.3.0
displayName
:
Configure Node.js version
-
task
:
Cache@2
-
task
:
Cache@2
inputs
:
inputs
:
key
:
'
yarn
|
"$(Agent.OS)"
|
ts/**/yarn.lock,
!**/node_modules/**'
key
:
'
yarn
|
"$(Agent.OS)"
|
ts/**/yarn.lock,
!**/node_modules/**'
...
@@ -123,6 +127,11 @@ stages:
...
@@ -123,6 +127,11 @@ stages:
versionSpec
:
3.8
versionSpec
:
3.8
displayName
:
Configure Python version
displayName
:
Configure Python version
-
task
:
NodeTool@0
inputs
:
versionSpec
:
16.3.0
displayName
:
Configure Node.js version
-
script
:
|
-
script
:
|
sudo apt-get install -y pandoc
sudo apt-get install -y pandoc
sudo apt-get remove swig -y
sudo apt-get remove swig -y
...
@@ -201,13 +210,17 @@ stages:
...
@@ -201,13 +210,17 @@ stages:
PIP_CACHE_DIR
:
$(Pipeline.Workspace)/.pip
PIP_CACHE_DIR
:
$(Pipeline.Workspace)/.pip
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
# This platform runs integration test first.
steps
:
steps
:
-
task
:
UsePythonVersion@0
-
task
:
UsePythonVersion@0
inputs
:
inputs
:
versionSpec
:
3.6
versionSpec
:
3.6
displayName
:
Configure Python version
displayName
:
Configure Python version
-
task
:
NodeTool@0
inputs
:
versionSpec
:
16.3.0
displayName
:
Configure Node.js version
-
script
:
|
-
script
:
|
sudo apt-get install -y pandoc
sudo apt-get install -y pandoc
sudo apt-get remove swig -y
sudo apt-get remove swig -y
...
@@ -283,14 +296,17 @@ stages:
...
@@ -283,14 +296,17 @@ stages:
PIP_CACHE_DIR
:
$(Pipeline.Workspace)/.pip
PIP_CACHE_DIR
:
$(Pipeline.Workspace)/.pip
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
# This platform runs TypeScript unit test first.
steps
:
steps
:
-
task
:
UsePythonVersion@0
-
task
:
UsePythonVersion@0
inputs
:
inputs
:
versionSpec
:
3.8
versionSpec
:
3.8
displayName
:
Configure Python version
displayName
:
Configure Python version
-
task
:
NodeTool@0
inputs
:
versionSpec
:
16.3.0
displayName
:
Configure Node.js version
-
script
:
|
-
script
:
|
brew install swig@3
brew install swig@3
rm -f /usr/local/bin/swig
rm -f /usr/local/bin/swig
...
@@ -361,14 +377,17 @@ stages:
...
@@ -361,14 +377,17 @@ stages:
PIP_CACHE_DIR
:
$(Pipeline.Workspace)/.pip
PIP_CACHE_DIR
:
$(Pipeline.Workspace)/.pip
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
YARN_CACHE_FOLDER
:
$(Pipeline.Workspace)/.yarn
# This platform runs Python unit test first.
steps
:
steps
:
-
task
:
UsePythonVersion@0
-
task
:
UsePythonVersion@0
inputs
:
inputs
:
versionSpec
:
3.8
versionSpec
:
3.8
displayName
:
Configure Python version
displayName
:
Configure Python version
-
task
:
NodeTool@0
inputs
:
versionSpec
:
16.3.0
displayName
:
Configure Node.js version
-
task
:
Cache@2
-
task
:
Cache@2
inputs
:
inputs
:
key
:
'
python
|
"$(Agent.OS)"
|
dependencies/*.txt'
key
:
'
python
|
"$(Agent.OS)"
|
dependencies/*.txt'
...
...
setup_ts.py
View file @
ef15fc81
...
@@ -22,7 +22,7 @@ import tarfile
...
@@ -22,7 +22,7 @@ import tarfile
from
zipfile
import
ZipFile
from
zipfile
import
ZipFile
node_version
=
'v1
0.2
3.0'
node_version
=
'v1
6.
3.0'
yarn_version
=
'v1.22.10'
yarn_version
=
'v1.22.10'
...
...
ts/nni_manager/.eslintrc
View file @
ef15fc81
...
@@ -24,14 +24,16 @@
...
@@ -24,14 +24,16 @@
"@typescript-eslint/no-inferrable-types": 0,
"@typescript-eslint/no-inferrable-types": 0,
"no-inner-declarations": 0,
"no-inner-declarations": 0,
"@typescript-eslint/explicit-function-return-type": "error",
"@typescript-eslint/explicit-function-return-type": "error",
"@typescript-eslint/no-var-requires": 0,
"@typescript-eslint/no-non-null-assertion": 0,
"@typescript-eslint/no-unused-vars": [
"@typescript-eslint/no-unused-vars": [
"
error
",
"
off
",
{
{
"argsIgnorePattern": "^_"
"argsIgnorePattern": "^_"
}
}
],
],
"@typescript-eslint/no-var-requires": 0,
"@typescript-eslint/no-use-before-define": 0
"@typescript-eslint/no-non-null-assertion": 0
},
},
"ignorePatterns": [
"ignorePatterns": [
"node_modules/",
"node_modules/",
...
...
ts/nni_manager/common/utils.ts
View file @
ef15fc81
...
@@ -56,7 +56,7 @@ function mkDirP(dirPath: string): Promise<void> {
...
@@ -56,7 +56,7 @@ function mkDirP(dirPath: string): Promise<void> {
}
else
{
}
else
{
const
parent
:
string
=
path
.
dirname
(
dirPath
);
const
parent
:
string
=
path
.
dirname
(
dirPath
);
mkDirP
(
parent
).
then
(()
=>
{
mkDirP
(
parent
).
then
(()
=>
{
fs
.
mkdir
(
dirPath
,
(
err
:
Error
)
=>
{
fs
.
mkdir
(
dirPath
,
(
err
:
Error
|
null
)
=>
{
if
(
err
)
{
if
(
err
)
{
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
}
else
{
}
else
{
...
...
ts/nni_manager/core/nniTensorboardManager.ts
View file @
ef15fc81
...
@@ -70,7 +70,7 @@ class NNITensorboardManager implements TensorboardManager {
...
@@ -70,7 +70,7 @@ class NNITensorboardManager implements TensorboardManager {
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
const
alive
:
boolean
=
await
isAlive
(
tensorboardProc
.
pid
);
const
alive
:
boolean
=
await
isAlive
(
tensorboardProc
.
pid
);
if
(
alive
)
{
if
(
alive
)
{
process
.
kill
(
-
tensorboardProc
.
pid
);
process
.
kill
(
-
tensorboardProc
.
pid
!
);
}
}
this
.
setTensorboardTaskStatus
(
tensorboardTask
,
'
ERROR
'
);
this
.
setTensorboardTaskStatus
(
tensorboardTask
,
'
ERROR
'
);
});
});
...
...
ts/nni_manager/core/nnimanager.ts
View file @
ef15fc81
...
@@ -490,7 +490,7 @@ class NNIManager implements Manager {
...
@@ -490,7 +490,7 @@ class NNIManager implements Manager {
};
};
const
newEnv
=
Object
.
assign
({},
process
.
env
,
nniEnv
);
const
newEnv
=
Object
.
assign
({},
process
.
env
,
nniEnv
);
const
tunerProc
:
ChildProcess
=
getTunerProc
(
command
,
stdio
,
newCwd
,
newEnv
);
const
tunerProc
:
ChildProcess
=
getTunerProc
(
command
,
stdio
,
newCwd
,
newEnv
);
this
.
dispatcherPid
=
tunerProc
.
pid
;
this
.
dispatcherPid
=
tunerProc
.
pid
!
;
this
.
dispatcher
=
createDispatcherInterface
(
tunerProc
);
this
.
dispatcher
=
createDispatcherInterface
(
tunerProc
);
return
;
return
;
...
...
ts/nni_manager/core/test/ipcInterface.test.ts
View file @
ef15fc81
...
@@ -30,7 +30,7 @@ function runProcess(): Promise<Error | null> {
...
@@ -30,7 +30,7 @@ function runProcess(): Promise<Error | null> {
if
(
code
!==
0
)
{
if
(
code
!==
0
)
{
deferred
.
resolve
(
new
Error
(
`return code:
${
code
}
`
));
deferred
.
resolve
(
new
Error
(
`return code:
${
code
}
`
));
}
else
{
}
else
{
let
str
=
proc
.
stdout
.
read
().
toString
();
let
str
=
proc
.
stdout
!
.
read
().
toString
();
if
(
str
.
search
(
"
\r\n
"
)
!=-
1
){
if
(
str
.
search
(
"
\r\n
"
)
!=-
1
){
sentCommands
=
str
.
split
(
"
\r\n
"
);
sentCommands
=
str
.
split
(
"
\r\n
"
);
}
}
...
...
ts/nni_manager/package.json
View file @
ef15fc81
...
@@ -11,79 +11,71 @@
...
@@ -11,79 +11,71 @@
},
},
"license"
:
"MIT"
,
"license"
:
"MIT"
,
"dependencies"
:
{
"dependencies"
:
{
"azure-storage"
:
"^2.10.
2
"
,
"azure-storage"
:
"^2.10.
4
"
,
"child-process-promise"
:
"^2.2.1"
,
"child-process-promise"
:
"^2.2.1"
,
"express"
:
"^4.1
6.3
"
,
"express"
:
"^4.1
7.1
"
,
"express-joi-validator"
:
"^2.0.
0
"
,
"express-joi-validator"
:
"^2.0.
1
"
,
"ignore"
:
"^5.1.
4
"
,
"ignore"
:
"^5.1.
8
"
,
"js-base64"
:
"^
2.4.9
"
,
"js-base64"
:
"^
3.6.1
"
,
"kubernetes-client"
:
"^6.
5.0
"
,
"kubernetes-client"
:
"^6.
12.1
"
,
"lockfile"
:
"^1.0.4"
,
"lockfile"
:
"^1.0.4"
,
"python-shell"
:
"^
2
.0.
1
"
,
"python-shell"
:
"^
3
.0.
0
"
,
"rx"
:
"^4.1.0"
,
"rx"
:
"^4.1.0"
,
"sqlite3"
:
"5.0.
0
"
,
"sqlite3"
:
"5.0.
2
"
,
"ssh2"
:
"^
0.8.9
"
,
"ssh2"
:
"^
1.1.0
"
,
"stream-buffers"
:
"^3.0.2"
,
"stream-buffers"
:
"^3.0.2"
,
"tail-stream"
:
"^0.3.4"
,
"tail-stream"
:
"^0.3.4"
,
"tar"
:
"^6.
0.2
"
,
"tar"
:
"^6.
1.0
"
,
"tree-kill"
:
"^1.2.2"
,
"tree-kill"
:
"^1.2.2"
,
"ts-deferred"
:
"^1.0.4"
,
"ts-deferred"
:
"^1.0.4"
,
"typescript-ioc"
:
"^1.2.4"
,
"typescript-ioc"
:
"^1.2.6"
,
"typescript-string-operations"
:
"^1.3.1"
,
"typescript-string-operations"
:
"^1.4.1"
,
"webhdfs"
:
"^1.2.0"
,
"ws"
:
"^7.4.6"
"ws"
:
"^7.4.6"
},
},
"devDependencies"
:
{
"devDependencies"
:
{
"@types/chai"
:
"^4.
1.4
"
,
"@types/chai"
:
"^4.
2.18
"
,
"@types/chai-as-promised"
:
"^7.1.0"
,
"@types/chai-as-promised"
:
"^7.1.0"
,
"@types/express"
:
"^4.1
6.0
"
,
"@types/express"
:
"^4.1
7.2
"
,
"@types/glob"
:
"^7.1.
1
"
,
"@types/glob"
:
"^7.1.
3
"
,
"@types/js-base64"
:
"^
2
.3.1"
,
"@types/js-base64"
:
"^
3
.3.1"
,
"@types/js-yaml"
:
"^
3.12.5
"
,
"@types/js-yaml"
:
"^
4.0.1
"
,
"@types/lockfile"
:
"^1.0.0"
,
"@types/lockfile"
:
"^1.0.0"
,
"@types/mocha"
:
"^8.
0.3
"
,
"@types/mocha"
:
"^8.
2.2
"
,
"@types/node"
:
"
10
.12.1
8
"
,
"@types/node"
:
"
^15
.12.1"
,
"@types/request"
:
"^2.4
7.1
"
,
"@types/request"
:
"^2.4
8.5
"
,
"@types/rx"
:
"^4.1.
1
"
,
"@types/rx"
:
"^4.1.
2
"
,
"@types/sqlite3"
:
"^3.1.
3
"
,
"@types/sqlite3"
:
"^3.1.
7
"
,
"@types/ssh2"
:
"^0.5.
35
"
,
"@types/ssh2"
:
"^0.5.
46
"
,
"@types/stream-buffers"
:
"^3.0.
2
"
,
"@types/stream-buffers"
:
"^3.0.
3
"
,
"@types/tar"
:
"^4.0.
3
"
,
"@types/tar"
:
"^4.0.
4
"
,
"@types/tmp"
:
"^0.
0.33
"
,
"@types/tmp"
:
"^0.
2.0
"
,
"@types/ws"
:
"^7.
2.5
"
,
"@types/ws"
:
"^7.
4.4
"
,
"@typescript-eslint/eslint-plugin"
:
"^2.10.0"
,
"@typescript-eslint/eslint-plugin"
:
"^2.10.0"
,
"@typescript-eslint/parser"
:
"^
2.10
.0"
,
"@typescript-eslint/parser"
:
"^
4.26
.0"
,
"chai"
:
"^4.
1.2
"
,
"chai"
:
"^4.
3.4
"
,
"chai-as-promised"
:
"^7.1.1"
,
"chai-as-promised"
:
"^7.1.1"
,
"eslint"
:
"^
6.
7.2"
,
"eslint"
:
"^7.2
8.0
"
,
"glob"
:
"^7.1.
3
"
,
"glob"
:
"^7.1.
7
"
,
"mocha"
:
"^8.
1.3
"
,
"mocha"
:
"^8.
4.0
"
,
"npx"
:
"^10.2.2"
,
"npx"
:
"^10.2.2"
,
"nyc"
:
"^15.
0
.0"
,
"nyc"
:
"^15.
1
.0"
,
"request"
:
"^2.8
7.0
"
,
"request"
:
"^2.8
8.2
"
,
"rmdir"
:
"^1.2.0"
,
"rmdir"
:
"^1.2.0"
,
"tmp"
:
"^0.
0.33
"
,
"tmp"
:
"^0.
2.1
"
,
"ts-node"
:
"^
7
.0.0"
,
"ts-node"
:
"^
10
.0.0"
,
"typescript"
:
"^
3.2
.2"
"typescript"
:
"^
4.3
.2"
},
},
"resolutions"
:
{
"resolutions"
:
{
"mem"
:
"^4.0.0"
,
"acorn"
:
">=8.3.0"
,
"lodash"
:
">=4.17.13"
,
"hoek"
:
">=6.1.3"
,
"lodash.merge"
:
">=4.6.2"
,
"node.extend"
:
">=1.1.8"
,
"node.extend"
:
"^1.1.7"
,
"npm"
:
">=7.16.0"
,
"hoek"
:
"^4.2.1"
,
"y18n"
:
">=5.0.8"
,
"js-yaml"
:
"^3.13.1"
,
"yargs-parser"
:
">=20.2.7"
,
"node-forge"
:
">=0.10.0"
,
"joi"
:
">=17.4.0"
"dot-prop"
:
"^4.2.1"
,
"npm"
:
">=6.14.8"
,
"yargs"
:
"~16.0.3"
,
"yargs-parser"
:
">=20.2.0"
,
"y18n"
:
">=5.0.5"
,
"acorn"
:
">=8.0.4"
,
"serialize-javascript"
:
">=5.0.1"
},
},
"engines"
:
{
"engines"
:
{
"node"
:
"
>=10.0
.0"
"node"
:
"
^16.3
.0"
},
},
"nyc"
:
{
"nyc"
:
{
"include"
:
[
"include"
:
[
...
...
ts/nni_manager/rest_server/restHandler.ts
View file @
ef15fc81
...
@@ -17,6 +17,9 @@ import { TensorboardManager, TensorboardTaskInfo } from '../common/tensorboardMa
...
@@ -17,6 +17,9 @@ import { TensorboardManager, TensorboardTaskInfo } from '../common/tensorboardMa
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
import
{
MetricType
}
from
'
../common/datastore
'
;
import
{
ProfileUpdateType
}
from
'
../common/manager
'
;
import
{
LogType
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
const
expressJoi
=
require
(
'
express-joi-validator
'
);
const
expressJoi
=
require
(
'
express-joi-validator
'
);
...
@@ -139,7 +142,7 @@ class NNIRestHandler {
...
@@ -139,7 +142,7 @@ class NNIRestHandler {
private
updateExperimentProfile
(
router
:
Router
):
void
{
private
updateExperimentProfile
(
router
:
Router
):
void
{
router
.
put
(
'
/experiment
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
put
(
'
/experiment
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
updateExperimentProfile
(
req
.
body
,
req
.
query
.
update_type
).
then
(()
=>
{
this
.
nniManager
.
updateExperimentProfile
(
req
.
body
,
req
.
query
.
update_type
as
ProfileUpdateType
).
then
(()
=>
{
res
.
send
();
res
.
send
();
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
this
.
handleError
(
err
,
res
);
this
.
handleError
(
err
,
res
);
...
@@ -219,7 +222,7 @@ class NNIRestHandler {
...
@@ -219,7 +222,7 @@ class NNIRestHandler {
private
listTrialJobs
(
router
:
Router
):
void
{
private
listTrialJobs
(
router
:
Router
):
void
{
router
.
get
(
'
/trial-jobs
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
get
(
'
/trial-jobs
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
listTrialJobs
(
req
.
query
.
status
).
then
((
jobInfos
:
TrialJobInfo
[])
=>
{
this
.
nniManager
.
listTrialJobs
(
req
.
query
.
status
as
TrialJobStatus
).
then
((
jobInfos
:
TrialJobInfo
[])
=>
{
jobInfos
.
forEach
((
trialJob
:
TrialJobInfo
)
=>
{
jobInfos
.
forEach
((
trialJob
:
TrialJobInfo
)
=>
{
this
.
setErrorPathForFailedJob
(
trialJob
);
this
.
setErrorPathForFailedJob
(
trialJob
);
});
});
...
@@ -263,7 +266,7 @@ class NNIRestHandler {
...
@@ -263,7 +266,7 @@ class NNIRestHandler {
private
getMetricData
(
router
:
Router
):
void
{
private
getMetricData
(
router
:
Router
):
void
{
router
.
get
(
'
/metric-data/:job_id*?
'
,
async
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
get
(
'
/metric-data/:job_id*?
'
,
async
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
getMetricData
(
req
.
params
.
job_id
,
req
.
query
.
type
).
then
((
metricsData
:
MetricDataRecord
[])
=>
{
this
.
nniManager
.
getMetricData
(
req
.
params
.
job_id
,
req
.
query
.
type
as
MetricType
).
then
((
metricsData
:
MetricDataRecord
[])
=>
{
res
.
send
(
metricsData
);
res
.
send
(
metricsData
);
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
this
.
handleError
(
err
,
res
);
this
.
handleError
(
err
,
res
);
...
@@ -295,7 +298,7 @@ class NNIRestHandler {
...
@@ -295,7 +298,7 @@ class NNIRestHandler {
private
getTrialLog
(
router
:
Router
):
void
{
private
getTrialLog
(
router
:
Router
):
void
{
router
.
get
(
'
/trial-log/:id/:type
'
,
async
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
get
(
'
/trial-log/:id/:type
'
,
async
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
getTrialLog
(
req
.
params
.
id
,
req
.
params
.
type
).
then
((
log
:
string
)
=>
{
this
.
nniManager
.
getTrialLog
(
req
.
params
.
id
,
req
.
params
.
type
as
LogType
).
then
((
log
:
string
)
=>
{
if
(
log
===
''
)
{
if
(
log
===
''
)
{
log
=
'
No logs available.
'
log
=
'
No logs available.
'
}
}
...
...
ts/nni_manager/rest_server/restValidationSchemas.ts
View file @
ef15fc81
...
@@ -82,7 +82,7 @@ export namespace ValidationSchemas {
...
@@ -82,7 +82,7 @@ export namespace ValidationSchemas {
gpuNum
:
joi
.
number
().
min
(
0
).
required
(),
gpuNum
:
joi
.
number
().
min
(
0
).
required
(),
command
:
joi
.
string
().
min
(
1
).
required
()
command
:
joi
.
string
().
min
(
1
).
required
()
}),
}),
taskRoles
:
joi
.
array
({
taskRoles
:
joi
.
array
(
).
items
(
{
name
:
joi
.
string
().
min
(
1
),
name
:
joi
.
string
().
min
(
1
),
taskNum
:
joi
.
number
().
min
(
1
).
required
(),
taskNum
:
joi
.
number
().
min
(
1
).
required
(),
image
:
joi
.
string
().
min
(
1
),
image
:
joi
.
string
().
min
(
1
),
...
@@ -98,7 +98,7 @@ export namespace ValidationSchemas {
...
@@ -98,7 +98,7 @@ export namespace ValidationSchemas {
minSucceededTaskCount
:
joi
.
number
()
minSucceededTaskCount
:
joi
.
number
()
})
})
}),
}),
imagePullSecrets
:
joi
.
array
({
imagePullSecrets
:
joi
.
array
(
).
items
(
{
name
:
joi
.
string
().
min
(
1
).
required
()
name
:
joi
.
string
().
min
(
1
).
required
()
}),
}),
// ############## adl ###############
// ############## adl ###############
...
...
ts/nni_manager/training_service/dlts/dltsClusterConfig.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
export
interface
DLTSClusterConfig
{
dashboard
:
string
;
cluster
:
string
;
team
:
string
;
email
:
string
;
password
:
string
;
gpuType
?:
string
;
}
ts/nni_manager/training_service/dlts/dltsData.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
export
const
DLTS_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=dlts NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& cd '{6}' && python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{7}' \
--nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}'`
;
ts/nni_manager/training_service/dlts/dltsJobConfig.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
DLTSClusterConfig
}
from
"
./dltsClusterConfig
"
;
export
class
DLTSJobConfig
{
public
readonly
team
:
string
;
public
readonly
userName
:
string
;
public
readonly
vcName
:
string
;
public
readonly
gpuType
:
string
;
public
readonly
jobType
=
"
training
"
;
public
readonly
jobtrainingtype
=
"
RegularJob
"
;
public
readonly
ssh
=
false
;
public
readonly
ipython
=
false
;
public
readonly
tensorboard
=
false
;
public
readonly
workPath
=
''
;
public
readonly
enableworkpath
=
true
;
public
readonly
dataPath
=
''
;
public
readonly
enabledatapath
=
false
;
public
readonly
jobPath
=
''
;
public
readonly
enablejobpath
=
true
;
public
readonly
mountpoints
=
[];
public
readonly
env
=
[{
name
:
'
TMPDIR
'
,
value
:
'
$HOME/tmp
'
}]
public
readonly
hostNetwork
=
false
;
public
readonly
useGPUTopology
=
false
;
public
readonly
isPrivileged
=
false
;
public
readonly
hostIPC
=
false
;
public
readonly
preemptionAllowed
=
"
False
"
public
constructor
(
clusterConfig
:
DLTSClusterConfig
,
public
readonly
jobName
:
string
,
public
readonly
resourcegpu
:
number
,
public
readonly
image
:
string
,
public
readonly
cmd
:
string
,
public
readonly
interactivePorts
:
number
[],
)
{
if
(
clusterConfig
.
gpuType
===
undefined
)
{
throw
Error
(
'
GPU type not fetched
'
)
}
this
.
vcName
=
this
.
team
=
clusterConfig
.
team
this
.
gpuType
=
clusterConfig
.
gpuType
this
.
userName
=
clusterConfig
.
email
}
}
ts/nni_manager/training_service/dlts/dltsJobRestServer.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
{
Request
,
Response
,
Router
}
from
'
express
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
DLTSTrainingService
}
from
'
./dltsTrainingService
'
;
export
interface
ParameterFileMeta
{
readonly
experimentId
:
string
;
readonly
trialId
:
string
;
readonly
filePath
:
string
;
}
/**
* DLTS Training service Rest server, provides rest API to support DLTS job metrics update
*
*/
@
component
.
Singleton
export
class
DLTSJobRestServer
extends
ClusterJobRestServer
{
private
parameterFileMetaList
:
ParameterFileMeta
[]
=
[];
@
Inject
private
readonly
dltsTrainingService
:
DLTSTrainingService
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor
()
{
super
();
this
.
dltsTrainingService
=
component
.
get
(
DLTSTrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for
(
const
singleMetric
of
metrics
)
{
this
.
dltsTrainingService
.
MetricsEmitter
.
emit
(
'
metric
'
,
{
id
:
jobId
,
data
:
singleMetric
});
}
}
protected
createRestHandler
():
Router
{
const
router
:
Router
=
super
.
createRestHandler
();
router
.
post
(
`/parameter-file-meta`
,
(
req
:
Request
,
res
:
Response
)
=>
{
try
{
this
.
log
.
info
(
`POST /parameter-file-meta, body is
${
JSON
.
stringify
(
req
.
body
)}
`
);
this
.
parameterFileMetaList
.
push
(
req
.
body
);
res
.
send
();
}
catch
(
err
)
{
this
.
log
.
error
(
`POST parameter-file-meta error:
${
err
}
`
);
res
.
status
(
500
);
res
.
send
(
err
.
message
);
}
});
router
.
get
(
`/parameter-file-meta`
,
(
req
:
Request
,
res
:
Response
)
=>
{
try
{
this
.
log
.
info
(
`GET /parameter-file-meta`
);
res
.
send
(
this
.
parameterFileMetaList
);
}
catch
(
err
)
{
this
.
log
.
error
(
`GET parameter-file-meta error:
${
err
}
`
);
res
.
status
(
500
);
res
.
send
(
err
.
message
);
}
});
return
router
;
}
}
ts/nni_manager/training_service/dlts/dltsTrainingService.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
import
{
DLTS_TRIAL_COMMAND_FORMAT
}
from
'
./dltsData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
delay
,
uniqueString
,
getIPV4Address
,
getExperimentRootDir
,
getVersion
,
generateParamFileName
}
from
'
../../common/utils
'
;
import
{
DLTSJobRestServer
}
from
'
./dltsJobRestServer
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../training_service/common/trialConfigMetadataKey
'
;
import
{
DLTSJobConfig
}
from
'
./dltsJobConfig
'
;
import
{
DLTSClusterConfig
}
from
'
./dltsClusterConfig
'
;
import
{
DLTSTrialConfig
}
from
'
./dltsTrialConfig
'
;
import
{
DLTSTrialJobDetail
}
from
'
./dltsTrialJobDetail
'
;
@
component
.
Singleton
class
DLTSTrainingService
implements
TrainingService
{
private
readonly
log
!
:
Logger
;
private
readonly
metricsEmitter
:
EventEmitter
;
//private readonly expRootDir: string;
private
readonly
jobQueue
:
string
[];
private
stopping
:
boolean
=
false
;
private
readonly
experimentId
!
:
string
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
=
'
none
'
;
private
isMultiPhase
:
boolean
=
false
;
private
dltsRestServerHost
:
string
;
private
dltsRestServerPort
?:
number
;
private
jobMode
:
boolean
;
private
readonly
trialJobsMap
:
Map
<
string
,
DLTSTrialJobDetail
>
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
dltsClusterConfig
?:
DLTSClusterConfig
;
private
dltsTrialConfig
?:
DLTSTrialConfig
;
constructor
()
{
this
.
log
=
getLogger
();
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
trialJobsMap
=
new
Map
();
this
.
jobQueue
=
[];
this
.
experimentId
=
getExperimentId
();
this
.
dltsRestServerHost
=
getIPV4Address
();
this
.
jobMode
=
'
DLTS_JOB_ID
'
in
process
.
env
;
this
.
log
.
info
(
`Construct DLTS training service in
${
this
.
jobMode
?
'
job mode
'
:
'
local mode
'
}
.`
);
}
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run DLTS training service.
'
);
const
restServer
:
DLTSJobRestServer
=
component
.
get
(
DLTSJobRestServer
);
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`DLTS Training service rest server listening on:
${
restServer
.
endPoint
}
`
);
if
(
this
.
jobMode
)
{
await
this
.
exposeRestServerPort
(
restServer
.
clusterRestServerPort
);
}
else
{
this
.
dltsRestServerPort
=
restServer
.
clusterRestServerPort
}
await
Promise
.
all
([
this
.
statusCheckingLoop
(),
this
.
submitJobLoop
()]);
this
.
log
.
info
(
'
DLTS training service exit.
'
);
}
private
async
exposeRestServerPort
(
port
:
number
):
Promise
<
void
>
{
if
(
this
.
dltsClusterConfig
==
null
)
{
throw
Error
(
'
Cluster config is not set
'
);
}
const
{
dashboard
,
cluster
,
email
,
password
}
=
this
.
dltsClusterConfig
;
const
jobId
=
process
.
env
[
'
DLTS_JOB_ID
'
]
+
''
;
const
uri
=
`
${
dashboard
}
api/clusters/
${
cluster
}
/jobs/
${
jobId
}
/endpoints`
;
const
qs
=
{
email
,
password
};
do
{
this
.
log
.
debug
(
'
Checking endpoints
'
);
const
endpoints
=
await
new
Promise
((
resolve
,
reject
)
=>
{
request
.
get
(
uri
,
{
qs
,
json
:
true
},
function
(
error
,
response
,
body
)
{
if
(
error
)
{
reject
(
error
);
}
else
{
resolve
(
body
);
}
});
});
this
.
log
.
debug
(
'
Endpoints: %o
'
,
endpoints
);
if
(
Array
.
isArray
(
endpoints
))
{
const
restServerEndpoint
=
endpoints
.
find
(({
podPort
})
=>
podPort
===
port
);
if
(
restServerEndpoint
==
null
)
{
this
.
log
.
debug
(
'
Exposing %d
'
,
port
);
await
new
Promise
((
resolve
,
reject
)
=>
{
request
.
post
(
uri
,
{
qs
,
json
:
true
,
body
:
{
endpoints
:
[{
name
:
"
nni-rest-server
"
,
podPort
:
port
}]
}
},
function
(
error
)
{
if
(
error
)
{
reject
(
error
);
}
else
{
resolve
();
}
});
});
}
else
if
(
restServerEndpoint
[
'
status
'
]
===
'
running
'
)
{
// We get an exposed restserver port
this
.
dltsRestServerHost
=
restServerEndpoint
[
'
nodeName
'
];
this
.
dltsRestServerPort
=
restServerEndpoint
[
'
port
'
];
break
;
}
}
}
while
(
await
new
Promise
(
resolve
=>
setTimeout
(
resolve
,
1000
,
true
)));
}
private
async
statusCheckingLoop
():
Promise
<
void
>
{
while
(
!
this
.
stopping
)
{
const
updateDLTSTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
const
dltsTrialJob
of
this
.
trialJobsMap
.
values
())
{
updateDLTSTrialJobs
.
push
(
this
.
getDLTSTrialJobInfo
(
dltsTrialJob
));
}
await
Promise
.
all
(
updateDLTSTrialJobs
);
// Calcel paused dlts job
const
cancelPausedJobPromises
:
Promise
<
void
>
[]
=
[];
for
(
const
[
trialJobId
,
dltsTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
dltsTrialJob
.
dltsPaused
&&
dltsTrialJob
.
status
===
'
RUNNING
'
)
{
cancelPausedJobPromises
.
push
(
this
.
cancelTrialJob
(
trialJobId
));
}
}
await
Promise
.
all
(
cancelPausedJobPromises
);
const
restServer
:
DLTSJobRestServer
=
component
.
get
(
DLTSJobRestServer
);
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
}
await
delay
(
3000
);
}
}
private
async
getDLTSTrialJobInfo
(
dltsTrialJob
:
DLTSTrialJobDetail
):
Promise
<
void
>
{
if
(
this
.
dltsClusterConfig
==
null
)
{
throw
Error
(
'
Cluster config is not set
'
);
}
const
requestOptions
:
request
.
Options
=
{
uri
:
`
${
this
.
dltsClusterConfig
.
dashboard
}
api/v2/clusters/
${
this
.
dltsClusterConfig
.
cluster
}
/jobs/
${
dltsTrialJob
.
dltsJobId
}
`
,
qs
:
{
email
:
this
.
dltsClusterConfig
.
email
,
password
:
this
.
dltsClusterConfig
.
password
},
json
:
true
};
const
body
=
await
new
Promise
((
resolve
,
reject
)
=>
{
request
(
requestOptions
,
(
error
,
response
,
body
)
=>
{
if
(
error
!=
null
)
{
reject
(
error
)
}
else
{
resolve
(
body
)
}
})
})
as
any
;
void
(():
void
=>
{
switch
(
body
[
'
jobStatus
'
])
{
case
'
unapproved
'
:
case
'
queued
'
:
case
'
scheduling
'
:
dltsTrialJob
.
status
=
"
WAITING
"
;
break
;
case
'
running
'
:
dltsTrialJob
.
status
=
"
RUNNING
"
;
if
(
dltsTrialJob
.
startTime
===
undefined
)
{
dltsTrialJob
.
startTime
=
Date
.
parse
(
body
[
'
jobStatusDetail
'
][
0
][
'
startedAt
'
])
}
if
(
dltsTrialJob
.
url
===
undefined
)
{
dltsTrialJob
.
url
=
`
${
this
.
dltsClusterConfig
.
dashboard
}
job/
${
this
.
dltsClusterConfig
.
team
}
/
${
this
.
dltsClusterConfig
.
cluster
}
/
${
dltsTrialJob
.
dltsJobId
}
`
}
break
;
case
'
finished
'
:
dltsTrialJob
.
status
=
"
SUCCEEDED
"
;
break
;
case
'
failed
'
:
dltsTrialJob
.
status
=
"
FAILED
"
;
break
;
case
'
pausing
'
:
case
'
paused
'
:
dltsTrialJob
.
status
=
"
RUNNING
"
;
dltsTrialJob
.
dltsPaused
=
true
;
break
;
case
'
killing
'
:
case
'
killed
'
:
if
(
dltsTrialJob
.
isEarlyStopped
!==
undefined
)
{
dltsTrialJob
.
status
=
dltsTrialJob
.
isEarlyStopped
===
true
?
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
}
else
{
dltsTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
break
;
default
:
dltsTrialJob
.
status
=
"
UNKNOWN
"
;
}
})
();
}
/**
 * Background loop that drains `jobQueue` by submitting queued trials to DLTS.
 *
 * Trials are taken from the head of the queue one at a time. A trial id is
 * removed only after `submitTrialJobToDLTS` resolves to a truthy value; a falsy
 * result ends the current pass and leaves the id at the head of the queue.
 * Between passes the loop awaits `delay(3000)`, and it exits once
 * `this.stopping` is set.
 */
private async submitJobLoop(): Promise<void> {
    while (!this.stopping) {
        // `submitted` replaces an explicit `break`: a failed submission ends this pass.
        let submitted = true;
        while (submitted && !this.stopping && this.jobQueue.length > 0) {
            const headJobId: string = this.jobQueue[0];
            this.log.info(`Got job ${headJobId}`);
            submitted = await this.submitTrialJobToDLTS(headJobId);
            if (submitted) {
                // The head of the queue has been handed to the cluster; drop it.
                this.jobQueue.shift();
            }
        }
        await delay(3000);
    }
}
/**
 * Lists every trial job currently tracked by this service.
 *
 * @returns a new array holding the values of `trialJobsMap`.
 */
public async listTrialJobs(): Promise<TrialJobDetail[]> {
    const trackedJobs = this.trialJobsMap.values();
    return Array.from(trackedJobs);
}
/**
 * Looks up a single trial job by id.
 *
 * @param trialJobId id of the trial to look up in `trialJobsMap`
 * @returns the stored trial job detail
 * @throws Error when no trial with that id is tracked
 */
public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
    const found = this.trialJobsMap.get(trialJobId);
    if (found !== undefined) {
        return found;
    }
    throw Error(`Trial job ${trialJobId} not found.`);
}
/**
 * Not supported by this training service.
 *
 * Because the method is `async`, the error surfaces as a rejected promise.
 *
 * @throws MethodNotImplementedError always
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    const unsupported = new MethodNotImplementedError();
    throw unsupported;
}
/**
 * Registers a callback for the `'metric'` event of `metricsEmitter`.
 *
 * @param listener callback receiving each emitted `TrialJobMetric`
 */
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
    // `addListener` is the long-form alias of `EventEmitter.on`.
    const emitter = this.metricsEmitter;
    emitter.addListener('metric', listener);
}
/**
 * Unregisters a callback from the `'metric'` event of `metricsEmitter`.
 *
 * @param listener the same function reference that was registered earlier
 */
public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
    // `removeListener` is the long-form alias of `EventEmitter.off`.
    const emitter = this.metricsEmitter;
    emitter.removeListener('metric', listener);
}
/** Exposes the emitter on which `'metric'` listeners are registered. */
public get MetricsEmitter(): EventEmitter {
    const emitter: EventEmitter = this.metricsEmitter;
    return emitter;
}
/**
 * Creates a new trial in `WAITING` state and enqueues it for submission.
 *
 * Nothing is sent to the cluster here: the trial is recorded in
 * `trialJobsMap` and its id is appended to `jobQueue`.
 *
 * @param form application form stored on the new trial job detail
 * @returns the newly created trial job detail
 */
public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
    const newTrialId: string = uniqueString(5);
    const workingDirectory: string = path.join(
        '/nni-experiments',
        getExperimentId(),
        '/trials/',
        newTrialId
    );
    const submitTime: number = Date.now();
    const dltsJobName = `nni_exp_${this.experimentId}_trial_${newTrialId}`;

    const queuedDetail = new DLTSTrialJobDetail(
        newTrialId,
        'WAITING',
        submitTime,
        workingDirectory,
        form,
        dltsJobName
    );

    this.trialJobsMap.set(newTrialId, queuedDetail);
    this.jobQueue.push(newTrialId);
    return queuedDetail;
}
/**
 * Asks DLTS to kill a trial's job by PUTting status `killing` to the job's
 * `/status` endpoint.
 *
 * The `isEarlyStopped` flag is written onto the trial detail before the
 * request is sent, so it is set even if the request later fails.
 *
 * NOTE(review): `dltsJobId` is optional on the trial detail and is assigned
 * when the job is submitted; for a trial that is still queued the URI below
 * would contain `undefined` — confirm whether callers can cancel queued trials.
 *
 * @param trialJobId id of the trial to cancel
 * @param isEarlyStopped marks the cancellation source on the trial detail
 * @throws Error when the trial is unknown or the cluster config is not set
 */
public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
    const target: DLTSTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
    if (target === undefined) {
        throw Error(`cancelTrialJob: trial job id ${trialJobId} not found`);
    }
    const clusterConfig = this.dltsClusterConfig;
    if (clusterConfig === undefined) {
        throw Error('DLTS Cluster config is not initialized');
    }

    const statusUri = `${clusterConfig.dashboard}api/clusters/${clusterConfig.cluster}/jobs/${target.dltsJobId}/status`;
    const credentials = {
        email: clusterConfig.email,
        password: clusterConfig.password
    };
    const killRequest: request.Options = {
        method: 'PUT',
        uri: statusUri,
        qs: credentials,
        body: { status: 'killing' },
        json: true
    };

    // Record the cancellation source on the trial detail
    target.isEarlyStopped = isEarlyStopped;

    await new Promise((resolve, reject) => {
        request(killRequest, (error: Error, response: request.Response, body: any) => {
            if (!error) {
                resolve(body);
                return;
            }
            reject(error);
        });
    });
}
/**
 * Queries the DLTS dashboard for the configured team/cluster and derives the
 * GPU type from the response.
 *
 * The response's `metadata` field is parsed as JSON and the first key of the
 * resulting object is returned.
 *
 * @returns promise resolving to the first key of the parsed metadata
 * @throws Error when the cluster config is not set; the returned promise
 *         rejects on a request error or when the metadata cannot be parsed
 */
private async getGpuType(): Promise<string> {
    const clusterConfig = this.dltsClusterConfig;
    if (clusterConfig === undefined) {
        throw new Error('DLTS Cluster config is not initialized');
    }

    const clusterInfoRequest: request.Options = {
        method: 'GET',
        qs: {
            email: clusterConfig.email,
            password: clusterConfig.password
        },
        uri: `${clusterConfig.dashboard}api/teams/${clusterConfig.team}/clusters/${clusterConfig.cluster}`,
        json: true
    };

    return new Promise<string>((resolve, reject) => {
        request(clusterInfoRequest, (requestError, _response, payload) => {
            if (requestError) {
                reject(requestError);
            } else {
                try {
                    const parsedMetadata = JSON.parse(payload['metadata']);
                    const gpuTypes = Object.keys(parsedMetadata);
                    resolve(gpuTypes[0]);
                } catch (parseError) {
                    reject(parseError);
                }
            }
        });
    });
}
/**
 * Applies one piece of experiment configuration, selected by `key`.
 *
 * Handled keys:
 *  - `NNI_MANAGER_IP`: stores the parsed NNI manager IP config.
 *  - `DLTS_CLUSTER_CONFIG`: stores the parsed cluster config, defaults
 *    `cluster` to `'.default'`, fills missing `email` / `password` / `team`
 *    from the `DLWS_USER_EMAIL` / `DLTS_JOB_TOKEN` / `DLWS_VC_NAME`
 *    environment variables, then sets `gpuType` from `getGpuType()`.
 *  - `TRIAL_CONFIG`: stores the parsed trial config and validates its `codeDir`.
 *  - `VERSION_CHECK`, `MULTI_PHASE`: true only for `'true'` or `'True'`.
 *  - `LOG_COLLECTION`: stored as given.
 *
 * @param key one of the `TrialConfigMetadataKey` values listed above
 * @param value the raw value; JSON for the three config keys
 * @throws Error for an unknown key, for a missing field with no environment
 *         fallback, or when `validateCodeDir` fails (logged, then rethrown)
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    // Shared fallback for cluster-config fields that may come from the environment.
    const envFallback = (field: string, envName: string): string => {
        const fromEnv = process.env[envName];
        if (fromEnv) {
            return fromEnv;
        }
        throw Error('`' + field + '` field in `dltsConfig` is not configured.');
    };

    switch (key) {
        case TrialConfigMetadataKey.NNI_MANAGER_IP:
            this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
            break;

        case TrialConfigMetadataKey.DLTS_CLUSTER_CONFIG: {
            const clusterConfig = <DLTSClusterConfig>JSON.parse(value);
            // Publish the object first: getGpuType() below reads this.dltsClusterConfig.
            this.dltsClusterConfig = clusterConfig;
            if (!clusterConfig.cluster) {
                clusterConfig.cluster = '.default';
            }
            if (!clusterConfig.email) {
                clusterConfig.email = envFallback('email', 'DLWS_USER_EMAIL');
            }
            if (!clusterConfig.password) {
                clusterConfig.password = envFallback('password', 'DLTS_JOB_TOKEN');
            }
            if (!clusterConfig.team) {
                clusterConfig.team = envFallback('team', 'DLWS_VC_NAME');
            }
            clusterConfig.gpuType = await this.getGpuType();
            break;
        }

        case TrialConfigMetadataKey.TRIAL_CONFIG:
            this.dltsTrialConfig = <DLTSTrialConfig>JSON.parse(value);
            // Validate to make sure codeDir doesn't have too many files
            try {
                await validateCodeDir(this.dltsTrialConfig.codeDir);
            } catch (error) {
                this.log.error(error);
                throw error;
            }
            break;

        case TrialConfigMetadataKey.VERSION_CHECK:
            this.versionCheck = (value === 'true' || value === 'True');
            break;

        case TrialConfigMetadataKey.LOG_COLLECTION:
            this.logCollection = value;
            break;

        case TrialConfigMetadataKey.MULTI_PHASE:
            this.isMultiPhase = (value === 'true' || value === 'True');
            break;

        default:
            // Reject unknown keys (message typo "Uknown" corrected)
            throw new Error(`Unknown key: ${key}`);
    }
}
/**
 * Metadata read-back is not implemented for this service.
 *
 * @param _key ignored
 * @returns always the empty string
 */
public async getClusterMetadata(_key: string): Promise<string> {
    const noMetadata = '';
    return noMetadata;
}
/**
 * Shuts the training service down.
 *
 * Sets `this.stopping` and then stops the `DLTSJobRestServer` obtained from
 * the component container.
 *
 * @throws whatever `restServer.stop()` throws, after logging it
 */
public async cleanUp(): Promise<void> {
    this.log.info('Stopping DLTS training service...');
    this.stopping = true;

    const jobRestServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
    try {
        await jobRestServer.stop();
        this.log.info('DLTS Training service rest server stopped successfully.');
    } catch (error) {
        // tslint:disable-next-line: no-unsafe-any
        this.log.error(`DLTS Training service rest server stopped failed, error: ${error.message}`);
        throw error;
    }
}
/**
 * Prepares a trial's local files and submits it to DLTS as a job.
 *
 * Steps:
 *  1. Create `<experiment root>/trials-local/<trialJobId>` and write
 *     `install_nni.sh` plus the hyper-parameter file into it.
 *  2. Build the trial command from `DLTS_TRIAL_COMMAND_FORMAT` with every line
 *     break removed.
 *  3. POST a `DLTSJobConfig` to the cluster's `/jobs` endpoint and store the
 *     returned `jobId` on the trial detail.
 *
 * NOTE(review): the response is not checked for a missing `jobId`; if the
 * server answers with an error body, `dltsJobId` stays undefined while this
 * method still returns true — confirm the DLTS error response shape.
 *
 * @param trialJobId id of a trial already present in `trialJobsMap`
 * @returns `true` once the request has completed
 * @throws Error when the trial, cluster config or trial config is missing;
 *         file-system and request errors propagate
 */
private async submitTrialJobToDLTS(trialJobId: string): Promise<boolean> {
    const jobDetail: DLTSTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
    if (jobDetail === undefined) {
        throw new Error(`Failed to find DLTSTrialJobDetail for job ${trialJobId}`);
    }
    const clusterConfig = this.dltsClusterConfig;
    if (clusterConfig === undefined) {
        throw new Error('DLTS Cluster config is not initialized');
    }
    const trialConfig = this.dltsTrialConfig;
    if (trialConfig === undefined) {
        throw new Error('trial config is not initialized');
    }
    // Look the REST server port up only when it has not been cached yet.
    if (this.dltsRestServerPort === undefined) {
        const jobRestServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
        this.dltsRestServerPort = jobRestServer.clusterRestServerPort;
    }

    // Step 1. Prepare DLTS job configuration
    const localTrialDir = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
    // Create the temporary local working folder for this trial
    await execMkdir(localTrialDir);

    // Write the NNI installation script into the local folder
    const installScript: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
    const installScriptPath = path.join(localTrialDir, 'install_nni.sh');
    await fs.promises.writeFile(installScriptPath, installScript, { encoding: 'utf8' });

    // Write the hyper-parameter file (parameter.cfg) into the local folder
    if (jobDetail.form !== undefined) {
        const paramFilePath = path.join(
            localTrialDir,
            generateParamFileName(jobDetail.form.hyperParameters)
        );
        await fs.promises.writeFile(
            paramFilePath,
            jobDetail.form.hyperParameters.value,
            { encoding: 'utf8' }
        );
    }

    // Prefer an explicitly configured manager IP over the REST server host.
    let nniManagerIp: string;
    if (this.nniManagerIpConfig) {
        nniManagerIp = this.nniManagerIpConfig.nniManagerIp;
    } else {
        nniManagerIp = this.dltsRestServerHost;
    }
    const version: string = this.versionCheck ? await getVersion() : '';

    const formattedCommand: string = String.Format(
        DLTS_TRIAL_COMMAND_FORMAT,
        localTrialDir,
        path.join(localTrialDir, 'nnioutput'),
        trialJobId,
        this.experimentId,
        jobDetail.form.sequenceId,
        false,
        trialConfig.codeDir,
        trialConfig.command,
        nniManagerIp,
        this.dltsRestServerPort,
        version,
        this.logCollection
    );
    // Collapse the command onto a single line
    const trialCommand: string = formattedCommand.replace(/\r\n|\n|\r/gm, '');

    // Step 2. Submit DLTS job via Rest call
    const jobConfig: DLTSJobConfig = new DLTSJobConfig(
        clusterConfig,
        jobDetail.dltsJobName,
        trialConfig.gpuNum,
        trialConfig.image,
        trialCommand,
        []
    );
    const submitRequest: request.Options = {
        method: 'POST',
        uri: `${clusterConfig.dashboard}api/clusters/${clusterConfig.cluster}/jobs`,
        qs: {
            email: clusterConfig.email,
            password: clusterConfig.password
        },
        body: jobConfig,
        json: true
    };
    const submitResponse = await new Promise<any>((resolve, reject) => {
        request(submitRequest, (requestError, _response, payload) => {
            if (requestError) {
                reject(requestError);
            } else {
                resolve(payload);
            }
        });
    });

    jobDetail.dltsJobId = submitResponse['jobId'];
    return true;
}
/**
 * Delivers a new set of hyper-parameters to an existing trial.
 *
 * The parameters are written to the trial's local folder
 * (`<experiment root>/trials-local/<trialJobId>`), then the job REST server's
 * `/parameter-file-meta` endpoint is POSTed the experiment and trial ids.
 *
 * @param trialJobId id of a trial present in `trialJobsMap`
 * @param form form carrying the new `hyperParameters`
 * @returns the existing trial job detail (returned as stored)
 * @throws Error when the trial, cluster config or trial config is missing;
 *         file-system and request errors propagate
 */
public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
    const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId);
    if (trialJobDetail === undefined) {
        throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
    }
    if (this.dltsClusterConfig === undefined) {
        throw new Error('DLTS Cluster config is not initialized');
    }
    if (this.dltsTrialConfig === undefined) {
        throw new Error('DLTS trial config is not initialized');
    }

    const hyperParameters = form.hyperParameters;
    const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
    const hpFileName: string = generateParamFileName(hyperParameters);
    const localFilepath: string = path.join(trialLocalTempFolder, hpFileName);
    await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' });

    const parameterFileMeta = {
        experimentId: this.experimentId,
        trialId: trialJobId
    };
    const restServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
    const req: request.Options = {
        uri: `${restServer.endPoint}${restServer.apiRootUrl}/parameter-file-meta`,
        method: 'POST',
        json: true,
        body: parameterFileMeta
    };

    // The explicit <void> is required: since TypeScript 4.1 `resolve` no longer
    // has an optional parameter, so a bare `resolve()` on an untyped Promise
    // does not compile.
    await new Promise<void>((resolve, reject) => {
        request(req, (err: Error, _res: request.Response) => {
            if (err) {
                reject(err);
            } else {
                resolve();
            }
        });
    });

    return trialJobDetail;
}
/** Multi-phase jobs are reported as unsupported: this getter always yields `false`. */
public get isMultiPhaseJobSupported(): boolean {
    const supported = false;
    return supported;
}
/**
 * Not supported by this training service.
 *
 * The method is not `async`, so the error is thrown synchronously rather than
 * delivered through the returned promise.
 *
 * @throws MethodNotImplementedError always
 */
public getTrialOutputLocalPath(_trialJobId: string): Promise<string> {
    const unsupported = new MethodNotImplementedError();
    throw unsupported;
}
/**
 * Not supported by this training service.
 *
 * The method is not `async`, so the error is thrown synchronously rather than
 * delivered through the returned promise.
 *
 * @throws MethodNotImplementedError always
 */
public fetchTrialOutput(_trialJobId: string, _subpath: string): Promise<void> {
    const unsupported = new MethodNotImplementedError();
    throw unsupported;
}
}
export
{
DLTSTrainingService
};
ts/nni_manager/training_service/dlts/dltsTrialConfig.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
TrialConfig
}
from
"
training_service/common/trialConfig
"
;
/**
 * Trial configuration for DLTS: the common `TrialConfig` fields plus an
 * `image` string.
 */
export class DLTSTrialConfig extends TrialConfig {
    // Image passed to DLTSJobConfig when the job is created
    public readonly image: string;

    /**
     * @param command forwarded to `TrialConfig`
     * @param codeDir forwarded to `TrialConfig`
     * @param gpuNum forwarded to `TrialConfig`
     * @param image stored on this instance as `image`
     */
    public constructor(command: string, codeDir: string, gpuNum: number, image: string) {
        super(command, codeDir, gpuNum);
        this.image = image;
    }
}
ts/nni_manager/training_service/dlts/dltsTrialJobDetail.ts
deleted
100644 → 0
View file @
b2225436
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
TrialJobDetail
,
TrialJobStatus
,
TrialJobApplicationForm
}
from
"
../../common/trainingService
"
;
/**
 * Trial job record kept by the DLTS training service: the generic
 * `TrialJobDetail` fields plus DLTS-specific bookkeeping.
 */
export class DLTSTrialJobDetail implements TrialJobDetail {
    // Optional generic fields
    public startTime?: number;
    public endTime?: number;
    public tags?: string[];
    public url?: string;
    // Cancellation source recorded by cancelTrialJob
    public isEarlyStopped?: boolean;

    // DLTS-specific fields
    // Job id taken from the `jobId` field of the submit response
    public dltsJobId?: string;
    // Starts false; set to true while DLTS reports the job as pausing/paused
    public dltsPaused: boolean = false;

    /**
     * @param id trial job id
     * @param status current trial status
     * @param submitTime submission timestamp (callers pass `Date.now()`)
     * @param workingDirectory working directory recorded for the trial
     * @param form application form the trial was created from
     * @param dltsJobName name under which the job is submitted to DLTS
     */
    public constructor(
        public id: string,
        public status: TrialJobStatus,
        public submitTime: number,
        public workingDirectory: string,
        public form: TrialJobApplicationForm,

        // DLTS-specific
        public dltsJobName: string
    ) { }
}
ts/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
ef15fc81
...
@@ -277,7 +277,7 @@ class ShellExecutor {
...
@@ -277,7 +277,7 @@ class ShellExecutor {
this
.
log
.
debug
(
`copyFileToRemote(
${
commandIndex
}
): localFilePath:
${
localFilePath
}
, remoteFilePath:
${
remoteFilePath
}
`
);
this
.
log
.
debug
(
`copyFileToRemote(
${
commandIndex
}
): localFilePath:
${
localFilePath
}
, remoteFilePath:
${
remoteFilePath
}
`
);
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
this
.
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
this
.
sshClient
.
sftp
((
err
:
Error
|
undefined
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
this
.
log
.
error
(
`copyFileToRemote(
${
commandIndex
}
):
${
err
}
`
);
this
.
log
.
error
(
`copyFileToRemote(
${
commandIndex
}
):
${
err
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
...
@@ -328,7 +328,7 @@ class ShellExecutor {
...
@@ -328,7 +328,7 @@ class ShellExecutor {
const
commandIndex
=
randomInt
(
10000
);
const
commandIndex
=
randomInt
(
10000
);
this
.
log
.
debug
(
`getRemoteFileContent(
${
commandIndex
}
): filePath:
${
filePath
}
`
);
this
.
log
.
debug
(
`getRemoteFileContent(
${
commandIndex
}
): filePath:
${
filePath
}
`
);
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
this
.
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
this
.
sshClient
.
sftp
((
err
:
Error
|
undefined
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
this
.
log
.
error
(
`getRemoteFileContent(
${
commandIndex
}
) sftp:
${
err
}
`
);
this
.
log
.
error
(
`getRemoteFileContent(
${
commandIndex
}
) sftp:
${
err
}
`
);
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
}
`
));
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
}
`
));
...
@@ -376,7 +376,7 @@ class ShellExecutor {
...
@@ -376,7 +376,7 @@ class ShellExecutor {
// Windows always uses shell, and it needs to disable to get it works.
// Windows always uses shell, and it needs to disable to get it works.
useShell
=
useShell
&&
!
this
.
isWindows
;
useShell
=
useShell
&&
!
this
.
isWindows
;
const
callback
=
(
err
:
Error
,
channel
:
ClientChannel
):
void
=>
{
const
callback
=
(
err
:
Error
|
undefined
,
channel
:
ClientChannel
):
void
=>
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
this
.
log
.
error
(
`remoteExeCommand(
${
commandIndex
}
):
${
err
.
message
}
`
);
this
.
log
.
error
(
`remoteExeCommand(
${
commandIndex
}
):
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
...
...
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
View file @
ef15fc81
...
@@ -310,7 +310,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -310,7 +310,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
}
}
}
}
return
yaml
.
safeD
ump
(
nniJobConfig
);
return
yaml
.
d
ump
(
nniJobConfig
);
}
}
protected
formatPAIHost
(
host
:
string
):
string
{
protected
formatPAIHost
(
host
:
string
):
string
{
...
...
ts/nni_manager/training_service/reusable/storageService.ts
View file @
ef15fc81
...
@@ -16,14 +16,14 @@ export abstract class StorageService {
...
@@ -16,14 +16,14 @@ export abstract class StorageService {
protected
logger
:
Logger
;
protected
logger
:
Logger
;
protected
abstract
internalConfig
(
key
:
string
,
value
:
string
):
void
;
protected
abstract
internalConfig
(
key
:
string
,
value
:
string
):
void
;
protected
abstract
async
internalRemove
(
remotePath
:
string
,
isDirectory
:
boolean
,
isRecursive
:
boolean
):
Promise
<
void
>
;
protected
abstract
internalRemove
(
remotePath
:
string
,
isDirectory
:
boolean
,
isRecursive
:
boolean
):
Promise
<
void
>
;
protected
abstract
async
internalRename
(
remotePath
:
string
,
newName
:
string
):
Promise
<
void
>
;
protected
abstract
internalRename
(
remotePath
:
string
,
newName
:
string
):
Promise
<
void
>
;
protected
abstract
async
internalMkdir
(
remotePath
:
string
):
Promise
<
void
>
;
protected
abstract
internalMkdir
(
remotePath
:
string
):
Promise
<
void
>
;
protected
abstract
async
internalCopy
(
sourcePath
:
string
,
targetPath
:
string
,
isDirectory
:
boolean
,
isFromRemote
:
boolean
,
isToRemote
:
boolean
):
Promise
<
string
>
;
protected
abstract
internalCopy
(
sourcePath
:
string
,
targetPath
:
string
,
isDirectory
:
boolean
,
isFromRemote
:
boolean
,
isToRemote
:
boolean
):
Promise
<
string
>
;
protected
abstract
async
internalExists
(
remotePath
:
string
):
Promise
<
boolean
>
;
protected
abstract
internalExists
(
remotePath
:
string
):
Promise
<
boolean
>
;
protected
abstract
async
internalRead
(
remotePath
:
string
,
offset
:
number
,
length
:
number
):
Promise
<
string
>
;
protected
abstract
internalRead
(
remotePath
:
string
,
offset
:
number
,
length
:
number
):
Promise
<
string
>
;
protected
abstract
async
internalList
(
remotePath
:
string
):
Promise
<
string
[]
>
;
protected
abstract
internalList
(
remotePath
:
string
):
Promise
<
string
[]
>
;
protected
abstract
async
internalAttach
(
remotePath
:
string
,
content
:
string
):
Promise
<
boolean
>
;
protected
abstract
internalAttach
(
remotePath
:
string
,
content
:
string
):
Promise
<
boolean
>
;
protected
abstract
internalIsRelativePath
(
path
:
string
):
boolean
;
protected
abstract
internalIsRelativePath
(
path
:
string
):
boolean
;
protected
abstract
internalJoin
(...
paths
:
string
[]):
string
;
protected
abstract
internalJoin
(...
paths
:
string
[]):
string
;
protected
abstract
internalDirname
(...
paths
:
string
[]):
string
;
protected
abstract
internalDirname
(...
paths
:
string
[]):
string
;
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment