Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ba8dccd6
Commit
ba8dccd6
authored
Jun 23, 2019
by
suiguoxin
Browse files
Merge branch 'master' of
https://github.com/microsoft/nni
parents
56a1575b
150ee83a
Changes
198
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
147 additions
and
107 deletions
+147
-107
examples/tuners/ga_customer_tuner/README.md
examples/tuners/ga_customer_tuner/README.md
+2
-2
examples/tuners/weight_sharing/ga_customer_tuner/README.md
examples/tuners/weight_sharing/ga_customer_tuner/README.md
+2
-2
src/nni_manager/common/restServer.ts
src/nni_manager/common/restServer.ts
+5
-5
src/nni_manager/common/trainingService.ts
src/nni_manager/common/trainingService.ts
+1
-2
src/nni_manager/common/utils.ts
src/nni_manager/common/utils.ts
+41
-7
src/nni_manager/config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
...ig/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
+8
-8
src/nni_manager/config/kubeflow/pytorchjob-crd-v1alpha2.json
src/nni_manager/config/kubeflow/pytorchjob-crd-v1alpha2.json
+8
-8
src/nni_manager/config/kubeflow/pytorchjob-crd-v1beta1.json
src/nni_manager/config/kubeflow/pytorchjob-crd-v1beta1.json
+8
-8
src/nni_manager/config/kubeflow/tfjob-crd-v1alpha2.json
src/nni_manager/config/kubeflow/tfjob-crd-v1alpha2.json
+8
-8
src/nni_manager/config/kubeflow/tfjob-crd-v1beta1.json
src/nni_manager/config/kubeflow/tfjob-crd-v1beta1.json
+8
-8
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+2
-2
src/nni_manager/core/test/ipcInterface.test.ts
src/nni_manager/core/test/ipcInterface.test.ts
+3
-3
src/nni_manager/core/test/mockedTrainingService.ts
src/nni_manager/core/test/mockedTrainingService.ts
+2
-2
src/nni_manager/core/test/nnimanager.test.ts
src/nni_manager/core/test/nnimanager.test.ts
+1
-1
src/nni_manager/package.json
src/nni_manager/package.json
+1
-0
src/nni_manager/rest_server/nniRestServer.ts
src/nni_manager/rest_server/nniRestServer.ts
+1
-1
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+1
-1
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+1
-1
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+41
-35
src/nni_manager/training_service/common/containerJobData.ts
src/nni_manager/training_service/common/containerJobData.ts
+3
-3
No files found.
examples/tuners/ga_customer_tuner/README.md
View file @
ba8dccd6
examples/tuners/weight_sharing/ga_customer_tuner/README.md
View file @
ba8dccd6
src/nni_manager/common/restServer.ts
View file @
ba8dccd6
src/nni_manager/common/trainingService.ts
View file @
ba8dccd6
...
@@ -91,6 +91,7 @@ interface TrialJobMetric {
...
@@ -91,6 +91,7 @@ interface TrialJobMetric {
* define TrainingServiceError
* define TrainingServiceError
*/
*/
class
TrainingServiceError
extends
Error
{
class
TrainingServiceError
extends
Error
{
private
errCode
:
number
;
private
errCode
:
number
;
constructor
(
errorCode
:
number
,
errorMessage
:
string
)
{
constructor
(
errorCode
:
number
,
errorMessage
:
string
)
{
...
@@ -136,5 +137,3 @@ export {
...
@@ -136,5 +137,3 @@ export {
TrainingServiceMetadata
,
TrialJobDetail
,
TrialJobMetric
,
HyperParameters
,
TrainingServiceMetadata
,
TrialJobDetail
,
TrialJobMetric
,
HyperParameters
,
HostJobApplicationForm
,
JobApplicationForm
,
JobType
,
NNIManagerIpConfig
HostJobApplicationForm
,
JobApplicationForm
,
JobType
,
NNIManagerIpConfig
};
};
src/nni_manager/common/utils.ts
View file @
ba8dccd6
...
@@ -374,6 +374,40 @@ function countFilesRecursively(directory: string, timeoutMilliSeconds?: number):
...
@@ -374,6 +374,40 @@ function countFilesRecursively(directory: string, timeoutMilliSeconds?: number):
});
});
}
}
/**
 * Check whether a single file or directory name is made up only of
 * ASCII letters, digits, dots, hyphens and underscores.
 *
 * The hyphen is placed last inside the character class so that it is a
 * literal character; written as `.-_` it would form a range from '.' to '_'
 * that accepts characters such as '/' and ':' while rejecting '-'.
 *
 * @param fileName the bare name to check (not a full path)
 * @returns true when the name is non-empty and contains only allowed characters
 */
function validateFileName(fileName: string): boolean {
    const pattern: RegExp = /^[a-zA-Z0-9._-]+$/;

    return pattern.test(fileName);
}
/**
 * Recursively validate the name of every file and sub-directory under `directory`.
 *
 * Each entry name is checked with validateFileName. Entries that lstat reports
 * as directories are descended into only after their own name has passed.
 *
 * @param directory path of the directory whose entries are validated
 * @returns a promise resolved with true when every entry name is valid
 * @throws Error (as a rejected promise) when the directory does not exist,
 *         when an entry name is invalid, or when a file-system call fails
 */
async function validateFileNameRecursively(directory: string): Promise<boolean> {
    if (!fs.existsSync(directory)) {
        throw new Error(`Directory ${directory} doesn't exist`);
    }

    for (const name of fs.readdirSync(directory)) {
        const fullFilePath: string = path.join(directory, name);
        // validate file names and directory names
        if (!validateFileName(name)) {
            throw new Error(`file name in ${fullFilePath} is not valid!`);
        }
        if (fs.lstatSync(fullFilePath).isDirectory()) {
            // A failure deeper in the tree rejects this call as well.
            await validateFileNameRecursively(fullFilePath);
        }
    }

    return true;
}
/**
/**
* get the version of current package
* get the version of current package
*/
*/
...
@@ -474,6 +508,6 @@ function unixPathJoin(...paths: any[]): string {
...
@@ -474,6 +508,6 @@ function unixPathJoin(...paths: any[]): string {
return
dir
;
return
dir
;
}
}
export
{
countFilesRecursively
,
getRemoteTmpDir
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
export
{
countFilesRecursively
,
validateFileNameRecursively
,
getRemoteTmpDir
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
src/nni_manager/config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
View file @
ba8dccd6
src/nni_manager/config/kubeflow/pytorchjob-crd-v1alpha2.json
View file @
ba8dccd6
src/nni_manager/config/kubeflow/pytorchjob-crd-v1beta1.json
View file @
ba8dccd6
src/nni_manager/config/kubeflow/tfjob-crd-v1alpha2.json
View file @
ba8dccd6
src/nni_manager/config/kubeflow/tfjob-crd-v1beta1.json
View file @
ba8dccd6
src/nni_manager/core/nnimanager.ts
View file @
ba8dccd6
src/nni_manager/core/test/ipcInterface.test.ts
View file @
ba8dccd6
src/nni_manager/core/test/mockedTrainingService.ts
View file @
ba8dccd6
src/nni_manager/core/test/nnimanager.test.ts
View file @
ba8dccd6
src/nni_manager/package.json
View file @
ba8dccd6
...
@@ -33,6 +33,7 @@
...
@@ -33,6 +33,7 @@
"@types/chai-as-promised"
:
"^7.1.0"
,
"@types/chai-as-promised"
:
"^7.1.0"
,
"@types/express"
:
"^4.16.0"
,
"@types/express"
:
"^4.16.0"
,
"@types/glob"
:
"^7.1.1"
,
"@types/glob"
:
"^7.1.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/node"
:
"10.12.18"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
"@types/request"
:
"^2.47.1"
,
...
...
src/nni_manager/rest_server/nniRestServer.ts
View file @
ba8dccd6
src/nni_manager/rest_server/restHandler.ts
View file @
ba8dccd6
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
ba8dccd6
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
ba8dccd6
...
@@ -20,22 +20,24 @@
...
@@ -20,22 +20,24 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
Request
,
Response
,
Router
}
from
'
express
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
bodyParser
from
'
body-parser
'
;
import
*
as
bodyParser
from
'
body-parser
'
;
import
{
Request
,
Response
,
Router
}
from
'
express
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Writable
}
from
'
stream
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
fs
from
'
fs
'
import
*
as
path
from
'
path
'
import
{
getBasePort
,
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getBasePort
,
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
RestServer
}
from
'
../../common/restServer
'
import
{
RestServer
}
from
'
../../common/restServer
'
;
import
{
getLogDir
}
from
'
../../common/utils
'
;
import
{
getLogDir
}
from
'
../../common/utils
'
;
import
{
Writable
}
from
'
stream
'
;
/**
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
abstract
class
ClusterJobRestServer
extends
RestServer
{
export
abstract
class
ClusterJobRestServer
extends
RestServer
{
private
readonly
API_ROOT_URL
:
string
=
'
/api/v1/nni-pai
'
;
private
readonly
API_ROOT_URL
:
string
=
'
/api/v1/nni-pai
'
;
private
readonly
NNI_METRICS_PATTERN
:
string
=
`NNISDK_MEb'(?<metrics>.*?)'`
;
private
readonly
NNI_METRICS_PATTERN
:
string
=
`NNISDK_MEb'(?<metrics>.*?)'`
;
...
@@ -51,19 +53,20 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -51,19 +53,20 @@ export abstract class ClusterJobRestServer extends RestServer{
constructor
()
{
constructor
()
{
super
();
super
();
const
basePort
:
number
=
getBasePort
();
const
basePort
:
number
=
getBasePort
();
assert
(
basePort
&&
basePort
>
1024
);
assert
(
basePort
!==
undefined
&&
basePort
>
1024
);
this
.
port
=
basePort
+
1
;
this
.
port
=
basePort
+
1
;
}
}
public
get
clusterRestServerPort
():
number
{
public
get
clusterRestServerPort
():
number
{
if
(
!
this
.
port
)
{
if
(
this
.
port
===
undefined
)
{
throw
new
Error
(
'
PAI Rest server port is undefined
'
);
throw
new
Error
(
'
PAI Rest server port is undefined
'
);
}
}
return
this
.
port
;
return
this
.
port
;
}
}
public
get
getErrorMessage
():
string
|
undefined
{
public
get
getErrorMessage
():
string
|
undefined
{
return
this
.
errorMessage
;
return
this
.
errorMessage
;
}
}
...
@@ -79,11 +82,15 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -79,11 +82,15 @@ export abstract class ClusterJobRestServer extends RestServer{
this
.
app
.
use
(
this
.
API_ROOT_URL
,
this
.
createRestHandler
());
this
.
app
.
use
(
this
.
API_ROOT_URL
,
this
.
createRestHandler
());
}
}
// Abstract method to handle trial metrics data
// tslint:disable-next-line:no-any
protected
abstract
handleTrialMetrics
(
jobId
:
string
,
trialMetrics
:
any
[])
:
void
;
// tslint:disable: no-unsafe-any no-any
private
createRestHandler
()
:
Router
{
private
createRestHandler
()
:
Router
{
const
router
:
Router
=
Router
();
const
router
:
Router
=
Router
();
// tslint:disable-next-line:typedef
router
.
use
((
req
:
Request
,
res
:
Response
,
next
:
any
)
=>
{
router
.
use
((
req
:
Request
,
res
:
Response
,
next
)
=>
{
this
.
log
.
info
(
`
${
req
.
method
}
:
${
req
.
url
}
: body:\n
${
JSON
.
stringify
(
req
.
body
,
undefined
,
4
)}
`
);
this
.
log
.
info
(
`
${
req
.
method
}
:
${
req
.
url
}
: body:\n
${
JSON
.
stringify
(
req
.
body
,
undefined
,
4
)}
`
);
res
.
setHeader
(
'
Content-Type
'
,
'
application/json
'
);
res
.
setHeader
(
'
Content-Type
'
,
'
application/json
'
);
next
();
next
();
...
@@ -92,7 +99,7 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -92,7 +99,7 @@ export abstract class ClusterJobRestServer extends RestServer{
router
.
post
(
`/version/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
post
(
`/version/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
this
.
enableVersionCheck
)
{
if
(
this
.
enableVersionCheck
)
{
try
{
try
{
const
checkResultSuccess
:
boolean
=
req
.
body
.
tag
===
'
VCSuccess
'
?
true
:
false
;
const
checkResultSuccess
:
boolean
=
req
.
body
.
tag
===
'
VCSuccess
'
?
true
:
false
;
if
(
this
.
versionCheckSuccess
!==
undefined
&&
this
.
versionCheckSuccess
!==
checkResultSuccess
)
{
if
(
this
.
versionCheckSuccess
!==
undefined
&&
this
.
versionCheckSuccess
!==
checkResultSuccess
)
{
this
.
errorMessage
=
'
Version check error, version check result is inconsistent!
'
;
this
.
errorMessage
=
'
Version check error, version check result is inconsistent!
'
;
this
.
log
.
error
(
this
.
errorMessage
);
this
.
log
.
error
(
this
.
errorMessage
);
...
@@ -103,7 +110,7 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -103,7 +110,7 @@ export abstract class ClusterJobRestServer extends RestServer{
this
.
versionCheckSuccess
=
false
;
this
.
versionCheckSuccess
=
false
;
this
.
errorMessage
=
req
.
body
.
msg
;
this
.
errorMessage
=
req
.
body
.
msg
;
}
}
}
catch
(
err
)
{
}
catch
(
err
)
{
this
.
log
.
error
(
`json parse metrics error:
${
err
}
`
);
this
.
log
.
error
(
`json parse metrics error:
${
err
}
`
);
res
.
status
(
500
);
res
.
status
(
500
);
res
.
send
(
err
.
message
);
res
.
send
(
err
.
message
);
...
@@ -122,8 +129,7 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -122,8 +129,7 @@ export abstract class ClusterJobRestServer extends RestServer{
this
.
handleTrialMetrics
(
req
.
body
.
jobId
,
req
.
body
.
metrics
);
this
.
handleTrialMetrics
(
req
.
body
.
jobId
,
req
.
body
.
metrics
);
res
.
send
();
res
.
send
();
}
}
catch
(
err
)
{
catch
(
err
)
{
this
.
log
.
error
(
`json parse metrics error:
${
err
}
`
);
this
.
log
.
error
(
`json parse metrics error:
${
err
}
`
);
res
.
status
(
500
);
res
.
status
(
500
);
res
.
send
(
err
.
message
);
res
.
send
(
err
.
message
);
...
@@ -131,35 +137,37 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -131,35 +137,37 @@ export abstract class ClusterJobRestServer extends RestServer{
});
});
router
.
post
(
`/stdout/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
post
(
`/stdout/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
this
.
enableVersionCheck
&&
!
this
.
versionCheckSuccess
&&
!
this
.
errorMessage
)
{
if
(
this
.
enableVersionCheck
&&
(
this
.
versionCheckSuccess
===
undefined
||
!
this
.
versionCheckSuccess
)
this
.
errorMessage
=
`Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
&&
this
.
errorMessage
===
undefined
)
{
+
`NNIManager and TrialKeeper!`
this
.
errorMessage
=
`Version check failed, didn't get version check response from trialKeeper,`
+
` please check your NNI version in NNIManager and TrialKeeper!`
;
}
}
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
try
{
try
{
let
skipLogging
:
boolean
=
false
;
let
skipLogging
:
boolean
=
false
;
if
(
req
.
body
.
tag
===
'
trial
'
&&
req
.
body
.
msg
!==
undefined
)
{
if
(
req
.
body
.
tag
===
'
trial
'
&&
req
.
body
.
msg
!==
undefined
)
{
const
metricsContent
=
req
.
body
.
msg
.
match
(
this
.
NNI_METRICS_PATTERN
);
const
metricsContent
:
any
=
req
.
body
.
msg
.
match
(
this
.
NNI_METRICS_PATTERN
);
if
(
metricsContent
&&
metricsContent
.
groups
)
{
if
(
metricsContent
&&
metricsContent
.
groups
)
{
this
.
handleTrialMetrics
(
req
.
params
.
trialId
,
[
metricsContent
.
groups
[
'
metrics
'
]]);
const
key
:
string
=
'
metrics
'
;
this
.
handleTrialMetrics
(
req
.
params
.
trialId
,
[
metricsContent
.
groups
[
key
]]);
skipLogging
=
true
;
skipLogging
=
true
;
}
}
}
}
if
(
!
skipLogging
){
if
(
!
skipLogging
)
{
// Construct write stream to write remote trial's log into local file
// Construct write stream to write remote trial's log into local file
// tslint:disable-next-line:non-literal-fs-path
const
writeStream
:
Writable
=
fs
.
createWriteStream
(
trialLogPath
,
{
const
writeStream
:
Writable
=
fs
.
createWriteStream
(
trialLogPath
,
{
flags
:
'
a+
'
,
flags
:
'
a+
'
,
encoding
:
'
utf8
'
,
encoding
:
'
utf8
'
,
autoClose
:
true
autoClose
:
true
});
});
writeStream
.
write
(
req
.
body
.
msg
+
'
\n
'
);
writeStream
.
write
(
String
.
Format
(
'
{0}
\n
'
,
req
.
body
.
msg
)
);
writeStream
.
end
();
writeStream
.
end
();
}
}
res
.
send
();
res
.
send
();
}
}
catch
(
err
)
{
catch
(
err
)
{
this
.
log
.
error
(
`json parse stdout data error:
${
err
}
`
);
this
.
log
.
error
(
`json parse stdout data error:
${
err
}
`
);
res
.
status
(
500
);
res
.
status
(
500
);
res
.
send
(
err
.
message
);
res
.
send
(
err
.
message
);
...
@@ -168,7 +176,5 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -168,7 +176,5 @@ export abstract class ClusterJobRestServer extends RestServer{
return
router
;
return
router
;
}
}
// tslint:enable: no-unsafe-any no-any
/** Abstract method to handle trial metrics data */
protected
abstract
handleTrialMetrics
(
jobId
:
string
,
trialMetrics
:
any
[])
:
void
;
}
}
src/nni_manager/training_service/common/containerJobData.ts
View file @
ba8dccd6
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment