Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
10c177c2
Unverified
Commit
10c177c2
authored
Aug 12, 2020
by
Junwei Sun
Committed by
GitHub
Aug 12, 2020
Browse files
support display trial log on local mode (#2718)
parent
e2a86899
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
147 additions
and
20 deletions
+147
-20
src/nni_manager/common/manager.ts
src/nni_manager/common/manager.ts
+3
-1
src/nni_manager/common/trainingService.ts
src/nni_manager/common/trainingService.ts
+4
-1
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+5
-1
src/nni_manager/core/test/mockedTrainingService.ts
src/nni_manager/core/test/mockedTrainingService.ts
+5
-1
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+14
-0
src/nni_manager/rest_server/test/mockedNNIManager.ts
src/nni_manager/rest_server/test/mockedNNIManager.ts
+4
-1
src/nni_manager/training_service/dlts/dltsTrainingService.ts
src/nni_manager/training_service/dlts/dltsTrainingService.ts
+6
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+6
-1
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+15
-3
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+6
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+11
-2
src/nni_manager/training_service/reusable/routerTrainingService.ts
...anager/training_service/reusable/routerTrainingService.ts
+6
-1
src/nni_manager/training_service/reusable/trialDispatcher.ts
src/nni_manager/training_service/reusable/trialDispatcher.ts
+6
-2
src/nni_manager/training_service/test/localTrainingService.test.ts
...anager/training_service/test/localTrainingService.test.ts
+33
-3
src/webui/src/components/public-child/OpenRow.tsx
src/webui/src/components/public-child/OpenRow.tsx
+23
-1
No files found.
src/nni_manager/common/manager.ts
View file @
10c177c2
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
'
use strict
'
;
'
use strict
'
;
import
{
MetricDataRecord
,
MetricType
,
TrialJobInfo
}
from
'
./datastore
'
;
import
{
MetricDataRecord
,
MetricType
,
TrialJobInfo
}
from
'
./datastore
'
;
import
{
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
TrialJobStatus
,
LogType
}
from
'
./trainingService
'
;
type
ProfileUpdateType
=
'
TRIAL_CONCURRENCY
'
|
'
MAX_EXEC_DURATION
'
|
'
SEARCH_SPACE
'
|
'
MAX_TRIAL_NUM
'
;
type
ProfileUpdateType
=
'
TRIAL_CONCURRENCY
'
|
'
MAX_EXEC_DURATION
'
|
'
SEARCH_SPACE
'
|
'
MAX_TRIAL_NUM
'
;
type
ExperimentStatus
=
'
INITIALIZED
'
|
'
RUNNING
'
|
'
ERROR
'
|
'
STOPPING
'
|
'
STOPPED
'
|
'
DONE
'
|
'
NO_MORE_TRIAL
'
|
'
TUNER_NO_MORE_TRIAL
'
;
type
ExperimentStatus
=
'
INITIALIZED
'
|
'
RUNNING
'
|
'
ERROR
'
|
'
STOPPING
'
|
'
STOPPED
'
|
'
DONE
'
|
'
NO_MORE_TRIAL
'
|
'
TUNER_NO_MORE_TRIAL
'
;
...
@@ -101,6 +101,8 @@ abstract class Manager {
...
@@ -101,6 +101,8 @@ abstract class Manager {
public
abstract
getMetricDataByRange
(
minSeqId
:
number
,
maxSeqId
:
number
):
Promise
<
MetricDataRecord
[]
>
;
public
abstract
getMetricDataByRange
(
minSeqId
:
number
,
maxSeqId
:
number
):
Promise
<
MetricDataRecord
[]
>
;
public
abstract
getLatestMetricData
():
Promise
<
MetricDataRecord
[]
>
;
public
abstract
getLatestMetricData
():
Promise
<
MetricDataRecord
[]
>
;
/**
 * Fetch the log of one trial job.
 *
 * @param trialJobId ID of the trial job whose log is requested
 * @param logType which log to fetch (see LogType)
 * @returns a promise resolving to the log as a string
 */
public abstract getTrialLog(
    trialJobId: string,
    logType: LogType
): Promise<string>;
public
abstract
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
;
public
abstract
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
;
public
abstract
getStatus
():
NNIManagerStatus
;
public
abstract
getStatus
():
NNIManagerStatus
;
}
}
...
...
src/nni_manager/common/trainingService.ts
View file @
10c177c2
...
@@ -8,6 +8,8 @@
...
@@ -8,6 +8,8 @@
*/
*/
type
TrialJobStatus
=
'
UNKNOWN
'
|
'
WAITING
'
|
'
RUNNING
'
|
'
SUCCEEDED
'
|
'
FAILED
'
|
'
USER_CANCELED
'
|
'
SYS_CANCELED
'
|
'
EARLY_STOPPED
'
;
type
TrialJobStatus
=
'
UNKNOWN
'
|
'
WAITING
'
|
'
RUNNING
'
|
'
SUCCEEDED
'
|
'
FAILED
'
|
'
USER_CANCELED
'
|
'
SYS_CANCELED
'
|
'
EARLY_STOPPED
'
;
/**
 * Kinds of trial log that can be requested through getTrialLog().
 */
type LogType =
    | 'TRIAL_LOG'
    | 'TRIAL_ERROR';
interface
TrainingServiceMetadata
{
interface
TrainingServiceMetadata
{
readonly
key
:
string
;
readonly
key
:
string
;
readonly
value
:
string
;
readonly
value
:
string
;
...
@@ -79,6 +81,7 @@ abstract class TrainingService {
...
@@ -79,6 +81,7 @@ abstract class TrainingService {
public
abstract
updateTrialJob
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
;
public
abstract
updateTrialJob
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
;
public
abstract
get
isMultiPhaseJobSupported
():
boolean
;
public
abstract
get
isMultiPhaseJobSupported
():
boolean
;
public
abstract
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
?:
boolean
):
Promise
<
void
>
;
public
abstract
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
?:
boolean
):
Promise
<
void
>
;
/**
 * Fetch the log of one trial job from this training service.
 *
 * @param trialJobId ID of the trial job whose log is requested
 * @param logType which log to fetch (see LogType)
 * @returns a promise resolving to the log as a string
 */
public abstract getTrialLog(
    trialJobId: string,
    logType: LogType
): Promise<string>;
public
abstract
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
;
public
abstract
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
;
public
abstract
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
;
public
abstract
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
;
public
abstract
cleanUp
():
Promise
<
void
>
;
public
abstract
cleanUp
():
Promise
<
void
>
;
...
@@ -98,5 +101,5 @@ class NNIManagerIpConfig {
...
@@ -98,5 +101,5 @@ class NNIManagerIpConfig {
export
{
export
{
TrainingService
,
TrainingServiceError
,
TrialJobStatus
,
TrialJobApplicationForm
,
TrainingService
,
TrainingServiceError
,
TrialJobStatus
,
TrialJobApplicationForm
,
TrainingServiceMetadata
,
TrialJobDetail
,
TrialJobMetric
,
HyperParameters
,
TrainingServiceMetadata
,
TrialJobDetail
,
TrialJobMetric
,
HyperParameters
,
NNIManagerIpConfig
NNIManagerIpConfig
,
LogType
};
};
src/nni_manager/core/nnimanager.ts
View file @
10c177c2
...
@@ -16,7 +16,7 @@ import {
...
@@ -16,7 +16,7 @@ import {
NNIManagerStatus
,
ProfileUpdateType
,
TrialJobStatistics
NNIManagerStatus
,
ProfileUpdateType
,
TrialJobStatistics
}
from
'
../common/manager
'
;
}
from
'
../common/manager
'
;
import
{
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
LogType
}
from
'
../common/trainingService
'
;
}
from
'
../common/trainingService
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
getTunerProc
,
getLogLevel
,
isAlive
,
killPid
}
from
'
../common/utils
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
getTunerProc
,
getLogLevel
,
isAlive
,
killPid
}
from
'
../common/utils
'
;
import
{
import
{
...
@@ -325,6 +325,10 @@ class NNIManager implements Manager {
...
@@ -325,6 +325,10 @@ class NNIManager implements Manager {
// FIXME: unit test
// FIXME: unit test
}
}
/**
 * Fetch a trial's log by delegating to the configured training service.
 *
 * @param trialJobId ID of the trial job whose log is requested
 * @param logType which log to fetch
 * @returns whatever the training service's getTrialLog() resolves to
 */
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    const pendingLog: Promise<string> = this.trainingService.getTrialLog(trialJobId, logType);

    return pendingLog;
}
public
getExperimentProfile
():
Promise
<
ExperimentProfile
>
{
public
getExperimentProfile
():
Promise
<
ExperimentProfile
>
{
// TO DO: using Promise.resolve()
// TO DO: using Promise.resolve()
const
deferred
:
Deferred
<
ExperimentProfile
>
=
new
Deferred
<
ExperimentProfile
>
();
const
deferred
:
Deferred
<
ExperimentProfile
>
=
new
Deferred
<
ExperimentProfile
>
();
...
...
src/nni_manager/core/test/mockedTrainingService.ts
View file @
10c177c2
...
@@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred';
...
@@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred';
import
{
Provider
}
from
'
typescript-ioc
'
;
import
{
Provider
}
from
'
typescript-ioc
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
const
testTrainingServiceProvider
:
Provider
=
{
const
testTrainingServiceProvider
:
Provider
=
{
get
:
()
=>
{
return
new
MockedTrainingService
();
}
get
:
()
=>
{
return
new
MockedTrainingService
();
}
...
@@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService {
...
@@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
 * Not supported by the mocked training service.
 * Throws synchronously (this method is not async), so callers get an
 * exception rather than a rejected promise.
 *
 * @throws MethodNotImplementedError always
 */
public getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
async
run
():
Promise
<
void
>
{
async
run
():
Promise
<
void
>
{
}
}
...
...
src/nni_manager/rest_server/restHandler.ts
View file @
10c177c2
...
@@ -57,6 +57,7 @@ class NNIRestHandler {
...
@@ -57,6 +57,7 @@ class NNIRestHandler {
this
.
getMetricData
(
router
);
this
.
getMetricData
(
router
);
this
.
getMetricDataByRange
(
router
);
this
.
getMetricDataByRange
(
router
);
this
.
getLatestMetricData
(
router
);
this
.
getLatestMetricData
(
router
);
this
.
getTrialLog
(
router
);
this
.
exportData
(
router
);
this
.
exportData
(
router
);
// Express-joi-validator configuration
// Express-joi-validator configuration
...
@@ -268,6 +269,19 @@ class NNIRestHandler {
...
@@ -268,6 +269,19 @@ class NNIRestHandler {
});
});
}
}
/**
 * Register `GET /trial-log/:id/:type`.
 *
 * `:id` and `:type` are passed unchanged to nniManager.getTrialLog(). The
 * log is returned as the response body; an empty log is replaced by the
 * placeholder 'No logs available.'. Failures go through handleError().
 *
 * @param router router on which the route is registered
 */
private getTrialLog(router: Router): void {
    router.get('/trial-log/:id/:type', (req: Request, res: Response) => {
        this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => {
            const body: string = log === '' ? 'No logs available.' : log;
            // res.send() labels a bare string as text/html by default. Trial
            // output is arbitrary text, so declare it as plain text to keep
            // the browser from interpreting markup found in the log.
            res.header('Content-Type', 'text/plain');
            res.send(body);
        }).catch((err: Error) => {
            this.handleError(err, res);
        });
    });
}
private
exportData
(
router
:
Router
):
void
{
private
exportData
(
router
:
Router
):
void
{
router
.
get
(
'
/export-data
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
get
(
'
/export-data
'
,
(
req
:
Request
,
res
:
Response
)
=>
{
this
.
nniManager
.
exportData
().
then
((
exportedData
:
string
)
=>
{
this
.
nniManager
.
exportData
().
then
((
exportedData
:
string
)
=>
{
...
...
src/nni_manager/rest_server/test/mockedNNIManager.ts
View file @
10c177c2
...
@@ -13,7 +13,7 @@ import {
...
@@ -13,7 +13,7 @@ import {
TrialJobStatistics
,
NNIManagerStatus
TrialJobStatistics
,
NNIManagerStatus
}
from
'
../../common/manager
'
;
}
from
'
../../common/manager
'
;
import
{
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
export
const
testManagerProvider
:
Provider
=
{
export
const
testManagerProvider
:
Provider
=
{
...
@@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager {
...
@@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager {
public
getLatestMetricData
():
Promise
<
MetricDataRecord
[]
>
{
public
getLatestMetricData
():
Promise
<
MetricDataRecord
[]
>
{
throw
new
MethodNotImplementedError
();
throw
new
MethodNotImplementedError
();
}
}
/**
 * Not supported by the mocked manager.
 * Throws synchronously (this method is not async), so callers get an
 * exception rather than a rejected promise.
 *
 * @throws MethodNotImplementedError always
 */
public getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
public
getExperimentProfile
():
Promise
<
ExperimentProfile
>
{
public
getExperimentProfile
():
Promise
<
ExperimentProfile
>
{
const
profile
:
ExperimentProfile
=
{
const
profile
:
ExperimentProfile
=
{
params
:
{
params
:
{
...
...
src/nni_manager/training_service/dlts/dltsTrainingService.ts
View file @
10c177c2
...
@@ -12,9 +12,10 @@ import { EventEmitter } from 'events';
...
@@ -12,9 +12,10 @@ import { EventEmitter } from 'events';
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
import
{
NNIManagerIpConfig
,
TrainingService
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
DLTS_TRIAL_COMMAND_FORMAT
}
from
'
./dltsData
'
;
import
{
DLTS_TRIAL_COMMAND_FORMAT
}
from
'
./dltsData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
...
@@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService {
...
@@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService {
return
trialJob
return
trialJob
}
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Because the method is async, the error surfaces as a rejected promise.
 *
 * @param _trialJobId unused
 * @param _logType unused
 * @throws MethodNotImplementedError always
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
10c177c2
...
@@ -12,8 +12,9 @@ import { Base64 } from 'js-base64';
...
@@ -12,8 +12,9 @@ import { Base64 } from 'js-base64';
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
import
{
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
...
@@ -98,6 +99,10 @@ abstract class KubernetesTrainingService {
...
@@ -98,6 +99,10 @@ abstract class KubernetesTrainingService {
return
Promise
.
resolve
(
kubernetesTrialJob
);
return
Promise
.
resolve
(
kubernetesTrialJob
);
}
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Because the method is async, the error surfaces as a rejected promise.
 *
 * @param _trialJobId unused
 * @param _logType unused
 * @throws MethodNotImplementedError always
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
10c177c2
...
@@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
...
@@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
import
{
HyperParameters
,
TrainingService
,
TrialJobApplicationForm
,
HyperParameters
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
getNewLine
,
isAlive
,
uniqueString
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
getNewLine
,
isAlive
,
uniqueString
...
@@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService {
...
@@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService {
return
trialJob
;
return
trialJob
;
}
}
/**
 * Read one of a trial's log files from the local disk.
 *
 * 'TRIAL_LOG' reads `<rootDir>/trials/<trialJobId>/trial.log`;
 * 'TRIAL_ERROR' reads `<rootDir>/trials/<trialJobId>/stderr`.
 *
 * @param trialJobId ID of the trial job; must name a directory directly under `<rootDir>/trials`
 * @param logType which of the two files to read
 * @returns the file content decoded as UTF-8
 * @throws Error 'unexpected log type' for any other logType, or an
 *         'invalid trial job id' error when trialJobId would resolve
 *         outside the trials directory; file-system errors from readFile
 *         reject the returned promise
 */
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    let logFileName: string;
    if (logType === 'TRIAL_LOG') {
        logFileName = 'trial.log';
    } else if (logType === 'TRIAL_ERROR') {
        logFileName = 'stderr';
    } else {
        throw new Error('unexpected log type');
    }

    // The ID is joined into a file-system path and callers may pass a value
    // taken from a request URL, so refuse anything that is not a direct
    // child of the trials directory (e.g. IDs containing '..' segments).
    const trialsDir: string = path.resolve(this.rootDir, 'trials');
    const trialDir: string = path.resolve(trialsDir, trialJobId);
    if (path.dirname(trialDir) !== trialsDir) {
        throw new Error(`invalid trial job id: ${trialJobId}`);
    }

    const logPath: string = path.join(this.rootDir, 'trials', trialJobId, logFileName);
    return fs.promises.readFile(logPath, 'utf8');
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
this
.
eventEmitter
.
on
(
'
metric
'
,
listener
);
this
.
eventEmitter
.
on
(
'
metric
'
,
listener
);
}
}
...
@@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService {
...
@@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService {
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
&&
this
.
jobQueue
.
length
!==
0
)
{
while
(
!
this
.
stopping
&&
this
.
jobQueue
.
length
!==
0
)
{
const
trialJobId
:
string
=
this
.
jobQueue
[
0
];
const
trialJobId
:
string
=
this
.
jobQueue
[
0
];
const
trialJobDe
a
til
:
LocalTrialJobDetail
|
undefined
=
this
.
jobMap
.
get
(
trialJobId
);
const
trialJobDet
a
il
:
LocalTrialJobDetail
|
undefined
=
this
.
jobMap
.
get
(
trialJobId
);
if
(
trialJobDe
a
til
!==
undefined
&&
trialJobDe
a
til
.
status
===
'
WAITING
'
)
{
if
(
trialJobDet
a
il
!==
undefined
&&
trialJobDet
a
il
.
status
===
'
WAITING
'
)
{
const
[
success
,
resource
]
=
this
.
tryGetAvailableResource
();
const
[
success
,
resource
]
=
this
.
tryGetAvailableResource
();
if
(
!
success
)
{
if
(
!
success
)
{
break
;
break
;
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
10c177c2
...
@@ -11,9 +11,10 @@ import { EventEmitter } from 'events';
...
@@ -11,9 +11,10 @@ import { EventEmitter } from 'events';
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
import
{
NNIManagerIpConfig
,
TrainingService
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
...
@@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService {
...
@@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService {
return
jobs
;
return
jobs
;
}
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Because the method is async, the error surfaces as a rejected promise.
 *
 * @param _trialJobId unused
 * @param _logType unused
 * @throws MethodNotImplementedError always
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
10c177c2
...
@@ -10,13 +10,13 @@ import * as path from 'path';
...
@@ -10,13 +10,13 @@ import * as path from 'path';
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
,
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
import
{
HyperParameters
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
HyperParameters
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
...
@@ -180,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -180,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
}
}
/**
* Get trial job log
* @param _trialJobId ID of trial job
* @param _logType 'TRIAL_LOG' | 'TRIAL_ERROR'
*/
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    // Not implemented here; the async wrapper turns this throw into a
    // rejected promise, and both arguments are ignored.
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
/**
/**
* Add job metrics listener
* Add job metrics listener
* @param listener callback listener
* @param listener callback listener
...
...
src/nni_manager/training_service/reusable/routerTrainingService.ts
View file @
10c177c2
...
@@ -6,7 +6,8 @@
...
@@ -6,7 +6,8 @@
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
PAIClusterConfig
}
from
'
../pai/paiConfig
'
;
import
{
PAIClusterConfig
}
from
'
../pai/paiConfig
'
;
...
@@ -47,6 +48,10 @@ class RouterTrainingService implements TrainingService {
...
@@ -47,6 +48,10 @@ class RouterTrainingService implements TrainingService {
return
await
this
.
internalTrainingService
.
getTrialJob
(
trialJobId
);
return
await
this
.
internalTrainingService
.
getTrialJob
(
trialJobId
);
}
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Because the method is async, the error surfaces as a rejected promise.
 *
 * @param _trialJobId unused
 * @param _logType unused
 * @throws MethodNotImplementedError always
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
):
void
{
if
(
this
.
internalTrainingService
===
undefined
)
{
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
TrainingService is not assigned!
"
);
throw
new
Error
(
"
TrainingService is not assigned!
"
);
...
...
src/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
10c177c2
...
@@ -9,10 +9,10 @@ import * as path from 'path';
...
@@ -9,10 +9,10 @@ import * as path from 'path';
import
{
Writable
}
from
'
stream
'
;
import
{
Writable
}
from
'
stream
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
,
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
getBasePort
,
getExperimentId
,
getPlatform
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getBasePort
,
getExperimentId
,
getPlatform
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobMetric
,
TrialJobStatus
,
LogType
}
from
'
../../common/trainingService
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getLogLevel
,
getVersion
,
mkDirPSync
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getLogLevel
,
getVersion
,
mkDirPSync
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
GPU_INFO
,
INITIALIZED
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
REPORT_METRIC_DATA
,
SEND_TRIAL_JOB_PARAMETER
,
STDOUT
,
TRIAL_END
,
VERSION_CHECK
}
from
'
../../core/commands
'
;
import
{
GPU_INFO
,
INITIALIZED
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
REPORT_METRIC_DATA
,
SEND_TRIAL_JOB_PARAMETER
,
STDOUT
,
TRIAL_END
,
VERSION_CHECK
}
from
'
../../core/commands
'
;
import
{
ScheduleResultType
}
from
'
../../training_service/common/gpuData
'
;
import
{
ScheduleResultType
}
from
'
../../training_service/common/gpuData
'
;
...
@@ -111,6 +111,10 @@ class TrialDispatcher implements TrainingService {
...
@@ -111,6 +111,10 @@ class TrialDispatcher implements TrainingService {
return
trial
;
return
trial
;
}
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Because the method is async, the error surfaces as a rejected promise.
 *
 * @param _trialJobId unused
 * @param _logType unused
 * @throws MethodNotImplementedError always
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialDetail
>
{
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialDetail
>
{
if
(
this
.
trialConfig
===
undefined
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
`trialConfig not initialized!`
);
throw
new
Error
(
`trialConfig not initialized!`
);
...
...
src/nni_manager/training_service/test/localTrainingService.test.ts
View file @
10c177c2
...
@@ -3,14 +3,14 @@
...
@@ -3,14 +3,14 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
chai
from
'
chai
'
;
import
*
as
chai
from
'
chai
'
;
import
*
as
chaiAsPromised
from
'
chai-as-promised
'
;
import
*
as
chaiAsPromised
from
'
chai-as-promised
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
tmp
from
'
tmp
'
;
import
*
as
tmp
from
'
tmp
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
,
getExperimentRootDir
}
from
'
../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
LocalTrainingService
}
from
'
../local/localTrainingService
'
;
import
{
LocalTrainingService
}
from
'
../local/localTrainingService
'
;
...
@@ -72,6 +72,36 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -72,6 +72,36 @@ describe('Unit Test for LocalTrainingService', () => {
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
}).
timeout
(
20000
);
}).
timeout
(
20000
);
// Writes fixture log files into the submitted trial's working directory and
// checks that getTrialLog() returns their contents for both log types.
it('Get trial log', async () => {
    await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig);

    // submit job
    const form: TrialJobApplicationForm = {
        sequenceId: 0,
        hyperParameters: {
            value: 'mock hyperparameters',
            index: 0
        }
    };
    const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form);

    // lay out the files getTrialLog() is expected to read
    const rootDir: string = getExperimentRootDir();
    const trialsDir: string = path.join(rootDir, 'trials');
    fs.mkdirSync(trialsDir);
    fs.mkdirSync(jobDetail.workingDirectory);
    const trialLogFile: string = path.join(jobDetail.workingDirectory, 'trial.log');
    fs.writeFileSync(trialLogFile, 'trial log');
    const trialErrFile: string = path.join(jobDetail.workingDirectory, 'stderr');
    fs.writeFileSync(trialErrFile, 'trial stderr');

    // get trial log
    const logText: string = await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG');
    chai.expect(logText).to.be.equals('trial log');
    const errText: string = await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR');
    chai.expect(errText).to.be.equals('trial stderr');

    // remove the fixture files and directories, innermost first
    fs.unlinkSync(trialLogFile);
    fs.unlinkSync(trialErrFile);
    fs.rmdirSync(jobDetail.workingDirectory);
    fs.rmdirSync(trialsDir);

    await localTrainingService.cancelTrialJob(jobDetail.id);
}).timeout(20000);
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
// set meta data
// set meta data
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
...
...
src/webui/src/components/public-child/OpenRow.tsx
View file @
10c177c2
...
@@ -2,6 +2,7 @@ import * as React from 'react';
...
@@ -2,6 +2,7 @@ import * as React from 'react';
import
*
as
copy
from
'
copy-to-clipboard
'
;
import
*
as
copy
from
'
copy-to-clipboard
'
;
import
{
Stack
,
PrimaryButton
,
Pivot
,
PivotItem
}
from
'
office-ui-fabric-react
'
;
import
{
Stack
,
PrimaryButton
,
Pivot
,
PivotItem
}
from
'
office-ui-fabric-react
'
;
import
{
Trial
}
from
'
../../static/model/trial
'
;
import
{
Trial
}
from
'
../../static/model/trial
'
;
import
{
MANAGER_IP
}
from
'
../../static/const
'
;
import
{
EXPERIMENT
,
TRIALS
}
from
'
../../static/datamodel
'
;
import
{
EXPERIMENT
,
TRIALS
}
from
'
../../static/datamodel
'
;
import
JSONTree
from
'
react-json-tree
'
;
import
JSONTree
from
'
react-json-tree
'
;
import
PaiTrialLog
from
'
../public-child/PaiTrialLog
'
;
import
PaiTrialLog
from
'
../public-child/PaiTrialLog
'
;
...
@@ -9,6 +10,7 @@ import TrialLog from '../public-child/TrialLog';
...
@@ -9,6 +10,7 @@ import TrialLog from '../public-child/TrialLog';
import
MessageInfo
from
'
../Modals/MessageInfo
'
;
import
MessageInfo
from
'
../Modals/MessageInfo
'
;
import
'
../../static/style/overview.scss
'
;
import
'
../../static/style/overview.scss
'
;
import
'
../../static/style/copyParameter.scss
'
;
import
'
../../static/style/copyParameter.scss
'
;
import
'
../../static/style/openRow.scss
'
;
interface
OpenRowProps
{
interface
OpenRowProps
{
trialId
:
string
;
trialId
:
string
;
...
@@ -55,6 +57,10 @@ class OpenRow extends React.Component<OpenRowProps, OpenRowState> {
...
@@ -55,6 +57,10 @@ class OpenRow extends React.Component<OpenRowProps, OpenRowState> {
}
}
}
}
/**
 * Open this row's trial log in a new browser window.
 *
 * @param type log kind placed in the last URL segment (the buttons in
 *             render() pass 'TRIAL_LOG' or 'TRIAL_ERROR')
 */
openTrialLog = (type: string): void => {
    const { trialId } = this.props;
    const logUrl = `${MANAGER_IP}/trial-log/${trialId}/${type}`;
    window.open(logUrl);
}
render
():
React
.
ReactNode
{
render
():
React
.
ReactNode
{
const
{
isHidenInfo
,
typeInfo
,
info
}
=
this
.
state
;
const
{
isHidenInfo
,
typeInfo
,
info
}
=
this
.
state
;
const
trialId
=
this
.
props
.
trialId
;
const
trialId
=
this
.
props
.
trialId
;
...
@@ -105,7 +111,23 @@ class OpenRow extends React.Component<OpenRowProps, OpenRowState> {
...
@@ -105,7 +111,23 @@ class OpenRow extends React.Component<OpenRowProps, OpenRowState> {
logCollection
=
{
EXPERIMENT
.
logCollectionEnabled
}
logCollection
=
{
EXPERIMENT
.
logCollectionEnabled
}
/>
/>
:
:
<
div
>
<
TrialLog
logStr
=
{
logPathRow
}
id
=
{
trialId
}
/>
<
TrialLog
logStr
=
{
logPathRow
}
id
=
{
trialId
}
/>
{
/* view each trial log in drawer*/
}
<
div
id
=
"trialog"
>
<
div
className
=
"copy"
style
=
{
{
marginTop
:
15
}
}
>
<
PrimaryButton
onClick
=
{
this
.
openTrialLog
.
bind
(
this
,
'
TRIAL_LOG
'
)
}
text
=
"View trial log"
/>
<
PrimaryButton
onClick
=
{
this
.
openTrialLog
.
bind
(
this
,
'
TRIAL_ERROR
'
)
}
text
=
"View trial error"
styles
=
{
{
root
:
{
marginLeft
:
15
}
}
}
/>
</
div
>
</
div
>
</
div
>
}
}
</
PivotItem
>
</
PivotItem
>
</
Pivot
>
</
Pivot
>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment