Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
817ec68b
Unverified
Commit
817ec68b
authored
Apr 09, 2021
by
liuzhe-lz
Committed by
GitHub
Apr 09, 2021
Browse files
Add native support for v2 config (#3466)
parent
6aaca5f7
Changes
69
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
382 additions
and
773 deletions
+382
-773
ts/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+11
-15
ts/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+2
-6
ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+48
-133
ts/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+13
-13
ts/nni_manager/training_service/reusable/environment.ts
ts/nni_manager/training_service/reusable/environment.ts
+0
-1
ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts
...ervice/reusable/environments/environmentServiceFactory.ts
+6
-5
ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts
..._service/reusable/environments/localEnvironmentService.ts
+2
-17
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
...ervice/reusable/environments/openPaiEnvironmentService.ts
+31
-93
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+30
-79
ts/nni_manager/training_service/reusable/remote/remoteConfig.ts
..._manager/training_service/reusable/remote/remoteConfig.ts
+2
-14
ts/nni_manager/training_service/reusable/routerTrainingService.ts
...anager/training_service/reusable/routerTrainingService.ts
+15
-92
ts/nni_manager/training_service/reusable/sharedStorage.ts
ts/nni_manager/training_service/reusable/sharedStorage.ts
+2
-7
ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
...rvice/reusable/shared_storages/azureblobStorageService.ts
+28
-56
ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts
...ing_service/reusable/shared_storages/nfsStorageService.ts
+18
-41
ts/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
...er/training_service/reusable/test/trialDispatcher.test.ts
+66
-64
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+53
-88
ts/nni_manager/training_service/test/localTrainingService.test.ts
...anager/training_service/test/localTrainingService.test.ts
+32
-26
ts/webui/mock/all-types-metric.json
ts/webui/mock/all-types-metric.json
+18
-20
ts/webui/package.json
ts/webui/package.json
+3
-0
ts/webui/src/components/Overview.tsx
ts/webui/src/components/Overview.tsx
+2
-3
No files found.
ts/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
817ec68b
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
'
use strict
'
;
'
use strict
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
RemoteMachineConfig
}
from
'
../../common/experimentConfig
'
;
import
{
GPUInfo
,
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
ShellExecutor
}
from
'
./shellExecutor
'
;
import
{
ShellExecutor
}
from
'
./shellExecutor
'
;
...
@@ -11,19 +12,14 @@ import { ShellExecutor } from './shellExecutor';
...
@@ -11,19 +12,14 @@ import { ShellExecutor } from './shellExecutor';
* Metadata of remote machine for configuration and statuc query
* Metadata of remote machine for configuration and statuc query
*/
*/
export
class
RemoteMachineMeta
{
export
class
RemoteMachineMeta
{
public
readonly
ip
:
string
=
''
;
public
readonly
config
:
RemoteMachineConfig
;
public
readonly
port
:
number
=
22
;
public
readonly
username
:
string
=
''
;
public
readonly
passwd
:
string
=
''
;
public
readonly
sshKeyPath
?:
string
;
public
readonly
passphrase
?:
string
;
public
gpuSummary
:
GPUSummary
|
undefined
;
public
gpuSummary
:
GPUSummary
|
undefined
;
public
readonly
gpuIndices
?:
string
;
public
occupiedGpuIndexMap
:
Map
<
number
,
number
>
;
public
readonly
maxTrialNumPerGpu
?:
number
;
//TODO: initialize varialbe in constructor
constructor
(
config
:
RemoteMachineConfig
)
{
public
occupiedGpuIndexMap
?:
Map
<
number
,
number
>
;
this
.
config
=
config
;
public
readonly
useActiveGpu
?:
boolean
=
false
;
this
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
()
;
public
readonly
pythonPath
?:
string
;
}
}
}
/**
/**
...
@@ -74,13 +70,13 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
...
@@ -74,13 +70,13 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
* The remote machine executor manager
* The remote machine executor manager
*/
*/
export
class
ExecutorManager
{
export
class
ExecutorManager
{
public
readonly
rmMeta
:
RemoteMachineMeta
;
private
readonly
executorMap
:
Map
<
string
,
ShellExecutor
>
=
new
Map
<
string
,
ShellExecutor
>
();
private
readonly
executorMap
:
Map
<
string
,
ShellExecutor
>
=
new
Map
<
string
,
ShellExecutor
>
();
private
readonly
rmMeta
:
RemoteMachineMeta
;
private
executors
:
ShellExecutor
[]
=
[];
private
executors
:
ShellExecutor
[]
=
[];
constructor
(
rmMeta
:
RemoteMachine
Meta
)
{
constructor
(
config
:
RemoteMachine
Config
)
{
this
.
rmMeta
=
rmMeta
;
this
.
rmMeta
=
new
RemoteMachineMeta
(
config
)
;
}
}
public
async
getExecutor
(
id
:
string
):
Promise
<
ShellExecutor
>
{
public
async
getExecutor
(
id
:
string
):
Promise
<
ShellExecutor
>
{
...
...
ts/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
817ec68b
...
@@ -3,8 +3,6 @@
...
@@ -3,8 +3,6 @@
'
use strict
'
;
'
use strict
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
RemoteMachineTrainingService
}
from
'
./remoteMachineTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
./remoteMachineTrainingService
'
;
...
@@ -12,17 +10,15 @@ import { RemoteMachineTrainingService } from './remoteMachineTrainingService';
...
@@ -12,17 +10,15 @@ import { RemoteMachineTrainingService } from './remoteMachineTrainingService';
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
*
*
*/
*/
@
component
.
Singleton
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
/**
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
*/
constructor
()
{
constructor
(
remoteMachineTrainingService
:
RemoteMachineTrainingService
)
{
super
();
super
();
this
.
remoteMachineTrainingService
=
component
.
get
(
R
emoteMachineTrainingService
)
;
this
.
remoteMachineTrainingService
=
r
emoteMachineTrainingService
;
}
}
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
...
...
ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
817ec68b
...
@@ -15,70 +15,77 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
...
@@ -15,70 +15,77 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
import
{
HyperParameters
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
HyperParameters
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
}
from
'
../../common/utils
'
;
import
{
ExperimentConfig
,
RemoteConfig
,
RemoteMachineConfig
,
flattenConfig
}
from
'
../../common/experimentConfig
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
import
{
ExecutorManager
,
RemoteMachineMeta
,
ExecutorManager
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
}
from
'
./remoteMachineData
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
interface
FlattenRemoteConfig
extends
ExperimentConfig
,
RemoteConfig
{
}
/**
/**
* Training Service implementation for Remote Machine (Linux)
* Training Service implementation for Remote Machine (Linux)
*/
*/
@
component
.
Singleton
@
component
.
Singleton
class
RemoteMachineTrainingService
implements
TrainingService
{
class
RemoteMachineTrainingService
implements
TrainingService
{
private
readonly
initExecutorId
=
"
initConnection
"
;
private
readonly
initExecutorId
=
"
initConnection
"
;
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachine
Meta
,
ExecutorManager
>
;
//machine excutor map
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachine
Config
,
ExecutorManager
>
;
//machine excutor map
private
readonly
machineCopyExpCodeDirPromiseMap
:
Map
<
RemoteMachine
Meta
,
Promise
<
void
>>
;
private
readonly
machineCopyExpCodeDirPromiseMap
:
Map
<
RemoteMachine
Config
,
Promise
<
void
>>
;
private
readonly
trialExecutorManagerMap
:
Map
<
string
,
ExecutorManager
>
;
//trial excutor map
private
readonly
trialExecutorManagerMap
:
Map
<
string
,
ExecutorManager
>
;
//trial excutor map
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
expRootDir
:
string
;
private
readonly
expRootDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
?:
GPUScheduler
;
private
gpuScheduler
?:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
private
readonly
timer
:
ObservableTimer
;
private
readonly
timer
:
ObservableTimer
;
private
stopping
:
boolean
=
false
;
private
stopping
:
boolean
=
false
;
private
readonly
metricsEmitter
:
EventEmitter
;
private
readonly
metricsEmitter
:
EventEmitter
;
private
readonly
log
:
Logger
;
private
readonly
log
:
Logger
;
private
isMultiPhase
:
boolean
=
false
;
private
remoteRestServerPort
?:
number
;
private
remoteRestServerPort
?:
number
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
versionCheck
:
boolean
=
true
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
;
private
logCollection
:
string
=
'
none
'
;
private
sshConnectionPromises
:
any
[];
private
sshConnectionPromises
:
any
[];
private
config
:
FlattenRemoteConfig
;
constructor
(
@
co
mponent
.
Inject
timer
:
ObservableTimer
)
{
constructor
(
co
nfig
:
ExperimentConfig
)
{
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialExecutorManagerMap
=
new
Map
<
string
,
ExecutorManager
>
();
this
.
trialExecutorManagerMap
=
new
Map
<
string
,
ExecutorManager
>
();
this
.
machineCopyExpCodeDirPromiseMap
=
new
Map
<
RemoteMachine
Meta
,
Promise
<
void
>>
();
this
.
machineCopyExpCodeDirPromiseMap
=
new
Map
<
RemoteMachine
Config
,
Promise
<
void
>>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachine
Meta
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachine
Config
,
ExecutorManager
>
();
this
.
jobQueue
=
[];
this
.
jobQueue
=
[];
this
.
sshConnectionPromises
=
[];
this
.
sshConnectionPromises
=
[];
this
.
expRootDir
=
getExperimentRootDir
();
this
.
expRootDir
=
getExperimentRootDir
();
this
.
timer
=
t
imer
;
this
.
timer
=
component
.
get
(
ObservableT
imer
)
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
logCollection
=
'
none
'
;
this
.
log
.
info
(
'
Construct remote machine training service.
'
);
this
.
log
.
info
(
'
Construct remote machine training service.
'
);
this
.
config
=
flattenConfig
(
config
,
'
remote
'
);
if
(
!
fs
.
lstatSync
(
this
.
config
.
trialCodeDirectory
).
isDirectory
())
{
throw
new
Error
(
`codeDir
${
this
.
config
.
trialCodeDirectory
}
is not a directory`
);
}
validateCodeDir
(
this
.
config
.
trialCodeDirectory
);
this
.
sshConnectionPromises
=
this
.
config
.
machineList
.
map
(
machine
=>
this
.
initRemoteMachineOnConnected
(
machine
)
);
}
}
/**
/**
* Loop to launch trial jobs and collect trial metrics
* Loop to launch trial jobs and collect trial metrics
*/
*/
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
const
restServer
=
new
RemoteMachineJobRestServer
(
this
);
await
restServer
.
start
();
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
'
Run remote machine training service.
'
);
this
.
log
.
info
(
'
Run remote machine training service.
'
);
...
@@ -89,16 +96,13 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -89,16 +96,13 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
sshConnectionPromises
=
[];
this
.
sshConnectionPromises
=
[];
// initialize gpuScheduler
// initialize gpuScheduler
this
.
gpuScheduler
=
new
GPUScheduler
(
this
.
machineExecutorManagerMap
);
this
.
gpuScheduler
=
new
GPUScheduler
(
this
.
machineExecutorManagerMap
);
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
"
trial config not initialized!
"
);
}
// Copy codeDir to remote machine
// Copy codeDir to remote machine
for
(
const
[
rmMeta
,
executorManager
]
of
this
.
machineExecutorManagerMap
.
entries
())
{
for
(
const
[
machineConfig
,
executorManager
]
of
this
.
machineExecutorManagerMap
.
entries
())
{
const
executor
:
ShellExecutor
=
await
executorManager
.
getExecutor
(
this
.
initExecutorId
);
const
executor
:
ShellExecutor
=
await
executorManager
.
getExecutor
(
this
.
initExecutorId
);
if
(
executor
!==
undefined
)
{
if
(
executor
!==
undefined
)
{
this
.
machineCopyExpCodeDirPromiseMap
.
set
(
this
.
machineCopyExpCodeDirPromiseMap
.
set
(
rmMeta
,
machineConfig
,
executor
.
copyDirectoryToRemote
(
this
.
trialConfig
.
codeDir
,
executor
.
getRemoteCodePath
(
getExperimentId
()))
executor
.
copyDirectoryToRemote
(
this
.
config
.
trialCodeDirectory
,
executor
.
getRemoteCodePath
(
getExperimentId
()))
);
);
}
}
}
}
...
@@ -134,7 +138,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -134,7 +138,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
trial
.
rmMeta
===
undefined
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
}
const
executorManager
:
ExecutorManager
|
undefined
=
this
.
machineExecutorManagerMap
.
get
(
trial
.
rmMeta
);
const
executorManager
:
ExecutorManager
|
undefined
=
this
.
machineExecutorManagerMap
.
get
(
trial
.
rmMeta
.
config
);
if
(
executorManager
===
undefined
)
{
if
(
executorManager
===
undefined
)
{
throw
new
Error
(
`executorManager not initialized`
);
throw
new
Error
(
`executorManager not initialized`
);
}
}
...
@@ -225,10 +229,6 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -225,10 +229,6 @@ class RemoteMachineTrainingService implements TrainingService {
* @param form trial job description form
* @param form trial job description form
*/
*/
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
// Generate trial job id(random)
// Generate trial job id(random)
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialJobId
:
string
=
uniqueString
(
5
);
...
@@ -260,13 +260,6 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -260,13 +260,6 @@ class RemoteMachineTrainingService implements TrainingService {
return
trialJobDetail
;
return
trialJobDetail
;
}
}
/**
* Is multiphase job supported in current training service
*/
public
get
isMultiPhaseJobSupported
():
boolean
{
return
true
;
}
/**
/**
* Cancel trial job
* Cancel trial job
* @param trialJobId ID of trial job
* @param trialJobId ID of trial job
...
@@ -311,70 +304,8 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -311,70 +304,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
}
}
/**
public
async
setClusterMetadata
(
_key
:
string
,
_value
:
string
):
Promise
<
void
>
{
return
;
}
* Set culster metadata
public
async
getClusterMetadata
(
_key
:
string
):
Promise
<
string
>
{
return
''
;
}
* @param key metadata key
* //1. MACHINE_LIST -- create executor of machine list
* //2. TRIAL_CONFIG -- trial configuration
* @param value metadata value
*/
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
remoteMachineTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
// codeDir is not a valid directory, throw Error
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
)
.
isDirectory
())
{
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
try
{
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
this
.
trialConfig
=
remoteMachineTrailConfig
;
break
;
}
case
TrialConfigMetadataKey
.
MULTI_PHASE
:
this
.
isMultiPhase
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
case
TrialConfigMetadataKey
.
REMOTE_CONFIG
:
// Add remote_config in remoteEnvironmentService to set reuse mode,
// this config need to be catched here, otherwise will throw Unknown key exception here
break
;
default
:
//Reject for unknown keys
throw
new
Error
(
`Uknown key:
${
key
}
`
);
}
}
/**
* Get culster metadata
* @param key metadata key
*/
public
async
getClusterMetadata
(
_key
:
string
):
Promise
<
string
>
{
return
""
;
}
/**
/**
* cleanup() has a time out of 10s to clean remote connections
* cleanup() has a time out of 10s to clean remote connections
...
@@ -426,23 +357,12 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -426,23 +357,12 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
}
}
private
async
setupConnections
(
machineList
:
string
):
Promise
<
void
>
{
private
async
initRemoteMachineOnConnected
(
machineConfig
:
RemoteMachineConfig
):
Promise
<
void
>
{
this
.
log
.
debug
(
`Connecting to remote machines:
${
machineList
}
`
);
const
executorManager
:
ExecutorManager
=
new
ExecutorManager
(
machineConfig
);
//TO DO: verify if value's format is wrong, and json parse failed, how to handle error
this
.
log
.
info
(
`connecting to
${
machineConfig
.
user
}
@
${
machineConfig
.
host
}
:
${
machineConfig
.
port
}
`
);
const
rmMetaList
:
RemoteMachineMeta
[]
=
<
RemoteMachineMeta
[]
>
JSON
.
parse
(
machineList
);
for
(
const
rmMeta
of
rmMetaList
)
{
this
.
sshConnectionPromises
.
push
(
this
.
initRemoteMachineOnConnected
(
rmMeta
));
}
}
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
):
Promise
<
void
>
{
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
const
executorManager
:
ExecutorManager
=
new
ExecutorManager
(
rmMeta
);
this
.
log
.
info
(
`connecting to
${
rmMeta
.
username
}
@
${
rmMeta
.
ip
}
:
${
rmMeta
.
port
}
`
);
const
executor
:
ShellExecutor
=
await
executorManager
.
getExecutor
(
this
.
initExecutorId
);
const
executor
:
ShellExecutor
=
await
executorManager
.
getExecutor
(
this
.
initExecutorId
);
this
.
log
.
debug
(
`reached
${
executor
.
name
}
`
);
this
.
log
.
debug
(
`reached
${
executor
.
name
}
`
);
this
.
machineExecutorManagerMap
.
set
(
rmMeta
,
executorManager
);
this
.
machineExecutorManagerMap
.
set
(
machineConfig
,
executorManager
);
this
.
log
.
debug
(
`initializing
${
executor
.
name
}
`
);
this
.
log
.
debug
(
`initializing
${
executor
.
name
}
`
);
// Create root working directory after executor is ready
// Create root working directory after executor is ready
...
@@ -469,15 +389,15 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -469,15 +389,15 @@ class RemoteMachineTrainingService implements TrainingService {
collectingCount
.
push
(
true
);
collectingCount
.
push
(
true
);
const
cmdresult
=
await
executor
.
readLastLines
(
executor
.
joinPath
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics
'
));
const
cmdresult
=
await
executor
.
readLastLines
(
executor
.
joinPath
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics
'
));
if
(
cmdresult
!==
""
)
{
if
(
cmdresult
!==
""
)
{
rmMeta
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
);
executorManager
.
rmMeta
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
);
if
(
rmMeta
.
gpuSummary
.
gpuCount
===
0
)
{
if
(
executorManager
.
rmMeta
.
gpuSummary
.
gpuCount
===
0
)
{
this
.
log
.
warning
(
`No GPU found on remote machine
${
rmMeta
.
ip
}
`
);
this
.
log
.
warning
(
`No GPU found on remote machine
${
machineConfig
.
host
}
`
);
this
.
timer
.
unsubscribe
(
disposable
);
this
.
timer
.
unsubscribe
(
disposable
);
}
}
}
}
if
(
this
.
stopping
)
{
if
(
this
.
stopping
)
{
this
.
timer
.
unsubscribe
(
disposable
);
this
.
timer
.
unsubscribe
(
disposable
);
this
.
log
.
debug
(
`Stopped GPU collector on
${
rmMeta
.
ip
}
, since experiment is exiting.`
);
this
.
log
.
debug
(
`Stopped GPU collector on
${
machineConfig
.
host
}
, since experiment is exiting.`
);
}
}
collectingCount
.
pop
();
collectingCount
.
pop
();
}
}
...
@@ -488,9 +408,6 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -488,9 +408,6 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
prepareTrialJob
(
trialJobId
:
string
):
Promise
<
boolean
>
{
private
async
prepareTrialJob
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
this
.
gpuScheduler
===
undefined
)
{
if
(
this
.
gpuScheduler
===
undefined
)
{
throw
new
Error
(
'
gpuScheduler is not initialized
'
);
throw
new
Error
(
'
gpuScheduler is not initialized
'
);
}
}
...
@@ -505,9 +422,9 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -505,9 +422,9 @@ class RemoteMachineTrainingService implements TrainingService {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// get an executor from scheduler
// get an executor from scheduler
const
rmScheduleResult
:
RemoteMachineScheduleResult
=
this
.
gpuScheduler
.
scheduleMachine
(
this
.
trialConfig
.
g
puNum
,
trialJobDetail
);
const
rmScheduleResult
:
RemoteMachineScheduleResult
=
this
.
gpuScheduler
.
scheduleMachine
(
this
.
config
.
trialG
puNum
ber
,
trialJobDetail
);
if
(
rmScheduleResult
.
resultType
===
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
)
{
if
(
rmScheduleResult
.
resultType
===
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
)
{
const
errorMessage
:
string
=
`Required GPU number
${
this
.
trialConfig
.
g
puNum
}
is too large, no machine can meet`
;
const
errorMessage
:
string
=
`Required GPU number
${
this
.
config
.
trialG
puNum
ber
}
is too large, no machine can meet`
;
this
.
log
.
error
(
errorMessage
);
this
.
log
.
error
(
errorMessage
);
deferred
.
reject
();
deferred
.
reject
();
throw
new
NNIError
(
NNIErrorNames
.
RESOURCE_NOT_AVAILABLE
,
errorMessage
);
throw
new
NNIError
(
NNIErrorNames
.
RESOURCE_NOT_AVAILABLE
,
errorMessage
);
...
@@ -516,7 +433,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -516,7 +433,7 @@ class RemoteMachineTrainingService implements TrainingService {
const
rmScheduleInfo
:
RemoteMachineScheduleInfo
=
rmScheduleResult
.
scheduleInfo
;
const
rmScheduleInfo
:
RemoteMachineScheduleInfo
=
rmScheduleResult
.
scheduleInfo
;
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
const
copyExpCodeDirPromise
=
this
.
machineCopyExpCodeDirPromiseMap
.
get
(
trialJobDetail
.
rmMeta
);
const
copyExpCodeDirPromise
=
this
.
machineCopyExpCodeDirPromiseMap
.
get
(
rmScheduleInfo
.
rmMeta
.
config
);
if
(
copyExpCodeDirPromise
!==
undefined
)
{
if
(
copyExpCodeDirPromise
!==
undefined
)
{
await
copyExpCodeDirPromise
;
await
copyExpCodeDirPromise
;
}
}
...
@@ -530,7 +447,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -530,7 +447,7 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobId
,
trialJobDetail
.
form
,
rmScheduleInfo
);
trialJobId
,
trialJobDetail
.
form
,
rmScheduleInfo
);
trialJobDetail
.
status
=
'
RUNNING
'
;
trialJobDetail
.
status
=
'
RUNNING
'
;
trialJobDetail
.
url
=
`file://
${
rmScheduleInfo
.
rmMeta
.
ip
}
:
${
trialJobDetail
.
workingDirectory
}
`
;
trialJobDetail
.
url
=
`file://
${
rmScheduleInfo
.
rmMeta
.
config
.
host
}
:
${
trialJobDetail
.
workingDirectory
}
`
;
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
startTime
=
Date
.
now
();
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
...
@@ -547,9 +464,6 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -547,9 +464,6 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
launchTrialOnScheduledMachine
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
,
private
async
launchTrialOnScheduledMachine
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
,
rmScheduleInfo
:
RemoteMachineScheduleInfo
):
Promise
<
void
>
{
rmScheduleInfo
:
RemoteMachineScheduleInfo
):
Promise
<
void
>
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
cudaVisibleDevice
:
string
=
rmScheduleInfo
.
cudaVisibleDevice
;
const
cudaVisibleDevice
:
string
=
rmScheduleInfo
.
cudaVisibleDevice
;
const
executor
=
await
this
.
getExecutor
(
trialJobId
);
const
executor
=
await
this
.
getExecutor
(
trialJobId
);
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
...
@@ -568,7 +482,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -568,7 +482,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Set CUDA_VISIBLE_DEVICES environment variable based on cudaVisibleDevice
// Set CUDA_VISIBLE_DEVICES environment variable based on cudaVisibleDevice
// If no valid cudaVisibleDevice is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
// If no valid cudaVisibleDevice is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
// If gpuNum is undefined, will not set CUDA_VISIBLE_DEVICES in script
// If gpuNum is undefined, will not set CUDA_VISIBLE_DEVICES in script
if
(
this
.
trialConfig
.
g
puNum
===
undefined
)
{
if
(
this
.
config
.
trialG
puNum
ber
===
undefined
)
{
cudaVisible
=
""
cudaVisible
=
""
}
else
{
}
else
{
if
(
typeof
cudaVisibleDevice
===
'
string
'
&&
cudaVisibleDevice
.
length
>
0
)
{
if
(
typeof
cudaVisibleDevice
===
'
string
'
&&
cudaVisibleDevice
.
length
>
0
)
{
...
@@ -577,7 +491,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -577,7 +491,7 @@ class RemoteMachineTrainingService implements TrainingService {
cudaVisible
=
`CUDA_VISIBLE_DEVICES=" "`
;
cudaVisible
=
`CUDA_VISIBLE_DEVICES=" "`
;
}
}
}
}
const
nniManagerIp
:
string
=
this
.
nniManagerIp
Config
?
this
.
nniManagerIpC
onfig
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
config
.
nniManagerIp
?
this
.
c
onfig
.
nniManagerIp
:
getIPV4Address
();
if
(
this
.
remoteRestServerPort
===
undefined
)
{
if
(
this
.
remoteRestServerPort
===
undefined
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
...
@@ -588,12 +502,13 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -588,12 +502,13 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobId
,
trialJobId
,
getExperimentId
(),
getExperimentId
(),
trialJobDetail
.
form
.
sequenceId
.
toString
(),
trialJobDetail
.
form
.
sequenceId
.
toString
(),
this
.
isM
ulti
P
hase
,
false
,
// m
ulti
-p
hase
this
.
trialConfig
.
c
ommand
,
this
.
config
.
trialC
ommand
,
nniManagerIp
,
nniManagerIp
,
this
.
remoteRestServerPort
,
this
.
remoteRestServerPort
,
version
,
version
,
this
.
logCollection
,
cudaVisible
);
this
.
logCollection
,
cudaVisible
);
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
...
...
ts/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
817ec68b
...
@@ -44,24 +44,24 @@ class ShellExecutor {
...
@@ -44,24 +44,24 @@ class ShellExecutor {
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
connectConfig
:
ConnectConfig
=
{
const
connectConfig
:
ConnectConfig
=
{
host
:
rmMeta
.
ip
,
host
:
rmMeta
.
config
.
host
,
port
:
rmMeta
.
port
,
port
:
rmMeta
.
config
.
port
,
username
:
rmMeta
.
user
name
,
username
:
rmMeta
.
config
.
user
,
tryKeyboard
:
true
,
tryKeyboard
:
true
,
};
};
this
.
pythonPath
=
rmMeta
.
pythonPath
;
this
.
pythonPath
=
rmMeta
.
config
.
pythonPath
;
this
.
name
=
`
${
rmMeta
.
username
}
@
${
rmMeta
.
ip
}
:
${
rmMeta
.
port
}
`
;
this
.
name
=
`
${
rmMeta
.
config
.
user
}
@
${
rmMeta
.
config
.
host
}
:
${
rmMeta
.
config
.
port
}
`
;
if
(
rmMeta
.
passwd
!==
undefined
)
{
if
(
rmMeta
.
config
.
passw
or
d
!==
undefined
)
{
connectConfig
.
password
=
rmMeta
.
passwd
;
connectConfig
.
password
=
rmMeta
.
config
.
passw
or
d
;
}
else
if
(
rmMeta
.
sshKey
Path
!==
undefined
)
{
}
else
if
(
rmMeta
.
config
.
sshKey
File
!==
undefined
)
{
if
(
!
fs
.
existsSync
(
rmMeta
.
sshKey
Path
))
{
if
(
!
fs
.
existsSync
(
rmMeta
.
config
.
sshKey
File
))
{
//SSh key path is not a valid file, reject
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
rmMeta
.
sshKey
Path
}
does not exist.`
));
deferred
.
reject
(
new
Error
(
`
${
rmMeta
.
config
.
sshKey
File
}
does not exist.`
));
}
}
const
privateKey
:
string
=
fs
.
readFileSync
(
rmMeta
.
sshKey
Path
,
'
utf8
'
);
const
privateKey
:
string
=
fs
.
readFileSync
(
rmMeta
.
config
.
sshKey
File
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
rmMeta
.
p
assphrase
;
connectConfig
.
passphrase
=
rmMeta
.
config
.
sshP
assphrase
;
}
else
{
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
}
...
@@ -101,7 +101,7 @@ class ShellExecutor {
...
@@ -101,7 +101,7 @@ class ShellExecutor {
// SSH connection error, reject with error message
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
on
(
"
keyboard-interactive
"
,
(
_name
,
_instructions
,
_lang
,
_prompts
,
finish
)
=>
{
}).
on
(
"
keyboard-interactive
"
,
(
_name
,
_instructions
,
_lang
,
_prompts
,
finish
)
=>
{
finish
([
rmMeta
.
passwd
]);
finish
([
rmMeta
.
config
.
password
||
''
]);
}).
connect
(
connectConfig
);
}).
connect
(
connectConfig
);
return
deferred
.
promise
;
return
deferred
.
promise
;
...
...
ts/nni_manager/training_service/reusable/environment.ts
View file @
817ec68b
...
@@ -129,7 +129,6 @@ export class EnvironmentInformation {
...
@@ -129,7 +129,6 @@ export class EnvironmentInformation {
export
abstract
class
EnvironmentService
{
export
abstract
class
EnvironmentService
{
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
...
...
ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts
View file @
817ec68b
...
@@ -3,18 +3,19 @@ import { OpenPaiEnvironmentService } from './openPaiEnvironmentService';
...
@@ -3,18 +3,19 @@ import { OpenPaiEnvironmentService } from './openPaiEnvironmentService';
import
{
LocalEnvironmentService
}
from
'
./localEnvironmentService
'
;
import
{
LocalEnvironmentService
}
from
'
./localEnvironmentService
'
;
import
{
RemoteEnvironmentService
}
from
'
./remoteEnvironmentService
'
;
import
{
RemoteEnvironmentService
}
from
'
./remoteEnvironmentService
'
;
import
{
EnvironmentService
}
from
'
../environment
'
;
import
{
EnvironmentService
}
from
'
../environment
'
;
import
{
ExperimentConfig
}
from
'
../../../common/experimentConfig
'
;
export
class
EnvironmentServiceFactory
{
export
class
EnvironmentServiceFactory
{
public
static
createEnvironmentService
(
name
:
string
):
EnvironmentService
{
public
static
createEnvironmentService
(
name
:
string
,
config
:
ExperimentConfig
):
EnvironmentService
{
switch
(
name
)
{
switch
(
name
)
{
case
'
local
'
:
case
'
local
'
:
return
new
LocalEnvironmentService
();
return
new
LocalEnvironmentService
(
config
);
case
'
remote
'
:
case
'
remote
'
:
return
new
RemoteEnvironmentService
();
return
new
RemoteEnvironmentService
(
config
);
case
'
aml
'
:
case
'
aml
'
:
return
new
AMLEnvironmentService
();
return
new
AMLEnvironmentService
();
case
'
pai
'
:
case
'
open
pai
'
:
return
new
OpenPaiEnvironmentService
();
return
new
OpenPaiEnvironmentService
(
config
);
default
:
default
:
throw
new
Error
(
`
${
name
}
not supported!`
);
throw
new
Error
(
`
${
name
}
not supported!`
);
}
}
...
...
ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts
View file @
817ec68b
...
@@ -9,9 +9,8 @@ import * as tkill from 'tree-kill';
...
@@ -9,9 +9,8 @@ import * as tkill from 'tree-kill';
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/
trialConfigMetadataKey
'
;
import
{
ExperimentConfig
}
from
'
../../
../
common/
experimentConfig
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
TrialConfig
}
from
'
../../common/trialConfig
'
;
import
{
getExperimentRootDir
,
isAlive
,
getNewLine
}
from
'
../../../common/utils
'
;
import
{
getExperimentRootDir
,
isAlive
,
getNewLine
}
from
'
../../../common/utils
'
;
import
{
execMkdir
,
runScript
,
getScriptName
,
execCopydir
}
from
'
../../common/util
'
;
import
{
execMkdir
,
runScript
,
getScriptName
,
execCopydir
}
from
'
../../common/util
'
;
import
{
SharedStorageService
}
from
'
../sharedStorage
'
import
{
SharedStorageService
}
from
'
../sharedStorage
'
...
@@ -20,11 +19,10 @@ import { SharedStorageService } from '../sharedStorage'
...
@@ -20,11 +19,10 @@ import { SharedStorageService } from '../sharedStorage'
export
class
LocalEnvironmentService
extends
EnvironmentService
{
export
class
LocalEnvironmentService
extends
EnvironmentService
{
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
private
localTrialConfig
:
TrialConfig
|
undefined
;
private
experimentRootDir
:
string
;
private
experimentRootDir
:
string
;
private
experimentId
:
string
;
private
experimentId
:
string
;
constructor
()
{
constructor
(
_config
:
ExperimentConfig
)
{
super
();
super
();
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
experimentRootDir
=
getExperimentRootDir
();
this
.
experimentRootDir
=
getExperimentRootDir
();
...
@@ -42,16 +40,6 @@ export class LocalEnvironmentService extends EnvironmentService {
...
@@ -42,16 +40,6 @@ export class LocalEnvironmentService extends EnvironmentService {
return
'
local
'
;
return
'
local
'
;
}
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
localTrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
break
;
default
:
this
.
log
.
debug
(
`Local mode does not proccess metadata key: '
${
key
}
', value: '
${
value
}
'`
);
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
environments
.
forEach
(
async
(
environment
)
=>
{
environments
.
forEach
(
async
(
environment
)
=>
{
const
jobpidPath
:
string
=
`
${
path
.
join
(
environment
.
runnerWorkingFolder
,
'
pid
'
)}
`
;
const
jobpidPath
:
string
=
`
${
path
.
join
(
environment
.
runnerWorkingFolder
,
'
pid
'
)}
`
;
...
@@ -118,9 +106,6 @@ export class LocalEnvironmentService extends EnvironmentService {
...
@@ -118,9 +106,6 @@ export class LocalEnvironmentService extends EnvironmentService {
}
}
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
this
.
localTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Local trial config is not initialized
'
);
}
// Need refactor, this temp folder path is not appropriate, there are two expId in this path
// Need refactor, this temp folder path is not appropriate, there are two expId in this path
const
sharedStorageService
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
);
const
sharedStorageService
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
);
if
(
environment
.
useSharedStorage
&&
sharedStorageService
.
canLocalMounted
)
{
if
(
environment
.
useSharedStorage
&&
sharedStorageService
.
canLocalMounted
)
{
...
...
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
View file @
817ec68b
...
@@ -3,20 +3,20 @@
...
@@ -3,20 +3,20 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
yaml
from
'
js-yaml
'
;
import
*
as
yaml
from
'
js-yaml
'
;
import
*
as
request
from
'
request
'
;
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
ExperimentConfig
,
OpenpaiConfig
,
flattenConfig
,
toMegaBytes
}
from
'
../../../common/experimentConfig
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
PAIClusterConfig
}
from
'
../../pai/paiConfig
'
;
import
{
PAIClusterConfig
}
from
'
../../pai/paiConfig
'
;
import
{
NNIPAITrialConfig
}
from
'
../../pai/paiConfig
'
;
import
{
NNIPAITrialConfig
}
from
'
../../pai/paiConfig
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
SharedStorageService
}
from
'
../sharedStorage
'
;
import
{
SharedStorageService
}
from
'
../sharedStorage
'
;
import
{
StorageService
}
from
'
../storageService
'
;
import
{
Mounted
StorageService
}
from
'
../storage
s/mountedStorage
Service
'
;
interface
FlattenOpenpaiConfig
extends
ExperimentConfig
,
OpenpaiConfig
{
}
/**
/**
* Collector PAI jobs info from PAI cluster, and update pai job status locally
* Collector PAI jobs info from PAI cluster, and update pai job status locally
...
@@ -27,15 +27,22 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -27,15 +27,22 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
private
paiClusterConfig
:
PAIClusterConfig
|
undefined
;
private
paiClusterConfig
:
PAIClusterConfig
|
undefined
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiJobConfig
:
any
;
private
paiToken
:
string
;
private
paiToken
?:
string
;
private
protocol
:
string
;
private
protocol
:
string
=
'
http
'
;
private
experimentId
:
string
;
private
experimentId
:
string
;
private
config
:
FlattenOpenpaiConfig
;
constructor
()
{
constructor
(
config
:
ExperimentConfig
)
{
super
();
super
();
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
config
=
flattenConfig
(
config
,
'
openpai
'
);
this
.
paiToken
=
this
.
config
.
token
;
this
.
protocol
=
this
.
config
.
host
.
toLowerCase
().
startsWith
(
'
https://
'
)
?
'
https
'
:
'
http
'
;
// FIXME: only support MountedStorageService
const
storageService
=
new
MountedStorageService
();
const
remoteRoot
=
storageService
.
joinPath
(
this
.
config
.
localStorageMountPoint
,
this
.
experimentId
);
storageService
.
initialize
(
this
.
config
.
localStorageMountPoint
,
remoteRoot
);
}
}
public
get
environmentMaintenceLoopInterval
():
number
{
public
get
environmentMaintenceLoopInterval
():
number
{
...
@@ -50,58 +57,15 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -50,58 +57,15 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
return
'
pai
'
;
return
'
pai
'
;
}
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
PAI_CLUSTER_CONFIG
:
this
.
paiClusterConfig
=
<
PAIClusterConfig
>
JSON
.
parse
(
value
);
this
.
paiClusterConfig
.
host
=
this
.
formatPAIHost
(
this
.
paiClusterConfig
.
host
);
this
.
paiToken
=
this
.
paiClusterConfig
.
token
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
}
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
const
storageService
=
component
.
get
<
StorageService
>
(
StorageService
);
const
remoteRoot
=
storageService
.
joinPath
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
);
storageService
.
initialize
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
remoteRoot
);
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
if
(
this
.
paiClusterConfig
.
gpuNum
===
undefined
)
{
this
.
paiClusterConfig
.
gpuNum
=
this
.
paiTrialConfig
.
gpuNum
;
}
if
(
this
.
paiClusterConfig
.
cpuNum
===
undefined
)
{
this
.
paiClusterConfig
.
cpuNum
=
this
.
paiTrialConfig
.
cpuNum
;
}
if
(
this
.
paiClusterConfig
.
memoryMB
===
undefined
)
{
this
.
paiClusterConfig
.
memoryMB
=
this
.
paiTrialConfig
.
memoryMB
;
}
break
;
}
default
:
this
.
log
.
debug
(
`OpenPAI not proccessed metadata key: '
${
key
}
', value: '
${
value
}
'`
);
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiToken
===
undefined
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
throw
new
Error
(
'
PAI token is not initialized
'
);
}
}
const
getJobInfoRequest
:
request
.
Options
=
{
const
getJobInfoRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterC
onfig
.
host
}
/rest-server/api/v2/jobs?username=
${
this
.
paiClusterC
onfig
.
user
N
ame
}
`
,
uri
:
`
${
this
.
c
onfig
.
host
}
/rest-server/api/v2/jobs?username=
${
this
.
c
onfig
.
user
n
ame
}
`
,
method
:
'
GET
'
,
method
:
'
GET
'
,
json
:
true
,
json
:
true
,
headers
:
{
headers
:
{
...
@@ -168,29 +132,22 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -168,29 +132,22 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiToken
===
undefined
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
throw
new
Error
(
'
PAI token is not initialized
'
);
}
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
PAI trial config is not initialized
'
);
}
// Step 1. Prepare PAI job configuration
// Step 1. Prepare PAI job configuration
let
environmentRoot
:
string
;
let
environmentRoot
:
string
;
if
(
environment
.
useSharedStorage
)
{
if
(
environment
.
useSharedStorage
)
{
environmentRoot
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteWorkingRoot
;
environmentRoot
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteWorkingRoot
;
environment
.
command
=
`
${
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteMountCommand
.
replace
(
/echo -e /g
,
`echo `
).
replace
(
/echo /g
,
`echo -e `
)}
&& cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
;
environment
.
command
=
`
${
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteMountCommand
.
replace
(
/echo -e /g
,
`echo `
).
replace
(
/echo /g
,
`echo -e `
)}
&& cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
;
}
else
{
}
else
{
environmentRoot
=
`
${
this
.
paiTrialC
onfig
.
container
NFS
MountP
ath
}
/
${
this
.
experimentId
}
`
;
environmentRoot
=
`
${
this
.
c
onfig
.
container
Storage
MountP
oint
}
/
${
this
.
experimentId
}
`
;
environment
.
command
=
`cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
;
environment
.
command
=
`cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
;
}
}
environment
.
runnerWorkingFolder
=
`
${
environmentRoot
}
/envs/
${
environment
.
id
}
`
;
environment
.
runnerWorkingFolder
=
`
${
environmentRoot
}
/envs/
${
environment
.
id
}
`
;
environment
.
trackingUrl
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterC
onfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterC
onfig
.
user
N
ame
}
&jobName=
${
environment
.
envId
}
`
;
environment
.
trackingUrl
=
`
${
this
.
c
onfig
.
host
}
/job-detail.html?username=
${
this
.
c
onfig
.
user
n
ame
}
&jobName=
${
environment
.
envId
}
`
;
environment
.
useActiveGpu
=
this
.
paiClusterConfig
.
useActiveGpu
;
environment
.
useActiveGpu
=
false
;
// does openpai supports these?
environment
.
maxTrialNumberPerGpu
=
this
.
paiClusterConfig
.
maxTrialNumPerGpu
;
environment
.
maxTrialNumberPerGpu
=
1
;
// Step 2. Generate Job Configuration in yaml format
// Step 2. Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
environment
);
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
environment
);
...
@@ -198,7 +155,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -198,7 +155,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
// Step 3. Submit PAI job via Rest call
// Step 3. Submit PAI job via Rest call
const
submitJobRequest
:
request
.
Options
=
{
const
submitJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterC
onfig
.
host
}
/rest-server/api/v2/jobs`
,
uri
:
`
${
this
.
c
onfig
.
host
}
/rest-server/api/v2/jobs`
,
method
:
'
POST
'
,
method
:
'
POST
'
,
body
:
paiJobConfig
,
body
:
paiJobConfig
,
followAllRedirects
:
true
,
followAllRedirects
:
true
,
...
@@ -229,15 +186,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -229,15 +186,12 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if
(
environment
.
isAlive
===
false
)
{
if
(
environment
.
isAlive
===
false
)
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
if
(
this
.
paiClusterConfig
===
undefined
)
{
return
Promise
.
reject
(
new
Error
(
'
PAI Cluster config is not initialized
'
));
}
if
(
this
.
paiToken
===
undefined
)
{
if
(
this
.
paiToken
===
undefined
)
{
return
Promise
.
reject
(
Error
(
'
PAI token is not initialized
'
));
return
Promise
.
reject
(
Error
(
'
PAI token is not initialized
'
));
}
}
const
stopJobRequest
:
request
.
Options
=
{
const
stopJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterC
onfig
.
host
}
/rest-server/api/v2/jobs/
${
this
.
paiClusterC
onfig
.
user
N
ame
}
~
${
environment
.
envId
}
/executionType`
,
uri
:
`
${
this
.
c
onfig
.
host
}
/rest-server/api/v2/jobs/
${
this
.
c
onfig
.
user
n
ame
}
~
${
environment
.
envId
}
/executionType`
,
method
:
'
PUT
'
,
method
:
'
PUT
'
,
json
:
true
,
json
:
true
,
body
:
{
value
:
'
STOP
'
},
body
:
{
value
:
'
STOP
'
},
...
@@ -278,14 +232,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -278,14 +232,11 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
private
generateJobConfigInYamlFormat
(
environment
:
EnvironmentInformation
):
any
{
private
generateJobConfigInYamlFormat
(
environment
:
EnvironmentInformation
):
any
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
jobName
=
environment
.
envId
;
const
jobName
=
environment
.
envId
;
let
nniJobConfig
:
any
=
undefined
;
let
nniJobConfig
:
any
=
undefined
;
if
(
this
.
paiTrialC
onfig
.
paiConfig
Path
)
{
if
(
this
.
c
onfig
.
open
paiConfig
!==
undefined
)
{
nniJobConfig
=
JSON
.
parse
(
JSON
.
stringify
(
this
.
pai
Job
Config
));
//Trick for deep clone in Typescript
nniJobConfig
=
JSON
.
parse
(
JSON
.
stringify
(
this
.
config
.
open
paiConfig
));
//Trick for deep clone in Typescript
nniJobConfig
.
name
=
jobName
;
nniJobConfig
.
name
=
jobName
;
if
(
nniJobConfig
.
taskRoles
)
{
if
(
nniJobConfig
.
taskRoles
)
{
...
@@ -313,19 +264,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -313,19 +264,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
}
else
{
}
else
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
gpuNum
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster gpuNum is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
cpuNum
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster cpuNum is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
memoryMB
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster memoryMB is not initialized
'
);
}
nniJobConfig
=
{
nniJobConfig
=
{
protocolVersion
:
2
,
protocolVersion
:
2
,
name
:
jobName
,
name
:
jobName
,
...
@@ -334,7 +272,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -334,7 +272,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
prerequisites
:
[
prerequisites
:
[
{
{
type
:
'
dockerimage
'
,
type
:
'
dockerimage
'
,
uri
:
this
.
paiTrialConfig
.
i
mage
,
uri
:
this
.
config
.
dockerI
mage
,
name
:
'
docker_image_0
'
name
:
'
docker_image_0
'
}
}
],
],
...
@@ -348,9 +286,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -348,9 +286,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
taskRetryCount
:
0
,
taskRetryCount
:
0
,
dockerImage
:
'
docker_image_0
'
,
dockerImage
:
'
docker_image_0
'
,
resourcePerInstance
:
{
resourcePerInstance
:
{
gpu
:
this
.
paiClusterConfig
.
g
puNum
,
gpu
:
this
.
config
.
trialG
puNum
ber
,
cpu
:
this
.
paiClusterConfig
.
c
puNum
,
cpu
:
this
.
config
.
trialC
puNum
ber
,
memoryMB
:
t
his
.
paiClusterConfig
.
memoryMB
memoryMB
:
t
oMegaBytes
(
this
.
config
.
trialMemorySize
)
},
},
commands
:
[
commands
:
[
environment
.
command
environment
.
command
...
@@ -360,15 +298,15 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -360,15 +298,15 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
extras
:
{
extras
:
{
'
storages
'
:
[
'
storages
'
:
[
{
{
name
:
this
.
paiTrialC
onfig
.
paiS
torageConfigName
name
:
this
.
c
onfig
.
s
torageConfigName
}
}
],
],
submitFrom
:
'
submit-job-v2
'
submitFrom
:
'
submit-job-v2
'
}
}
}
}
if
(
this
.
paiTrialConfig
.
virtualCluster
)
{
if
(
this
.
config
.
deprecated
&&
this
.
config
.
deprecated
.
virtualCluster
)
{
nniJobConfig
.
defaults
=
{
nniJobConfig
.
defaults
=
{
virtualCluster
:
this
.
paiTrialConfig
.
virtualCluster
virtualCluster
:
this
.
config
.
deprecated
.
virtualCluster
}
}
}
}
}
}
...
...
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
817ec68b
...
@@ -9,44 +9,50 @@ import * as component from '../../../common/component';
...
@@ -9,44 +9,50 @@ import * as component from '../../../common/component';
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
import
{
getExperimentRootDir
,
getLogLevel
}
from
'
../../../common/utils
'
;
getExperimentRootDir
,
getLogLevel
import
{
ExperimentConfig
,
RemoteConfig
,
RemoteMachineConfig
,
flattenConfig
}
from
'
../../../common/experimentConfig
'
;
}
from
'
../../../common/utils
'
;
import
{
execMkdir
}
from
'
../../common/util
'
;
import
{
TrialConfig
}
from
'
../../common/trialConfig
'
;
import
{
ExecutorManager
}
from
'
../../remote_machine/remoteMachineData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../../common/util
'
;
import
{
ExecutorManager
,
RemoteMachineMeta
,
}
from
'
../../remote_machine/remoteMachineData
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
import
{
RemoteMachineEnvironmentInformation
}
from
'
../remote/remoteConfig
'
;
import
{
RemoteMachineEnvironmentInformation
}
from
'
../remote/remoteConfig
'
;
import
{
SharedStorageService
}
from
'
../sharedStorage
'
import
{
SharedStorageService
}
from
'
../sharedStorage
'
interface
FlattenRemoteConfig
extends
ExperimentConfig
,
RemoteConfig
{
}
@
component
.
Singleton
@
component
.
Singleton
export
class
RemoteEnvironmentService
extends
EnvironmentService
{
export
class
RemoteEnvironmentService
extends
EnvironmentService
{
private
readonly
initExecutorId
=
"
initConnection
"
;
private
readonly
initExecutorId
=
"
initConnection
"
;
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachine
Meta
,
ExecutorManager
>
;
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachine
Config
,
ExecutorManager
>
;
private
readonly
environmentExecutorManagerMap
:
Map
<
string
,
ExecutorManager
>
;
private
readonly
environmentExecutorManagerMap
:
Map
<
string
,
ExecutorManager
>
;
private
readonly
remoteMachineMetaOccupiedMap
:
Map
<
RemoteMachineMeta
,
boolean
>
;
private
readonly
remoteMachineMetaOccupiedMap
:
Map
<
RemoteMachineConfig
,
boolean
>
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
readonly
log
:
Logger
;
private
readonly
log
:
Logger
;
private
sshConnectionPromises
:
any
[];
private
sshConnectionPromises
:
any
[];
private
experimentRootDir
:
string
;
private
experimentRootDir
:
string
;
private
remoteExperimentRootDir
:
string
=
""
;
private
remoteExperimentRootDir
:
string
=
""
;
private
experimentId
:
string
;
private
experimentId
:
string
;
private
config
:
FlattenRemoteConfig
;
constructor
()
{
constructor
(
config
:
ExperimentConfig
)
{
super
();
super
();
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
environmentExecutorManagerMap
=
new
Map
<
string
,
ExecutorManager
>
();
this
.
environmentExecutorManagerMap
=
new
Map
<
string
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachine
Meta
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachine
Config
,
ExecutorManager
>
();
this
.
remoteMachineMetaOccupiedMap
=
new
Map
<
RemoteMachine
Meta
,
boolean
>
();
this
.
remoteMachineMetaOccupiedMap
=
new
Map
<
RemoteMachine
Config
,
boolean
>
();
this
.
sshConnectionPromises
=
[];
this
.
sshConnectionPromises
=
[];
this
.
experimentRootDir
=
getExperimentRootDir
();
this
.
experimentRootDir
=
getExperimentRootDir
();
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
config
=
flattenConfig
(
config
,
'
remote
'
);
// codeDir is not a valid directory, throw Error
if
(
!
fs
.
lstatSync
(
this
.
config
.
trialCodeDirectory
).
isDirectory
())
{
throw
new
Error
(
`codeDir
${
this
.
config
.
trialCodeDirectory
}
is not a directory`
);
}
this
.
sshConnectionPromises
=
this
.
config
.
machineList
.
map
(
machine
=>
this
.
initRemoteMachineOnConnected
(
machine
)
);
}
}
public
get
prefetchedEnvironmentCount
():
number
{
public
get
prefetchedEnvironmentCount
():
number
{
...
@@ -69,39 +75,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -69,39 +75,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
return
'
remote
'
;
return
'
remote
'
;
}
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
private
scheduleMachine
():
RemoteMachineConfig
|
undefined
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
remoteMachineTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
// codeDir is not a valid directory, throw Error
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
)
.
isDirectory
())
{
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
try
{
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
this
.
trialConfig
=
remoteMachineTrailConfig
;
break
;
}
default
:
this
.
log
.
debug
(
`Remote not support metadata key: '
${
key
}
', value: '
${
value
}
'`
);
}
}
private
scheduleMachine
():
RemoteMachineMeta
|
undefined
{
for
(
const
[
rmMeta
,
occupied
]
of
this
.
remoteMachineMetaOccupiedMap
)
{
for
(
const
[
rmMeta
,
occupied
]
of
this
.
remoteMachineMetaOccupiedMap
)
{
if
(
!
occupied
)
{
if
(
!
occupied
)
{
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
true
);
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
true
);
...
@@ -111,19 +85,9 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -111,19 +85,9 @@ export class RemoteEnvironmentService extends EnvironmentService {
return
undefined
;
return
undefined
;
}
}
private
async
setupConnections
(
machineList
:
string
):
Promise
<
void
>
{
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineConfig
):
Promise
<
void
>
{
this
.
log
.
debug
(
`Connecting to remote machines:
${
machineList
}
`
);
//TO DO: verify if value's format is wrong, and json parse failed, how to handle error
const
rmMetaList
:
RemoteMachineMeta
[]
=
<
RemoteMachineMeta
[]
>
JSON
.
parse
(
machineList
);
for
(
const
rmMeta
of
rmMetaList
)
{
this
.
sshConnectionPromises
.
push
(
await
this
.
initRemoteMachineOnConnected
(
rmMeta
));
}
}
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
):
Promise
<
void
>
{
const
executorManager
:
ExecutorManager
=
new
ExecutorManager
(
rmMeta
);
const
executorManager
:
ExecutorManager
=
new
ExecutorManager
(
rmMeta
);
this
.
log
.
info
(
`connecting to
${
rmMeta
.
user
name
}
@
${
rmMeta
.
ip
}
:
${
rmMeta
.
port
}
`
);
this
.
log
.
info
(
`connecting to
${
rmMeta
.
user
}
@
${
rmMeta
.
host
}
:
${
rmMeta
.
port
}
`
);
const
executor
:
ShellExecutor
=
await
executorManager
.
getExecutor
(
this
.
initExecutorId
);
const
executor
:
ShellExecutor
=
await
executorManager
.
getExecutor
(
this
.
initExecutorId
);
this
.
log
.
debug
(
`reached
${
executor
.
name
}
`
);
this
.
log
.
debug
(
`reached
${
executor
.
name
}
`
);
this
.
machineExecutorManagerMap
.
set
(
rmMeta
,
executorManager
);
this
.
machineExecutorManagerMap
.
set
(
rmMeta
,
executorManager
);
...
@@ -142,10 +106,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -142,10 +106,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
const
tasks
:
Promise
<
void
>
[]
=
[];
const
tasks
=
environments
.
map
(
environment
=>
this
.
refreshEnvironment
(
environment
));
environments
.
forEach
(
async
(
environment
)
=>
{
tasks
.
push
(
this
.
refreshEnvironment
(
environment
));
});
await
Promise
.
all
(
tasks
);
await
Promise
.
all
(
tasks
);
}
}
...
@@ -168,7 +129,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -168,7 +129,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
if
(
remoteEnvironment
.
rmMachineMeta
===
undefined
)
{
if
(
remoteEnvironment
.
rmMachineMeta
===
undefined
)
{
throw
new
Error
(
`
${
remoteEnvironment
.
id
}
machine meta not initialized!`
);
throw
new
Error
(
`
${
remoteEnvironment
.
id
}
machine meta not initialized!`
);
}
}
this
.
log
.
info
(
`pid in
${
remoteEnvironment
.
rmMachineMeta
.
ip
}
:
${
jobpidPath
}
is not alive!`
);
this
.
log
.
info
(
`pid in
${
remoteEnvironment
.
rmMachineMeta
.
host
}
:
${
jobpidPath
}
is not alive!`
);
if
(
fs
.
existsSync
(
runnerReturnCodeFilePath
))
{
if
(
fs
.
existsSync
(
runnerReturnCodeFilePath
))
{
const
runnerReturnCode
:
string
=
await
executor
.
getRemoteFileContent
(
runnerReturnCodeFilePath
);
const
runnerReturnCode
:
string
=
await
executor
.
getRemoteFileContent
(
runnerReturnCodeFilePath
);
const
match
:
RegExpMatchArray
|
null
=
runnerReturnCode
.
trim
()
const
match
:
RegExpMatchArray
|
null
=
runnerReturnCode
.
trim
()
...
@@ -248,9 +209,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -248,9 +209,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
this
.
log
.
info
(
'
ssh connection initialized!
'
);
this
.
log
.
info
(
'
ssh connection initialized!
'
);
// set sshConnectionPromises to [] to avoid log information duplicated
// set sshConnectionPromises to [] to avoid log information duplicated
this
.
sshConnectionPromises
=
[];
this
.
sshConnectionPromises
=
[];
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
"
trial config not initialized!
"
);
}
Array
.
from
(
this
.
machineExecutorManagerMap
.
keys
()).
forEach
(
rmMeta
=>
{
Array
.
from
(
this
.
machineExecutorManagerMap
.
keys
()).
forEach
(
rmMeta
=>
{
// initialize remoteMachineMetaOccupiedMap, false means not occupied
// initialize remoteMachineMetaOccupiedMap, false means not occupied
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
false
);
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
false
);
...
@@ -265,12 +223,8 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -265,12 +223,8 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
}
private
async
prepareEnvironment
(
environment
:
RemoteMachineEnvironmentInformation
):
Promise
<
boolean
>
{
private
async
prepareEnvironment
(
environment
:
RemoteMachineEnvironmentInformation
):
Promise
<
boolean
>
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
// get an executor from scheduler
// get an executor from scheduler
const
rmMachineMeta
:
RemoteMachine
Meta
|
undefined
=
this
.
scheduleMachine
();
const
rmMachineMeta
:
RemoteMachine
Config
|
undefined
=
this
.
scheduleMachine
();
if
(
rmMachineMeta
===
undefined
)
{
if
(
rmMachineMeta
===
undefined
)
{
this
.
log
.
warning
(
`No available machine!`
);
this
.
log
.
warning
(
`No available machine!`
);
return
Promise
.
resolve
(
false
);
return
Promise
.
resolve
(
false
);
...
@@ -299,9 +253,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -299,9 +253,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
}
private
async
launchRunner
(
environment
:
RemoteMachineEnvironmentInformation
):
Promise
<
void
>
{
private
async
launchRunner
(
environment
:
RemoteMachineEnvironmentInformation
):
Promise
<
void
>
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
const
environmentLocalTempFolder
:
string
=
const
environmentLocalTempFolder
:
string
=
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
)
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
)
...
@@ -317,7 +268,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -317,7 +268,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
if
(
environment
.
rmMachineMeta
===
undefined
)
{
if
(
environment
.
rmMachineMeta
===
undefined
)
{
throw
new
Error
(
`
${
environment
.
id
}
rmMachineMeta not initialized!`
);
throw
new
Error
(
`
${
environment
.
id
}
rmMachineMeta not initialized!`
);
}
}
environment
.
trackingUrl
=
`file://
${
environment
.
rmMachineMeta
.
ip
}
:
${
environment
.
runnerWorkingFolder
}
`
;
environment
.
trackingUrl
=
`file://
${
environment
.
rmMachineMeta
.
host
}
:
${
environment
.
runnerWorkingFolder
}
`
;
}
}
private
async
getExecutor
(
environmentId
:
string
):
Promise
<
ShellExecutor
>
{
private
async
getExecutor
(
environmentId
:
string
):
Promise
<
ShellExecutor
>
{
...
@@ -330,7 +281,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -330,7 +281,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
public
async
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
public
async
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
environment
.
isAlive
===
false
)
{
if
(
environment
.
isAlive
===
false
)
{
return
Promise
.
resolve
()
;
return
;
}
}
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
...
@@ -338,7 +289,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
...
@@ -338,7 +289,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
if
(
environment
.
status
===
'
UNKNOWN
'
)
{
if
(
environment
.
status
===
'
UNKNOWN
'
)
{
environment
.
status
=
'
USER_CANCELED
'
;
environment
.
status
=
'
USER_CANCELED
'
;
await
this
.
releaseEnvironmentResource
(
environment
);
await
this
.
releaseEnvironmentResource
(
environment
);
return
return
;
}
}
const
jobpidPath
:
string
=
`
${
environment
.
runnerWorkingFolder
}
/pid`
;
const
jobpidPath
:
string
=
`
${
environment
.
runnerWorkingFolder
}
/pid`
;
...
...
ts/nni_manager/training_service/reusable/remote/remoteConfig.ts
View file @
817ec68b
...
@@ -2,23 +2,11 @@
...
@@ -2,23 +2,11 @@
// Licensed under the MIT license.
// Licensed under the MIT license.
import
{
EnvironmentInformation
}
from
'
../environment
'
;
import
{
EnvironmentInformation
}
from
'
../environment
'
;
import
{
RemoteMachine
Meta
}
from
'
../../
remote_machine/remoteMachineData
'
;
import
{
RemoteMachine
Config
}
from
'
../../
../
common/experimentConfig
'
;
/**
/**
* RemoteMachineEnvironmentInformation
* RemoteMachineEnvironmentInformation
*/
*/
export
class
RemoteMachineEnvironmentInformation
extends
EnvironmentInformation
{
export
class
RemoteMachineEnvironmentInformation
extends
EnvironmentInformation
{
public
rmMachineMeta
?:
RemoteMachineMeta
;
public
rmMachineMeta
?:
RemoteMachineConfig
;
}
export
class
RemoteConfig
{
public
readonly
reuse
:
boolean
;
/**
* Constructor
* @param reuse If job is reusable for multiple trials
*/
constructor
(
reuse
:
boolean
)
{
this
.
reuse
=
reuse
;
}
}
}
ts/nni_manager/training_service/reusable/routerTrainingService.ts
View file @
817ec68b
...
@@ -3,21 +3,15 @@
...
@@ -3,21 +3,15 @@
'
use strict
'
;
'
use strict
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
ExperimentConfig
,
RemoteConfig
,
OpenpaiConfig
}
from
'
../../common/experimentConfig
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
PAIClusterConfig
}
from
'
../pai/paiConfig
'
;
import
{
PAITrainingService
}
from
'
../pai/paiTrainingService
'
;
import
{
PAITrainingService
}
from
'
../pai/paiTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
../remote_machine/remoteMachineTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
../remote_machine/remoteMachineTrainingService
'
;
import
{
MountedStorageService
}
from
'
./storages/mountedStorageService
'
;
import
{
StorageService
}
from
'
./storageService
'
;
import
{
TrialDispatcher
}
from
'
./trialDispatcher
'
;
import
{
TrialDispatcher
}
from
'
./trialDispatcher
'
;
import
{
RemoteConfig
}
from
'
./remote/remoteConfig
'
;
import
{
HeterogenousConfig
}
from
'
./heterogenous/heterogenousConfig
'
;
/**
/**
...
@@ -26,11 +20,19 @@ import { HeterogenousConfig } from './heterogenous/heterogenousConfig';
...
@@ -26,11 +20,19 @@ import { HeterogenousConfig } from './heterogenous/heterogenousConfig';
*/
*/
@
component
.
Singleton
@
component
.
Singleton
class
RouterTrainingService
implements
TrainingService
{
class
RouterTrainingService
implements
TrainingService
{
protected
readonly
log
!
:
Logger
;
protected
readonly
log
:
Logger
;
private
internalTrainingService
:
TrainingService
|
undefined
;
private
internalTrainingService
:
TrainingService
;
constructor
()
{
constructor
(
config
:
ExperimentConfig
)
{
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
const
platform
=
Array
.
isArray
(
config
.
trainingService
)
?
'
hybrid
'
:
config
.
trainingService
.
platform
;
if
(
platform
===
'
remote
'
&&
!
(
<
RemoteConfig
>
config
.
trainingService
).
reuseMode
)
{
this
.
internalTrainingService
=
new
RemoteMachineTrainingService
(
config
);
}
else
if
(
platform
===
'
openpai
'
&&
!
(
<
OpenpaiConfig
>
config
.
trainingService
).
reuseMode
)
{
this
.
internalTrainingService
=
new
PAITrainingService
(
config
);
}
else
{
this
.
internalTrainingService
=
new
TrialDispatcher
(
config
);
}
}
}
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
...
@@ -79,13 +81,6 @@ class RouterTrainingService implements TrainingService {
...
@@ -79,13 +81,6 @@ class RouterTrainingService implements TrainingService {
return
await
this
.
internalTrainingService
.
updateTrialJob
(
trialJobId
,
form
);
return
await
this
.
internalTrainingService
.
updateTrialJob
(
trialJobId
,
form
);
}
}
public
get
isMultiPhaseJobSupported
():
boolean
{
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
TrainingService is not assigned!
"
);
}
return
this
.
internalTrainingService
.
isMultiPhaseJobSupported
;
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
?:
boolean
|
undefined
):
Promise
<
void
>
{
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
?:
boolean
|
undefined
):
Promise
<
void
>
{
if
(
this
.
internalTrainingService
===
undefined
)
{
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
TrainingService is not assigned!
"
);
throw
new
Error
(
"
TrainingService is not assigned!
"
);
...
@@ -93,80 +88,8 @@ class RouterTrainingService implements TrainingService {
...
@@ -93,80 +88,8 @@ class RouterTrainingService implements TrainingService {
await
this
.
internalTrainingService
.
cancelTrialJob
(
trialJobId
,
isEarlyStopped
);
await
this
.
internalTrainingService
.
cancelTrialJob
(
trialJobId
,
isEarlyStopped
);
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
_key
:
string
,
_value
:
string
):
Promise
<
void
>
{
return
;
}
if
(
this
.
internalTrainingService
===
undefined
)
{
public
async
getClusterMetadata
(
_key
:
string
):
Promise
<
string
>
{
return
''
;
}
// Need to refactor configuration, remove hybrid_config field in the future
if
(
key
===
TrialConfigMetadataKey
.
HYBRID_CONFIG
){
this
.
internalTrainingService
=
component
.
get
(
TrialDispatcher
);
const
heterogenousConfig
:
HeterogenousConfig
=
<
HeterogenousConfig
>
JSON
.
parse
(
value
);
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
internalTrainingService not initialized!
"
);
}
// Initialize storageService for pai, only support singleton for now, need refactor
if
(
heterogenousConfig
.
trainingServicePlatforms
.
includes
(
'
pai
'
))
{
Container
.
bind
(
StorageService
)
.
to
(
MountedStorageService
)
.
scope
(
Scope
.
Singleton
);
}
await
this
.
internalTrainingService
.
setClusterMetadata
(
'
platform_list
'
,
heterogenousConfig
.
trainingServicePlatforms
.
join
(
'
,
'
));
}
else
if
(
key
===
TrialConfigMetadataKey
.
LOCAL_CONFIG
)
{
this
.
internalTrainingService
=
component
.
get
(
TrialDispatcher
);
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
internalTrainingService not initialized!
"
);
}
await
this
.
internalTrainingService
.
setClusterMetadata
(
'
platform_list
'
,
'
local
'
);
}
else
if
(
key
===
TrialConfigMetadataKey
.
PAI_CLUSTER_CONFIG
)
{
const
config
=
<
PAIClusterConfig
>
JSON
.
parse
(
value
);
if
(
config
.
reuse
===
true
)
{
this
.
log
.
info
(
`reuse flag enabled, use EnvironmentManager.`
);
this
.
internalTrainingService
=
component
.
get
(
TrialDispatcher
);
// TODO to support other storages later.
Container
.
bind
(
StorageService
)
.
to
(
MountedStorageService
)
.
scope
(
Scope
.
Singleton
);
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
internalTrainingService not initialized!
"
);
}
await
this
.
internalTrainingService
.
setClusterMetadata
(
'
platform_list
'
,
'
pai
'
);
}
else
{
this
.
log
.
debug
(
`caching metadata key:{} value:{}, as training service is not determined.`
);
this
.
internalTrainingService
=
component
.
get
(
PAITrainingService
);
}
}
else
if
(
key
===
TrialConfigMetadataKey
.
AML_CLUSTER_CONFIG
)
{
this
.
internalTrainingService
=
component
.
get
(
TrialDispatcher
);
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
internalTrainingService not initialized!
"
);
}
await
this
.
internalTrainingService
.
setClusterMetadata
(
'
platform_list
'
,
'
aml
'
);
}
else
if
(
key
===
TrialConfigMetadataKey
.
REMOTE_CONFIG
)
{
const
config
=
<
RemoteConfig
>
JSON
.
parse
(
value
);
if
(
config
.
reuse
===
true
)
{
this
.
log
.
info
(
`reuse flag enabled, use EnvironmentManager.`
);
this
.
internalTrainingService
=
component
.
get
(
TrialDispatcher
);
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
internalTrainingService not initialized!
"
);
}
await
this
.
internalTrainingService
.
setClusterMetadata
(
'
platform_list
'
,
'
remote
'
);
}
else
{
this
.
log
.
debug
(
`caching metadata key:{} value:{}, as training service is not determined.`
);
this
.
internalTrainingService
=
component
.
get
(
RemoteMachineTrainingService
);
}
}
}
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
internalTrainingService not initialized!
"
);
}
await
this
.
internalTrainingService
.
setClusterMetadata
(
key
,
value
);
}
public
async
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
if
(
this
.
internalTrainingService
===
undefined
)
{
throw
new
Error
(
"
TrainingService is not assigned!
"
);
}
return
await
this
.
internalTrainingService
.
getClusterMetadata
(
key
);
}
public
async
cleanUp
():
Promise
<
void
>
{
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
internalTrainingService
===
undefined
)
{
if
(
this
.
internalTrainingService
===
undefined
)
{
...
...
ts/nni_manager/training_service/reusable/sharedStorage.ts
View file @
817ec68b
...
@@ -3,19 +3,14 @@
...
@@ -3,19 +3,14 @@
'
use strict
'
;
'
use strict
'
;
import
{
SharedStorageConfig
}
from
'
../../common/experimentConfig
'
;
import
{
StorageService
}
from
'
./storageService
'
import
{
StorageService
}
from
'
./storageService
'
export
type
SharedStorageType
=
'
NFS
'
|
'
AzureBlob
'
export
type
SharedStorageType
=
'
NFS
'
|
'
AzureBlob
'
export
type
LocalMountedType
=
'
usermount
'
|
'
nnimount
'
|
'
nomount
'
export
type
LocalMountedType
=
'
usermount
'
|
'
nnimount
'
|
'
nomount
'
export
interface
SharedStorageConfig
{
readonly
storageType
:
SharedStorageType
;
readonly
localMountPoint
?:
string
;
readonly
remoteMountPoint
:
string
;
}
export
abstract
class
SharedStorageService
{
export
abstract
class
SharedStorageService
{
public
abstract
config
(
key
:
string
,
value
:
strin
g
):
Promise
<
void
>
;
public
abstract
config
(
config
:
SharedStorageConfi
g
):
Promise
<
void
>
;
public
abstract
get
canLocalMounted
():
boolean
;
public
abstract
get
canLocalMounted
():
boolean
;
public
abstract
get
storageService
():
StorageService
;
public
abstract
get
storageService
():
StorageService
;
public
abstract
get
localMountCommand
():
string
;
public
abstract
get
localMountCommand
():
string
;
...
...
ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
View file @
817ec68b
...
@@ -6,11 +6,11 @@
...
@@ -6,11 +6,11 @@
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
SharedStorageService
,
SharedStorage
Config
,
SharedStorageType
,
LocalMounted
Type
}
from
'
../sharedStorage
'
import
{
SharedStorageService
,
SharedStorageType
}
from
'
../sharedStorage
'
import
{
MountedStorageService
}
from
'
../storages/mountedStorageService
'
;
import
{
MountedStorageService
}
from
'
../storages/mountedStorageService
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
AzureBlobConfig
}
from
'
../../../common/experimentConfig
'
;
const
INSTALL_BLOBFUSE
=
`
const
INSTALL_BLOBFUSE
=
`
#!/bin/bash
#!/bin/bash
...
@@ -50,31 +50,6 @@ else
...
@@ -50,31 +50,6 @@ else
fi
fi
`
`
class
AzureBlobSharedStorageConfig
implements
SharedStorageConfig
{
public
storageType
:
SharedStorageType
;
public
localMountPoint
?:
string
;
public
remoteMountPoint
:
string
;
public
resourceGroupName
?:
string
;
public
storageAccountName
:
string
;
public
storageAccountKey
?:
string
;
public
containerName
:
string
;
public
localMounted
:
LocalMountedType
;
constructor
(
storageType
:
SharedStorageType
,
remoteMountPoint
:
string
,
storageAccountName
:
string
,
containerName
:
string
,
localMounted
:
LocalMountedType
,
localMountPoint
?:
string
,
resourceGroupName
?:
string
,
storageAccountKey
?:
string
)
{
this
.
storageType
=
storageType
;
this
.
localMountPoint
=
localMountPoint
;
this
.
remoteMountPoint
=
remoteMountPoint
;
this
.
resourceGroupName
=
resourceGroupName
;
this
.
storageAccountName
=
storageAccountName
;
this
.
storageAccountKey
=
storageAccountKey
;
this
.
containerName
=
containerName
;
this
.
localMounted
=
localMounted
;
}
}
export
class
AzureBlobSharedStorageService
extends
SharedStorageService
{
export
class
AzureBlobSharedStorageService
extends
SharedStorageService
{
private
log
:
Logger
;
private
log
:
Logger
;
private
internalStorageService
:
MountedStorageService
;
private
internalStorageService
:
MountedStorageService
;
...
@@ -96,13 +71,11 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
...
@@ -96,13 +71,11 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
}
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
config
(
azureblobConfig
:
AzureBlobConfig
):
Promise
<
void
>
{
if
(
key
===
TrialConfigMetadataKey
.
SHARED_STORAGE_CONFIG
)
{
const
azureblobConfig
=
<
AzureBlobSharedStorageConfig
>
JSON
.
parse
(
value
);
this
.
localMountPoint
=
azureblobConfig
.
localMountPoint
;
this
.
localMountPoint
=
azureblobConfig
.
localMountPoint
;
this
.
remoteMountPoint
=
azureblobConfig
.
remoteMountPoint
;
this
.
remoteMountPoint
=
azureblobConfig
.
remoteMountPoint
;
this
.
storageType
=
azureblobConfig
.
storageType
;
this
.
storageType
=
azureblobConfig
.
storageType
as
SharedStorageType
;
this
.
storageAccountName
=
azureblobConfig
.
storageAccountName
;
this
.
storageAccountName
=
azureblobConfig
.
storageAccountName
;
this
.
containerName
=
azureblobConfig
.
containerName
;
this
.
containerName
=
azureblobConfig
.
containerName
;
if
(
azureblobConfig
.
storageAccountKey
!==
undefined
)
{
if
(
azureblobConfig
.
storageAccountKey
!==
undefined
)
{
...
@@ -127,7 +100,6 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
...
@@ -127,7 +100,6 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
this
.
internalStorageService
.
initialize
(
this
.
localMountPoint
,
path
.
join
(
this
.
localMountPoint
,
'
nni
'
,
this
.
experimentId
));
this
.
internalStorageService
.
initialize
(
this
.
localMountPoint
,
path
.
join
(
this
.
localMountPoint
,
'
nni
'
,
this
.
experimentId
));
}
}
}
}
}
public
get
canLocalMounted
():
boolean
{
public
get
canLocalMounted
():
boolean
{
return
true
;
return
true
;
...
...
ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts
View file @
817ec68b
...
@@ -6,11 +6,11 @@
...
@@ -6,11 +6,11 @@
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
SharedStorageService
,
SharedStorage
Config
,
SharedStorageType
,
LocalMounted
Type
}
from
'
../sharedStorage
'
import
{
SharedStorageService
,
SharedStorageType
}
from
'
../sharedStorage
'
import
{
MountedStorageService
}
from
'
../storages/mountedStorageService
'
;
import
{
MountedStorageService
}
from
'
../storages/mountedStorageService
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
NfsConfig
}
from
'
../../../common/experimentConfig
'
;
const
INSTALL_NFS_CLIENT
=
`
const
INSTALL_NFS_CLIENT
=
`
#!/bin/bash
#!/bin/bash
...
@@ -35,26 +35,6 @@ else
...
@@ -35,26 +35,6 @@ else
fi
fi
`
`
class
NFSSharedStorageConfig
implements
SharedStorageConfig
{
public
storageType
:
SharedStorageType
;
public
localMountPoint
:
string
;
public
remoteMountPoint
:
string
;
public
nfsServer
:
string
;
public
exportedDirectory
:
string
;
public
localMounted
:
LocalMountedType
;
constructor
(
storageType
:
SharedStorageType
,
localMountPoint
:
string
,
remoteMountPoint
:
string
,
nfsServer
:
string
,
exportedDirectory
:
string
,
localMounted
:
LocalMountedType
)
{
this
.
storageType
=
storageType
;
this
.
localMountPoint
=
localMountPoint
;
this
.
remoteMountPoint
=
remoteMountPoint
;
this
.
nfsServer
=
nfsServer
;
this
.
exportedDirectory
=
exportedDirectory
;
this
.
localMounted
=
localMounted
;
}
}
export
class
NFSSharedStorageService
extends
SharedStorageService
{
export
class
NFSSharedStorageService
extends
SharedStorageService
{
private
log
:
Logger
;
private
log
:
Logger
;
private
internalStorageService
:
MountedStorageService
;
private
internalStorageService
:
MountedStorageService
;
...
@@ -75,9 +55,7 @@ export class NFSSharedStorageService extends SharedStorageService {
...
@@ -75,9 +55,7 @@ export class NFSSharedStorageService extends SharedStorageService {
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
}
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
config
(
nfsConfig
:
NfsConfig
):
Promise
<
void
>
{
if
(
key
===
TrialConfigMetadataKey
.
SHARED_STORAGE_CONFIG
)
{
const
nfsConfig
=
<
NFSSharedStorageConfig
>
JSON
.
parse
(
value
);
this
.
localMountPoint
=
nfsConfig
.
localMountPoint
;
this
.
localMountPoint
=
nfsConfig
.
localMountPoint
;
this
.
remoteMountPoint
=
nfsConfig
.
remoteMountPoint
;
this
.
remoteMountPoint
=
nfsConfig
.
remoteMountPoint
;
...
@@ -94,7 +72,6 @@ export class NFSSharedStorageService extends SharedStorageService {
...
@@ -94,7 +72,6 @@ export class NFSSharedStorageService extends SharedStorageService {
}
}
this
.
internalStorageService
.
initialize
(
this
.
localMountPoint
,
path
.
join
(
this
.
localMountPoint
,
'
nni
'
,
this
.
experimentId
));
this
.
internalStorageService
.
initialize
(
this
.
localMountPoint
,
path
.
join
(
this
.
localMountPoint
,
'
nni
'
,
this
.
experimentId
));
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
...
...
ts/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
View file @
817ec68b
...
@@ -169,6 +169,18 @@ async function waitEnvironment(waitCount: number,
...
@@ -169,6 +169,18 @@ async function waitEnvironment(waitCount: number,
return
waitRequestEnvironment
;
return
waitRequestEnvironment
;
}
}
const
config
=
{
searchSpace
:
{
},
trialCommand
:
'
echo hi
'
,
trialCodeDirectory
:
path
.
dirname
(
__filename
),
trialConcurrency
:
0
,
nniManagerIp
:
'
127.0.0.1
'
,
trainingService
:
{
platform
:
'
local
'
},
debug
:
true
};
describe
(
'
Unit Test for TrialDispatcher
'
,
()
=>
{
describe
(
'
Unit Test for TrialDispatcher
'
,
()
=>
{
let
trialRunPromise
:
Promise
<
void
>
;
let
trialRunPromise
:
Promise
<
void
>
;
...
@@ -191,17 +203,8 @@ describe('Unit Test for TrialDispatcher', () => {
...
@@ -191,17 +203,8 @@ describe('Unit Test for TrialDispatcher', () => {
});
});
beforeEach
(
async
()
=>
{
beforeEach
(
async
()
=>
{
const
trialConfig
=
{
trialDispatcher
=
new
TrialDispatcher
(
config
);
codeDir
:
currentDir
,
command
:
"
echo
"
,
}
const
nniManagerIpConfig
=
{
nniManagerIp
:
"
127.0.0.1
"
,
}
trialDispatcher
=
new
TrialDispatcher
();
await
trialDispatcher
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
JSON
.
stringify
(
trialConfig
));
await
trialDispatcher
.
setClusterMetadata
(
TrialConfigMetadataKey
.
NNI_MANAGER_IP
,
JSON
.
stringify
(
nniManagerIpConfig
));
// set ut environment
// set ut environment
let
environmentServiceList
:
EnvironmentService
[]
=
[];
let
environmentServiceList
:
EnvironmentService
[]
=
[];
environmentService
=
new
UtEnvironmentService
();
environmentService
=
new
UtEnvironmentService
();
...
@@ -224,7 +227,6 @@ describe('Unit Test for TrialDispatcher', () => {
...
@@ -224,7 +227,6 @@ describe('Unit Test for TrialDispatcher', () => {
});
});
it
(
'
reuse env
'
,
async
()
=>
{
it
(
'
reuse env
'
,
async
()
=>
{
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
...
@@ -240,31 +242,31 @@ describe('Unit Test for TrialDispatcher', () => {
...
@@ -240,31 +242,31 @@ describe('Unit Test for TrialDispatcher', () => {
});
});
it
(
'
not reusable env
'
,
async
()
=>
{
it
(
'
not reusable env
'
,
async
()
=>
{
trialDispatcher
.
setClusterMetadata
(
//
trialDispatcher.setClusterMetadata(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
//
TrialConfigMetadataKey.TRIAL_CONFIG,
JSON
.
stringify
({
//
JSON.stringify({
reuseEnvironment
:
false
,
//
reuseEnvironment: false,
codeDir
:
currentDir
,
//
codeDir: currentDir,
}));
//
}));
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
//
let trialDetail = await newTrial(trialDispatcher);
let
environment
=
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
//
let environment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
//
await verifyTrialRunning(commandChannel, trialDetail);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
0
);
//
await verifyTrialResult(commandChannel, trialDetail, 0);
await
waitResultMust
<
true
>
(
async
()
=>
{
//
await waitResultMust<true>(async () => {
return
environment
.
status
===
'
USER_CANCELED
'
?
true
:
undefined
;
//
return environment.status === 'USER_CANCELED' ? true : undefined;
});
//
});
trialDetail
=
await
newTrial
(
trialDispatcher
);
//
trialDetail = await newTrial(trialDispatcher);
await
waitEnvironment
(
2
,
previousEnvironments
,
environmentService
,
commandChannel
);
//
await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
//
await verifyTrialRunning(commandChannel, trialDetail);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
-
1
);
//
await verifyTrialResult(commandChannel, trialDetail, -1);
chai
.
assert
.
equal
(
environmentService
.
testGetEnvironments
().
size
,
2
,
"
as env not reused, so only 2 envs should be here.
"
);
//
chai.assert.equal(environmentService.testGetEnvironments().size, 2, "as env not reused, so only 2 envs should be here.");
const
trials
=
await
trialDispatcher
.
listTrialJobs
();
//
const trials = await trialDispatcher.listTrialJobs();
chai
.
assert
.
equal
(
trials
.
length
,
2
,
"
there should be 2 trials
"
);
//
chai.assert.equal(trials.length, 2, "there should be 2 trials");
});
});
it
(
'
no more env
'
,
async
()
=>
{
it
(
'
no more env
'
,
async
()
=>
{
...
@@ -475,37 +477,37 @@ describe('Unit Test for TrialDispatcher', () => {
...
@@ -475,37 +477,37 @@ describe('Unit Test for TrialDispatcher', () => {
});
});
it
(
'
GPUScheduler disabled gpuNum === 0
'
,
async
()
=>
{
it
(
'
GPUScheduler disabled gpuNum === 0
'
,
async
()
=>
{
trialDispatcher
.
setClusterMetadata
(
//
trialDispatcher.setClusterMetadata(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
//
TrialConfigMetadataKey.TRIAL_CONFIG,
JSON
.
stringify
({
//
JSON.stringify({
reuseEnvironment
:
false
,
//
reuseEnvironment: false,
codeDir
:
currentDir
,
//
codeDir: currentDir,
gpuNum
:
0
,
//
gpuNum: 0,
}));
//
}));
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
//
let trialDetail = await newTrial(trialDispatcher);
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
//
await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
const
command
=
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
//
const command = await verifyTrialRunning(commandChannel, trialDetail);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
0
);
//
await verifyTrialResult(commandChannel, trialDetail, 0);
chai
.
assert
.
equal
(
command
.
data
[
"
gpuIndices
"
],
""
);
//
chai.assert.equal(command.data["gpuIndices"], "");
});
});
it
(
'
GPUScheduler enable no cluster gpu config
'
,
async
()
=>
{
it
(
'
GPUScheduler enable no cluster gpu config
'
,
async
()
=>
{
trialDispatcher
.
setClusterMetadata
(
//
trialDispatcher.setClusterMetadata(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
//
TrialConfigMetadataKey.TRIAL_CONFIG,
JSON
.
stringify
({
//
JSON.stringify({
reuseEnvironment
:
false
,
//
reuseEnvironment: false,
codeDir
:
currentDir
,
//
codeDir: currentDir,
gpuNum
:
1
,
//
gpuNum: 1,
}));
//
}));
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
//
let trialDetail = await newTrial(trialDispatcher);
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
//
await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
const
command
=
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
//
const command = await verifyTrialRunning(commandChannel, trialDetail);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
0
);
//
await verifyTrialResult(commandChannel, trialDetail, 0);
chai
.
assert
.
equal
(
command
.
data
[
"
gpuIndices
"
],
"
0
"
);
//
chai.assert.equal(command.data["gpuIndices"], "0");
});
});
it
(
'
GPUScheduler skipped no GPU info
'
,
async
()
=>
{
it
(
'
GPUScheduler skipped no GPU info
'
,
async
()
=>
{
...
...
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
817ec68b
This diff is collapsed.
Click to expand it.
ts/nni_manager/training_service/test/localTrainingService.test.ts
View file @
817ec68b
...
@@ -13,6 +13,7 @@ import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingSer
...
@@ -13,6 +13,7 @@ import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingSer
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
,
getExperimentRootDir
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
,
getExperimentRootDir
}
from
'
../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
LocalTrainingService
}
from
'
../local/localTrainingService
'
;
import
{
LocalTrainingService
}
from
'
../local/localTrainingService
'
;
import
{
ExperimentConfig
}
from
'
../../common/experimentConfig
'
;
// TODO: copy mockedTrail.py to local folder
// TODO: copy mockedTrail.py to local folder
const
localCodeDir
:
string
=
tmp
.
dirSync
().
name
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
const
localCodeDir
:
string
=
tmp
.
dirSync
().
name
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
...
@@ -20,9 +21,23 @@ const mockedTrialPath: string = './training_service/test/mockedTrial.py'
...
@@ -20,9 +21,23 @@ const mockedTrialPath: string = './training_service/test/mockedTrial.py'
fs
.
copyFileSync
(
mockedTrialPath
,
localCodeDir
+
'
/mockedTrial.py
'
)
fs
.
copyFileSync
(
mockedTrialPath
,
localCodeDir
+
'
/mockedTrial.py
'
)
describe
(
'
Unit Test for LocalTrainingService
'
,
()
=>
{
describe
(
'
Unit Test for LocalTrainingService
'
,
()
=>
{
let
trialConfig
:
any
=
`{"command":"sleep 1h && echo hello","codeDir":"
${
localCodeDir
}
","gpuNum":1}`
const
config
=
<
ExperimentConfig
>
{
trialCommand
:
'
sleep 1h && echo hello
'
,
trialCodeDirectory
:
`
${
localCodeDir
}
`
,
trialGpuNumber
:
1
,
trainingService
:
{
platform
:
'
local
'
}
};
let
localTrainingService
:
LocalTrainingService
;
const
config2
=
<
ExperimentConfig
>
{
trialCommand
:
'
python3 mockedTrial.py
'
,
trialCodeDirectory
:
`
${
localCodeDir
}
`
,
trialGpuNumber
:
0
,
trainingService
:
{
platform
:
'
local
'
}
};
before
(()
=>
{
before
(()
=>
{
chai
.
should
();
chai
.
should
();
...
@@ -34,29 +49,19 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -34,29 +49,19 @@ describe('Unit Test for LocalTrainingService', () => {
cleanupUnitTest
();
cleanupUnitTest
();
});
});
beforeEach
(
()
=>
{
it
(
'
List empty trial jobs
'
,
async
()
=>
{
localTrainingService
=
component
.
get
(
LocalTrainingService
);
const
localTrainingService
=
new
LocalTrainingService
(
config
);
localTrainingService
.
run
();
localTrainingService
.
run
();
});
afterEach
(()
=>
{
localTrainingService
.
cleanUp
();
});
it
(
'
List empty trial jobs
'
,
async
()
=>
{
//trial jobs should be empty, since there are no submitted jobs
//trial jobs should be empty, since there are no submitted jobs
chai
.
expect
(
await
localTrainingService
.
listTrialJobs
()).
to
.
be
.
empty
;
chai
.
expect
(
await
localTrainingService
.
listTrialJobs
()).
to
.
be
.
empty
;
});
it
(
'
setClusterMetadata and getClusterMetadata
'
,
async
()
=>
{
localTrainingService
.
cleanUp
();
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
localTrainingService
.
getClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
).
then
((
data
)
=>
{
chai
.
expect
(
data
).
to
.
be
.
equals
(
trialConfig
);
});
});
});
it
(
'
Submit job and Cancel job
'
,
async
()
=>
{
it
(
'
Submit job and Cancel job
'
,
async
()
=>
{
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
const
localTrainingService
=
new
LocalTrainingService
(
config
);
localTrainingService
.
run
();
// submit job
// submit job
const
form
:
TrialJobApplicationForm
=
{
const
form
:
TrialJobApplicationForm
=
{
...
@@ -70,10 +75,13 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -70,10 +75,13 @@ describe('Unit Test for LocalTrainingService', () => {
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
localTrainingService
.
cleanUp
();
}).
timeout
(
20000
);
}).
timeout
(
20000
);
it
(
'
Get trial log
'
,
async
()
=>
{
it
(
'
Get trial log
'
,
async
()
=>
{
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
const
localTrainingService
=
new
LocalTrainingService
(
config
);
localTrainingService
.
run
();
// submit job
// submit job
const
form
:
TrialJobApplicationForm
=
{
const
form
:
TrialJobApplicationForm
=
{
...
@@ -100,13 +108,14 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -100,13 +108,14 @@ describe('Unit Test for LocalTrainingService', () => {
fs
.
rmdirSync
(
path
.
join
(
rootDir
,
'
trials
'
))
fs
.
rmdirSync
(
path
.
join
(
rootDir
,
'
trials
'
))
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
localTrainingService
.
cleanUp
();
}).
timeout
(
20000
);
}).
timeout
(
20000
);
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
// set meta data
const
localTrainingService
=
new
LocalTrainingService
(
config2
);
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
localTrainingService
.
run
();
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
// set meta data
// submit job
// submit job
const
form
:
TrialJobApplicationForm
=
{
const
form
:
TrialJobApplicationForm
=
{
sequenceId
:
0
,
sequenceId
:
0
,
...
@@ -130,9 +139,6 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -130,9 +139,6 @@ describe('Unit Test for LocalTrainingService', () => {
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
localTrainingService
.
removeTrialJobMetricListener
(
listener1
);
localTrainingService
.
removeTrialJobMetricListener
(
listener1
);
localTrainingService
.
cleanUp
();
}).
timeout
(
20000
);
}).
timeout
(
20000
);
it
(
'
Test multiphaseSupported
'
,
()
=>
{
chai
.
expect
(
localTrainingService
.
isMultiPhaseJobSupported
).
to
.
be
.
equals
(
true
)
})
});
});
ts/webui/mock/all-types-metric.json
View file @
817ec68b
...
@@ -11,30 +11,28 @@
...
@@ -11,30 +11,28 @@
"logDir"
:
"/***/nni/experiments/Tkaxm2mb"
,
"logDir"
:
"/***/nni/experiments/Tkaxm2mb"
,
"nextSequenceId"
:
110
,
"nextSequenceId"
:
110
,
"params"
:
{
"params"
:
{
"authorName"
:
"default"
,
"experimentName"
:
"default"
,
"experimentName"
:
"default"
,
"trialConcurrency"
:
10
,
"trialConcurrency"
:
10
,
"maxExecDuration"
:
3600
,
"maxExperimentDuration"
:
"1h"
,
"maxTrialNum"
:
100
,
"maxTrialNumber"
:
100
,
"searchSpace"
:
"{
\"
intermediate1
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [
\"
normal
\"
,
\"
inf
\"
,
\"
neginf
\"
,
\"
nan
\"
,
\"
string
\"
,
\"
dict-empty
\"
,
\"
dict-normal
\"
,
\"
dict-nodefault
\"
,
\"
dict-defaultdict
\"
]},
\"
intermediate2
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [
\"
normal
\"
,
\"
inf
\"
,
\"
neginf
\"
,
\"
nan
\"
,
\"
string
\"
,
\"
dict-empty
\"
,
\"
dict-normal
\"
,
\"
dict-nodefault
\"
,
\"
dict-defaultdict
\"
]},
\"
intermediate3
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [
\"
normal
\"
,
\"
inf
\"
,
\"
neginf
\"
,
\"
nan
\"
,
\"
string
\"
,
\"
dict-empty
\"
,
\"
dict-normal
\"
,
\"
dict-nodefault
\"
,
\"
dict-defaultdict
\"
]},
\"
intermediate_count
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [0, 1, 2, 3]},
\"
final1
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [
\"
normal
\"
,
\"
inf
\"
,
\"
neginf
\"
,
\"
nan
\"
,
\"
string
\"
,
\"
dict-empty
\"
,
\"
dict-normal
\"
,
\"
dict-nodefault
\"
,
\"
dict-defaultdict
\"
]},
\"
final2
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [
\"
normal
\"
,
\"
inf
\"
,
\"
neginf
\"
,
\"
nan
\"
,
\"
string
\"
,
\"
dict-empty
\"
,
\"
dict-normal
\"
,
\"
dict-nodefault
\"
,
\"
dict-defaultdict
\"
]},
\"
final_count
\"
: {
\"
_type
\"
:
\"
choice
\"
,
\"
_value
\"
: [0, 1, 2]}}"
,
"searchSpace"
:
{
"trainingServicePlatform"
:
"local"
,
"intermediate1"
:
{
"_type"
:
"choice"
,
"_value"
:
[
"normal"
,
"inf"
,
"neginf"
,
"nan"
,
"string"
,
"dict-empty"
,
"dict-normal"
,
"dict-nodefault"
,
"dict-defaultdict"
]},
"tuner"
:
{
"intermediate2"
:
{
"_type"
:
"choice"
,
"_value"
:
[
"normal"
,
"inf"
,
"neginf"
,
"nan"
,
"string"
,
"dict-empty"
,
"dict-normal"
,
"dict-nodefault"
,
"dict-defaultdict"
]},
"codeDir"
:
"/***/nnidev/src/webui/tests/metrics-test/."
,
"intermediate3"
:
{
"_type"
:
"choice"
,
"_value"
:
[
"normal"
,
"inf"
,
"neginf"
,
"nan"
,
"string"
,
"dict-empty"
,
"dict-normal"
,
"dict-nodefault"
,
"dict-defaultdict"
]},
"classFileName"
:
"naive_random.py"
,
"intermediate_count"
:
{
"_type"
:
"choice"
,
"_value"
:
[
0
,
1
,
2
,
3
]},
"className"
:
"NaiveRandomTuner"
,
"final1"
:
{
"_type"
:
"choice"
,
"_value"
:
[
"normal"
,
"inf"
,
"neginf"
,
"nan"
,
"string"
,
"dict-empty"
,
"dict-normal"
,
"dict-nodefault"
,
"dict-defaultdict"
]},
"checkpointDir"
:
"/***/nni/experiments/Tkaxm2mb/checkpoint"
"final2"
:
{
"_type"
:
"choice"
,
"_value"
:
[
"normal"
,
"inf"
,
"neginf"
,
"nan"
,
"string"
,
"dict-empty"
,
"dict-normal"
,
"dict-nodefault"
,
"dict-defaultdict"
]},
"final_count"
:
{
"_type"
:
"choice"
,
"_value"
:
[
0
,
1
,
2
]}
},
"trainingService"
:
{
"platform"
:
"local"
},
},
"versionCheck"
:
true
,
"tuner"
:
{
"clusterMetaData"
:
[
"codeDirectory"
:
"/***/nnidev/src/webui/tests/metrics-test"
,
{
"className"
:
"naive_random.NaiveRandomTuner"
"key"
:
"codeDir"
,
"value"
:
"/***/nnidev/src/webui/tests/metrics-test/."
},
},
{
"trialCommand"
:
"python trial.py"
,
"key"
:
"command"
,
"codeDirectory"
:
"/***/nnidev/src/webui/tests/metrics-test"
"value"
:
"python trial.py"
}
]
},
},
"startTime"
:
1595901129833
,
"startTime"
:
1595901129833
,
"endTime"
:
1595901290657
"endTime"
:
1595901290657
...
...
ts/webui/package.json
View file @
817ec68b
...
@@ -119,5 +119,8 @@
...
@@ -119,5 +119,8 @@
"node-forge"
:
">=0.10.0"
,
"node-forge"
:
">=0.10.0"
,
"y18n"
:
">=5.0.5"
,
"y18n"
:
">=5.0.5"
,
"serialize-javascript"
:
">=5.0.1"
"serialize-javascript"
:
">=5.0.1"
},
"jest"
:
{
"verbose"
:
true
}
}
}
}
ts/webui/src/components/Overview.tsx
View file @
817ec68b
...
@@ -60,7 +60,6 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -60,7 +60,6 @@ class Overview extends React.Component<{}, OverviewState> {
const
bestTrials
=
this
.
findBestTrials
();
const
bestTrials
=
this
.
findBestTrials
();
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const
bestAccuracy
=
bestTrials
.
length
>
0
?
bestTrials
[
0
].
accuracy
!
:
NaN
;
const
bestAccuracy
=
bestTrials
.
length
>
0
?
bestTrials
[
0
].
accuracy
!
:
NaN
;
const
maxExecDuration
=
EXPERIMENT
.
profile
.
params
.
maxExecDuration
;
const
execDuration
=
EXPERIMENT
.
profile
.
execDuration
;
const
execDuration
=
EXPERIMENT
.
profile
.
execDuration
;
return
(
return
(
...
@@ -96,7 +95,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -96,7 +95,7 @@ class Overview extends React.Component<{}, OverviewState> {
</
TitleContext
.
Provider
>
</
TitleContext
.
Provider
>
<
ExpDurationContext
.
Provider
<
ExpDurationContext
.
Provider
value
=
{
{
value
=
{
{
maxExecDuration
,
maxExecDuration
:
EXPERIMENT
.
maxExperimentDurationSeconds
,
execDuration
,
execDuration
,
updateOverviewPage
,
updateOverviewPage
,
maxDurationUnit
,
maxDurationUnit
,
...
@@ -112,7 +111,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -112,7 +111,7 @@ class Overview extends React.Component<{}, OverviewState> {
</
TitleContext
.
Provider
>
</
TitleContext
.
Provider
>
<
ExpDurationContext
.
Provider
<
ExpDurationContext
.
Provider
value
=
{
{
value
=
{
{
maxExecDuration
,
maxExecDuration
:
EXPERIMENT
.
maxExperimentDurationSeconds
,
execDuration
,
execDuration
,
updateOverviewPage
,
updateOverviewPage
,
maxDurationUnit
,
maxDurationUnit
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment