Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
143c6615
Unverified
Commit
143c6615
authored
Jul 30, 2020
by
Chi Song
Committed by
GitHub
Jul 30, 2020
Browse files
Reusable environment support GPU scheduler, add test cases and refactoring. (#2627)
parent
8a20c348
Changes
30
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1449 additions
and
144 deletions
+1449
-144
src/nni_manager/common/utils.ts
src/nni_manager/common/utils.ts
+4
-3
src/nni_manager/package.json
src/nni_manager/package.json
+1
-0
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+5
-0
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+24
-0
src/nni_manager/training_service/common/trialConfig.ts
src/nni_manager/training_service/common/trialConfig.ts
+4
-0
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+13
-2
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+12
-14
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+1
-24
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+4
-5
src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
...r/training_service/reusable/channels/amlCommandChannel.ts
+3
-8
src/nni_manager/training_service/reusable/environment.ts
src/nni_manager/training_service/reusable/environment.ts
+96
-29
src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
...ng_service/reusable/environments/amlEnvironmentService.ts
+20
-28
src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
...ervice/reusable/environments/openPaiEnvironmentService.ts
+50
-24
src/nni_manager/training_service/reusable/gpuScheduler.ts
src/nni_manager/training_service/reusable/gpuScheduler.ts
+235
-0
src/nni_manager/training_service/reusable/storageService.ts
src/nni_manager/training_service/reusable/storageService.ts
+2
-2
src/nni_manager/training_service/reusable/storages/mountedStorageService.ts
...aining_service/reusable/storages/mountedStorageService.ts
+5
-5
src/nni_manager/training_service/reusable/test/mountedStorageService.test.ts
...ining_service/reusable/test/mountedStorageService.test.ts
+125
-0
src/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
...er/training_service/reusable/test/trialDispatcher.test.ts
+712
-0
src/nni_manager/training_service/reusable/test/utCommandChannel.ts
...anager/training_service/reusable/test/utCommandChannel.ts
+57
-0
src/nni_manager/training_service/reusable/test/utEnvironmentService.ts
...er/training_service/reusable/test/utEnvironmentService.ts
+76
-0
No files found.
src/nni_manager/common/utils.ts
View file @
143c6615
...
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
...
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
return
cachedipv4Address
;
return
cachedipv4Address
;
}
}
if
(
os
.
networkInterfaces
().
eth0
)
{
const
networkInterfaces
=
os
.
networkInterfaces
();
for
(
const
item
of
os
.
networkInterfaces
().
eth0
)
{
if
(
networkInterfaces
.
eth0
)
{
for
(
const
item
of
networkInterfaces
.
eth0
)
{
if
(
item
.
family
===
'
IPv4
'
)
{
if
(
item
.
family
===
'
IPv4
'
)
{
cachedipv4Address
=
item
.
address
;
cachedipv4Address
=
item
.
address
;
return
cachedipv4Address
;
return
cachedipv4Address
;
}
}
}
}
}
else
{
}
else
{
throw
Error
(
'
getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.
'
);
throw
Error
(
`
getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.
Please specify NNI manager IP in config.`
);
}
}
throw
Error
(
'
getIPV4Address() failed because no valid IPv4 address found.
'
)
throw
Error
(
'
getIPV4Address() failed because no valid IPv4 address found.
'
)
...
...
src/nni_manager/package.json
View file @
143c6615
...
@@ -39,6 +39,7 @@
...
@@ -39,6 +39,7 @@
"@types/express"
:
"^4.16.0"
,
"@types/express"
:
"^4.16.0"
,
"@types/glob"
:
"^7.1.1"
,
"@types/glob"
:
"^7.1.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-yaml"
:
"^3.12.5"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/node"
:
"10.12.18"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
"@types/request"
:
"^2.47.1"
,
...
...
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
143c6615
...
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
...
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token
:
joi
.
string
().
min
(
1
),
token
:
joi
.
string
().
min
(
1
),
host
:
joi
.
string
().
min
(
1
).
required
(),
host
:
joi
.
string
().
min
(
1
).
required
(),
reuse
:
joi
.
boolean
(),
reuse
:
joi
.
boolean
(),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
gpuNum
:
joi
.
number
().
min
(
1
),
maxTrialNumPerGpu
:
joi
.
number
(),
useActiveGpu
:
joi
.
boolean
(),
}),
}),
kubeflow_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
kubeflow_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
operator
:
joi
.
string
().
min
(
1
).
required
(),
operator
:
joi
.
string
().
min
(
1
).
required
(),
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
143c6615
...
@@ -3,6 +3,17 @@
...
@@ -3,6 +3,17 @@
'
use strict
'
;
'
use strict
'
;
export
enum
ScheduleResultType
{
// Schedule succeeded
SUCCEED
,
// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU
,
// Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
/**
/**
* GPU Infromation class
* GPU Infromation class
* Representing the dynamic and static information retrieved from Nvidia-smi
* Representing the dynamic and static information retrieved from Nvidia-smi
...
@@ -52,6 +63,19 @@ export class GPUSummary {
...
@@ -52,6 +63,19 @@ export class GPUSummary {
}
}
}
}
export
function
parseGpuIndices
(
gpuIndices
?:
string
):
Set
<
number
>
|
undefined
{
if
(
gpuIndices
!==
undefined
)
{
const
indices
:
number
[]
=
gpuIndices
.
split
(
'
,
'
)
.
map
((
x
:
string
)
=>
parseInt
(
x
,
10
));
if
(
indices
.
length
>
0
)
{
return
new
Set
(
indices
);
}
else
{
throw
new
Error
(
'
gpuIndices can not be empty if specified.
'
);
}
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
`
$env:METRIC_OUTPUT_DIR="{0}"
$env:METRIC_OUTPUT_DIR="{0}"
...
...
src/nni_manager/training_service/common/trialConfig.ts
View file @
143c6615
...
@@ -17,6 +17,10 @@ export class TrialConfig {
...
@@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
// Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
public
readonly
gpuNum
:
number
;
// this flag uses for UT now.
// in future, all environments should be reusable, and this can be configurable by user.
public
reuseEnvironment
:
boolean
|
undefined
=
true
;
/**
/**
* Constructor
* Constructor
* @param command Trail command
* @param command Trail command
...
...
src/nni_manager/training_service/pai/paiConfig.ts
View file @
143c6615
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
export
class
PAIClusterConfig
{
export
class
PAIClusterConfig
{
public
readonly
userName
:
string
;
public
readonly
userName
:
string
;
...
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
...
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
public
readonly
token
?:
string
;
public
readonly
token
?:
string
;
public
readonly
reuse
?:
boolean
;
public
readonly
reuse
?:
boolean
;
public
cpuNum
?:
number
;
public
memoryMB
?:
number
;
public
gpuNum
?:
number
;
public
useActiveGpu
?:
boolean
;
public
maxTrialNumPerGpu
?:
number
;
/**
/**
* Constructor
* Constructor
* @param userName User name of PAI Cluster
* @param userName User name of PAI Cluster
...
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
...
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
* @param reuse If job is reusable for multiple trials
*/
*/
constructor
(
userName
:
string
,
host
:
string
,
passWord
?:
string
,
token
?:
string
,
reuse
?:
boolean
)
{
constructor
(
userName
:
string
,
host
:
string
,
passWord
?:
string
,
token
?:
string
,
reuse
?:
boolean
,
cpuNum
?:
number
,
memoryMB
?:
number
,
gpuNum
?:
number
)
{
this
.
userName
=
userName
;
this
.
userName
=
userName
;
this
.
passWord
=
passWord
;
this
.
passWord
=
passWord
;
this
.
host
=
host
;
this
.
host
=
host
;
this
.
token
=
token
;
this
.
token
=
token
;
this
.
reuse
=
reuse
;
this
.
reuse
=
reuse
;
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
gpuNum
=
gpuNum
;
}
}
}
}
...
...
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
143c6615
...
@@ -6,10 +6,8 @@
...
@@ -6,10 +6,8 @@
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
parseGpuIndices
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
import
{
ExecutorManager
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
ExecutorManager
}
from
'
./remoteMachineData
'
;
type
SCHEDULE_POLICY_NAME
=
'
random
'
|
'
round-robin
'
;
type
SCHEDULE_POLICY_NAME
=
'
random
'
|
'
round-robin
'
;
...
@@ -39,7 +37,7 @@ export class GPUScheduler {
...
@@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
* @param requiredGPUNum required GPU number
*/
*/
public
scheduleMachine
(
requiredGPUNum
:
number
|
undefined
,
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
public
scheduleMachine
(
requiredGPUNum
:
number
|
undefined
,
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
if
(
requiredGPUNum
===
undefined
)
{
if
(
requiredGPUNum
===
undefined
)
{
requiredGPUNum
=
0
;
requiredGPUNum
=
0
;
}
}
assert
(
requiredGPUNum
>=
0
);
assert
(
requiredGPUNum
>=
0
);
...
@@ -48,7 +46,7 @@ export class GPUScheduler {
...
@@ -48,7 +46,7 @@ export class GPUScheduler {
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const
eligibleRM
:
RemoteMachineMeta
[]
=
allRMs
.
filter
((
rmMeta
:
RemoteMachineMeta
)
=>
const
eligibleRM
:
RemoteMachineMeta
[]
=
allRMs
.
filter
((
rmMeta
:
RemoteMachineMeta
)
=>
rmMeta
.
gpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
rmMeta
.
gpuSummary
.
gpuCount
>=
requiredGPUNum
));
rmMeta
.
gpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
rmMeta
.
gpuSummary
.
gpuCount
>=
requiredGPUNum
));
if
(
eligibleRM
.
length
===
0
)
{
if
(
eligibleRM
.
length
===
0
)
{
// If the required gpu number exceeds the upper limit of all machine's GPU number
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
// Return REQUIRE_EXCEED_TOTAL directly
...
@@ -75,8 +73,8 @@ export class GPUScheduler {
...
@@ -75,8 +73,8 @@ export class GPUScheduler {
this
.
log
.
warning
(
`Scheduler: trialJob id
${
trialJobDetail
.
id
}
, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `
);
this
.
log
.
warning
(
`Scheduler: trialJob id
${
trialJobDetail
.
id
}
, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `
);
return
{
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
scheduleInfo
:
undefined
scheduleInfo
:
undefined
};
};
}
}
...
@@ -159,7 +157,7 @@ export class GPUScheduler {
...
@@ -159,7 +157,7 @@ export class GPUScheduler {
const
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
const
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
const
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
const
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
availableGPUs
.
push
(
gpuInfo
);
availableGPUs
.
push
(
gpuInfo
);
}
}
}
else
{
}
else
{
...
@@ -200,7 +198,7 @@ export class GPUScheduler {
...
@@ -200,7 +198,7 @@ export class GPUScheduler {
}
}
private
allocateHost
(
requiredGPUNum
:
number
,
rmMeta
:
RemoteMachineMeta
,
private
allocateHost
(
requiredGPUNum
:
number
,
rmMeta
:
RemoteMachineMeta
,
gpuInfos
:
GPUInfo
[],
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
gpuInfos
:
GPUInfo
[],
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
...
@@ -222,10 +220,10 @@ export class GPUScheduler {
...
@@ -222,10 +220,10 @@ export class GPUScheduler {
scheduleInfo
:
{
scheduleInfo
:
{
rmMeta
:
rmMeta
,
rmMeta
:
rmMeta
,
cudaVisibleDevice
:
allocatedGPUs
cudaVisibleDevice
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
return
gpuInfo
.
index
;
})
})
.
join
(
'
,
'
)
.
join
(
'
,
'
)
}
}
};
};
}
}
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
143c6615
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
'
use strict
'
;
'
use strict
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
ShellExecutor
}
from
'
./shellExecutor
'
;
import
{
ShellExecutor
}
from
'
./shellExecutor
'
;
/**
/**
...
@@ -25,18 +25,6 @@ export class RemoteMachineMeta {
...
@@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public
readonly
useActiveGpu
?:
boolean
=
false
;
public
readonly
useActiveGpu
?:
boolean
=
false
;
}
}
export
function
parseGpuIndices
(
gpuIndices
?:
string
):
Set
<
number
>
|
undefined
{
if
(
gpuIndices
!==
undefined
)
{
const
indices
:
number
[]
=
gpuIndices
.
split
(
'
,
'
)
.
map
((
x
:
string
)
=>
parseInt
(
x
,
10
));
if
(
indices
.
length
>
0
)
{
return
new
Set
(
indices
);
}
else
{
throw
new
Error
(
'
gpuIndices can not be empty if specified.
'
);
}
}
}
/**
/**
* The execution result for command executed on remote machine
* The execution result for command executed on remote machine
*/
*/
...
@@ -168,14 +156,3 @@ export class ExecutorManager {
...
@@ -168,14 +156,3 @@ export class ExecutorManager {
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cudaVisibleDevice
:
string
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cudaVisibleDevice
:
string
};
export
enum
ScheduleResultType
{
// Schedule succeeded
SUCCEED
,
// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU
,
// Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
143c6615
...
@@ -7,6 +7,7 @@ import * as assert from 'assert';
...
@@ -7,6 +7,7 @@ import * as assert from 'assert';
import
{
EventEmitter
}
from
'
events
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
...
@@ -22,18 +23,16 @@ import {
...
@@ -22,18 +23,16 @@ import {
getVersion
,
uniqueString
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
import
{
RemoteMachineMeta
,
ExecutorManager
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
ScheduleResultType
,
ExecutorManager
}
from
'
./remoteMachineData
'
;
}
from
'
./remoteMachineData
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
/**
/**
* Training Service implementation for Remote Machine (Linux)
* Training Service implementation for Remote Machine (Linux)
...
...
src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
View file @
143c6615
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
'
use strict
'
;
'
use strict
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
delay
}
from
"
../../../common/utils
"
;
import
{
delay
}
from
"
../../../common/utils
"
;
import
{
AMLEnvironmentInformation
}
from
'
../aml/amlConfig
'
;
import
{
AMLEnvironmentInformation
}
from
'
../aml/amlConfig
'
;
import
{
CommandChannel
,
RunnerConnection
}
from
"
../commandChannel
"
;
import
{
CommandChannel
,
RunnerConnection
}
from
"
../commandChannel
"
;
...
@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
...
@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
export
class
AMLCommandChannel
extends
CommandChannel
{
export
class
AMLCommandChannel
extends
CommandChannel
{
private
stopping
:
boolean
=
false
;
private
stopping
:
boolean
=
false
;
private
sendQueues
:
[
EnvironmentInformation
,
string
][]
=
[];
private
sendQueues
:
[
EnvironmentInformation
,
string
][]
=
[];
private
readonly
NNI_METRICS_PATTERN
:
string
=
`NNISDK_MEb'(?<metrics>.*?)'`
;
public
constructor
(
commandEmitter
:
EventEmitter
)
{
super
(
commandEmitter
);
}
public
get
channelName
():
Channel
{
public
get
channelName
():
Channel
{
return
"
aml
"
;
return
"
aml
"
;
}
}
...
@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
...
@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
const
messages
=
command
[
'
trial_runner
'
];
const
messages
=
command
[
'
trial_runner
'
];
if
(
messages
)
{
if
(
messages
)
{
if
(
messages
instanceof
Object
&&
currentMessageIndex
<
messages
.
length
-
1
)
{
if
(
messages
instanceof
Object
&&
currentMessageIndex
<
messages
.
length
-
1
)
{
for
(
let
index
=
currentMessageIndex
+
1
;
index
<
messages
.
length
;
index
++
)
{
for
(
let
index
=
currentMessageIndex
+
1
;
index
<
messages
.
length
;
index
++
)
{
this
.
handleCommand
(
runnerConnection
.
environment
,
messages
[
index
]);
this
.
handleCommand
(
runnerConnection
.
environment
,
messages
[
index
]);
}
}
currentMessageIndex
=
messages
.
length
-
1
;
currentMessageIndex
=
messages
.
length
-
1
;
}
else
if
(
currentMessageIndex
===
-
1
){
}
else
if
(
currentMessageIndex
===
-
1
)
{
this
.
handleCommand
(
runnerConnection
.
environment
,
messages
);
this
.
handleCommand
(
runnerConnection
.
environment
,
messages
);
currentMessageIndex
+=
1
;
currentMessageIndex
+=
1
;
}
}
...
...
src/nni_manager/training_service/reusable/environment.ts
View file @
143c6615
...
@@ -3,10 +3,10 @@
...
@@ -3,10 +3,10 @@
'
use strict
'
;
'
use strict
'
;
import
{
GPUSummary
}
from
"
training_service/common/gpuData
"
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
getLogger
,
Logger
}
from
"
../../common/log
"
;
import
{
getLogger
,
Logger
}
from
"
../../common/log
"
;
import
{
TrialJobStatus
}
from
"
../../common/trainingService
"
;
import
{
TrialJobStatus
}
from
"
../../common/trainingService
"
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
GPUInfo
}
from
"
../../training_service/common/gpuData
"
;
import
{
WebCommandChannel
}
from
"
./channels/webCommandChannel
"
;
import
{
WebCommandChannel
}
from
"
./channels/webCommandChannel
"
;
import
{
CommandChannel
}
from
"
./commandChannel
"
;
import
{
CommandChannel
}
from
"
./commandChannel
"
;
...
@@ -14,24 +14,50 @@ import { CommandChannel } from "./commandChannel";
...
@@ -14,24 +14,50 @@ import { CommandChannel } from "./commandChannel";
export
type
EnvironmentStatus
=
'
UNKNOWN
'
|
'
WAITING
'
|
'
RUNNING
'
|
'
SUCCEEDED
'
|
'
FAILED
'
|
'
USER_CANCELED
'
;
export
type
EnvironmentStatus
=
'
UNKNOWN
'
|
'
WAITING
'
|
'
RUNNING
'
|
'
SUCCEEDED
'
|
'
FAILED
'
|
'
USER_CANCELED
'
;
export
type
Channel
=
"
web
"
|
"
file
"
|
"
aml
"
|
"
ut
"
;
export
type
Channel
=
"
web
"
|
"
file
"
|
"
aml
"
|
"
ut
"
;
export
class
TrialGpuSummary
{
// GPU count on the machine
public
gpuCount
:
number
;
// The timestamp when GPU summary data queried
public
timestamp
:
string
;
// The array of GPU information for each GPU card
public
gpuInfos
:
GPUInfo
[];
// GPU assigned status
public
assignedGpuIndexMap
:
Map
<
number
,
number
>
=
new
Map
<
number
,
number
>
();
constructor
(
gpuCount
:
number
,
timestamp
:
string
,
gpuInfos
:
GPUInfo
[])
{
this
.
gpuCount
=
gpuCount
;
this
.
timestamp
=
timestamp
;
this
.
gpuInfos
=
gpuInfos
;
}
}
export
class
EnvironmentInformation
{
export
class
EnvironmentInformation
{
// node id is 5 chars, so won't conflict.
private
readonly
defaultNodeId
=
"
default
"
;
private
log
:
Logger
;
private
log
:
Logger
;
private
isNoGpuWarned
:
boolean
=
false
;
// NNI environment ID
public
id
:
string
;
// training platform unique job ID.
public
jobId
:
string
;
// training platform job friendly name, in case it's different with job ID.
public
jobName
:
string
;
// key states
// key states
// true: environment is ready to run trial.
public
isIdle
:
boolean
=
false
;
// true: environment is running, waiting, or unknown.
// true: environment is running, waiting, or unknown.
public
isAlive
:
boolean
=
true
;
public
isAlive
:
boolean
=
true
;
// true: Runner is initialized, and can receive trials.
public
isRunnerReady
:
boolean
=
false
;
// don't set status in environment directly, use setFinalState function to set a final state.
// don't set status in environment directly, use setFinalState function to set a final state.
public
status
:
EnvironmentStatus
=
"
UNKNOWN
"
;
public
status
:
EnvironmentStatus
=
"
UNKNOWN
"
;
// true: environment is ready to run trial.
public
runningTrialCount
:
number
=
0
;
// uses to count how many trial runs on this environment.
// it can be used in many scenarios, but for now, it uses for reusable.
public
assignedTrialCount
:
number
=
0
;
// NNI environment ID
public
id
:
string
;
// training platform unique job ID.
public
envId
:
string
;
// training platform job friendly name, in case it's different with job ID.
public
name
:
string
;
public
trackingUrl
:
string
=
""
;
public
trackingUrl
:
string
=
""
;
public
workingFolder
:
string
=
""
;
public
workingFolder
:
string
=
""
;
public
runnerWorkingFolder
:
string
=
""
;
public
runnerWorkingFolder
:
string
=
""
;
...
@@ -40,41 +66,82 @@ export class EnvironmentInformation {
...
@@ -40,41 +66,82 @@ export class EnvironmentInformation {
// it's used to aggregate node status for multiple node trial
// it's used to aggregate node status for multiple node trial
public
nodes
:
Map
<
string
,
NodeInfomation
>
;
public
nodes
:
Map
<
string
,
NodeInfomation
>
;
public
gpuSummar
y
:
Map
<
string
,
GPU
Summary
>
=
new
Map
<
string
,
GPU
Summary
>
();
public
gpuSummar
ies
:
Map
<
string
,
TrialGpu
Summary
>
=
new
Map
<
string
,
TrialGpu
Summary
>
();
constructor
(
id
:
string
,
jobName
:
string
,
jobId
?:
string
)
{
// use can specify which gpus can be used by NNI.
// it's usable for sharable environment like remote machine.
public
usableGpus
?:
number
[];
// user can specify how to use GPU resource for an environment, like local and remote.
public
maxTrialNumberPerGpu
?:
number
;
public
useActiveGpu
?:
boolean
;
constructor
(
id
:
string
,
name
:
string
,
envId
?:
string
)
{
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
id
=
id
;
this
.
id
=
id
;
this
.
jobN
ame
=
jobN
ame
;
this
.
n
ame
=
n
ame
;
this
.
job
Id
=
job
Id
?
job
Id
:
jobN
ame
;
this
.
env
Id
=
env
Id
?
env
Id
:
n
ame
;
this
.
nodes
=
new
Map
<
string
,
NodeInfomation
>
();
this
.
nodes
=
new
Map
<
string
,
NodeInfomation
>
();
}
}
public
setFinalStatus
(
status
:
EnvironmentStatus
):
void
{
public
setStatus
(
status
:
EnvironmentStatus
):
void
{
switch
(
status
)
{
if
(
this
.
status
!==
status
)
{
case
'
WAITING
'
:
this
.
log
.
info
(
`EnvironmentInformation:
${
this
.
envId
}
change status from
${
this
.
status
}
to
${
status
}
.`
)
case
'
SUCCEEDED
'
:
this
.
status
=
status
;
case
'
FAILED
'
:
}
case
'
USER_CANCELED
'
:
}
this
.
status
=
status
;
break
;
public
setGpuSummary
(
nodeId
:
string
,
newGpuSummary
:
TrialGpuSummary
):
void
{
default
:
if
(
nodeId
===
null
||
nodeId
===
undefined
)
{
this
.
log
.
error
(
`Environment: job
${
this
.
jobId
}
set an invalid final state
${
status
}
.`
);
nodeId
=
this
.
defaultNodeId
;
break
;
}
const
originalGpuSummary
=
this
.
gpuSummaries
.
get
(
nodeId
);
if
(
undefined
===
originalGpuSummary
)
{
newGpuSummary
.
assignedGpuIndexMap
=
new
Map
<
number
,
number
>
();
this
.
gpuSummaries
.
set
(
nodeId
,
newGpuSummary
);
}
else
{
originalGpuSummary
.
gpuCount
=
newGpuSummary
.
gpuCount
;
originalGpuSummary
.
timestamp
=
newGpuSummary
.
timestamp
;
originalGpuSummary
.
gpuInfos
=
newGpuSummary
.
gpuInfos
;
}
}
public
get
defaultGpuSummary
():
TrialGpuSummary
|
undefined
{
const
gpuSummary
=
this
.
gpuSummaries
.
get
(
this
.
defaultNodeId
);
if
(
gpuSummary
===
undefined
)
{
if
(
false
===
this
.
isNoGpuWarned
)
{
this
.
log
.
warning
(
`EnvironmentInformation:
${
this
.
envId
}
no default gpu found. current gpu info
${
JSON
.
stringify
(
this
.
gpuSummaries
)}
`
);
this
.
isNoGpuWarned
=
true
;
}
}
else
{
this
.
isNoGpuWarned
=
false
;
}
}
return
gpuSummary
;
}
}
}
}
export
abstract
class
EnvironmentService
{
export
abstract
class
EnvironmentService
{
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
;
public
abstract
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
getCommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
// It depends on environment pressure and settings
// for example, OpenPAI relies on API calls, and there is an limitation for frequence, so it need to be bigger.
public
get
environmentMaintenceLoopInterval
():
number
{
return
5000
;
}
// it's needed in two scenario
// 1. remote machine has fixed number, so it can return false, when all environment are assigned.
// 2. If there are consistent error on requested environments, for example, authentication failure on platform.
public
get
hasMoreEnvironments
():
boolean
{
return
true
;
}
public
createCommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
return
new
WebCommandChannel
(
commandEmitter
);
return
new
WebCommandChannel
(
commandEmitter
);
}
}
...
@@ -101,7 +168,7 @@ export class RunnerSettings {
...
@@ -101,7 +168,7 @@ export class RunnerSettings {
public
nniManagerVersion
:
string
=
""
;
public
nniManagerVersion
:
string
=
""
;
public
logCollection
:
string
=
"
none
"
;
public
logCollection
:
string
=
"
none
"
;
public
command
:
string
=
""
;
public
command
:
string
=
""
;
public
enableGpuCollector
:
boolean
=
fals
e
;
public
enableGpuCollector
:
boolean
=
tru
e
;
// specify which communication channel is used by runner.
// specify which communication channel is used by runner.
// supported channel includes: rest, storage, aml
// supported channel includes: rest, storage, aml
...
...
src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
View file @
143c6615
...
@@ -3,24 +3,20 @@
...
@@ -3,24 +3,20 @@
'
use strict
'
;
'
use strict
'
;
import
{
EventEmitter
}
from
"
events
"
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getExperimentRootDir
}
from
'
../../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
AMLClusterConfig
,
AMLTrialConfig
}
from
'
../aml/amlConfig
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
AMLEnvironmentInformation
}
from
'
../aml/amlConfig
'
;
import
{
AMLClient
}
from
'
../aml/amlClient
'
;
import
{
NNIManagerIpConfig
,
}
from
'
../../../common/trainingService
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
getExperimentRootDir
}
from
'
../../../common/utils
'
;
import
{
AMLClient
}
from
'
../aml/amlClient
'
;
import
{
AMLClusterConfig
,
AMLEnvironmentInformation
,
AMLTrialConfig
}
from
'
../aml/amlConfig
'
;
import
{
AMLCommandChannel
}
from
'
../channels/amlCommandChannel
'
;
import
{
AMLCommandChannel
}
from
'
../channels/amlCommandChannel
'
;
import
{
CommandChannel
}
from
"
../commandChannel
"
;
import
{
CommandChannel
}
from
"
../commandChannel
"
;
import
{
E
ventEmitter
}
from
"
ev
ent
s
"
;
import
{
E
nvironmentInformation
,
EnvironmentService
,
EnvironmentStatus
}
from
'
../environm
ent
'
;
/**
/**
...
@@ -28,17 +24,11 @@ import { EventEmitter } from "events";
...
@@ -28,17 +24,11 @@ import { EventEmitter } from "events";
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
AMLEnvironmentService
extends
EnvironmentService
{
export
class
AMLEnvironmentService
extends
EnvironmentService
{
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
public
amlClusterConfig
:
AMLClusterConfig
|
undefined
;
public
amlClusterConfig
:
AMLClusterConfig
|
undefined
;
public
amlTrialConfig
:
AMLTrialConfig
|
undefined
;
public
amlTrialConfig
:
AMLTrialConfig
|
undefined
;
private
amlJobConfig
:
any
;
private
stopping
:
boolean
=
false
;
private
versionCheck
:
boolean
=
true
;
private
isMultiPhase
:
boolean
=
false
;
private
nniVersion
?:
string
;
private
experimentId
:
string
;
private
experimentId
:
string
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
experimentRootDir
:
string
;
private
experimentRootDir
:
string
;
constructor
()
{
constructor
()
{
...
@@ -51,7 +41,7 @@ export class AMLEnvironmentService extends EnvironmentService {
...
@@ -51,7 +41,7 @@ export class AMLEnvironmentService extends EnvironmentService {
return
false
;
return
false
;
}
}
public
get
CommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
public
create
CommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
return
new
AMLCommandChannel
(
commandEmitter
);
return
new
AMLCommandChannel
(
commandEmitter
);
}
}
...
@@ -83,29 +73,31 @@ export class AMLEnvironmentService extends EnvironmentService {
...
@@ -83,29 +73,31 @@ export class AMLEnvironmentService extends EnvironmentService {
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
environments
.
forEach
(
async
(
environment
)
=>
{
environments
.
forEach
(
async
(
environment
)
=>
{
const
amlClient
=
(
environment
as
AMLEnvironmentInformation
).
amlClient
;
const
amlClient
=
(
environment
as
AMLEnvironmentInformation
).
amlClient
;
if
(
!
amlClient
)
{
if
(
!
amlClient
)
{
throw
new
Error
(
'
AML client not initialized!
'
);
throw
new
Error
(
'
AML client not initialized!
'
);
}
}
const
s
tatus
=
await
amlClient
.
updateStatus
(
environment
.
status
);
const
newS
tatus
=
await
amlClient
.
updateStatus
(
environment
.
status
);
switch
(
s
tatus
.
toUpperCase
())
{
switch
(
newS
tatus
.
toUpperCase
())
{
case
'
WAITING
'
:
case
'
WAITING
'
:
case
'
RUNNING
'
:
case
'
QUEUED
'
:
case
'
QUEUED
'
:
// RUNNING status is set by runner, and ignore waiting status
environment
.
setStatus
(
'
WAITING
'
);
break
;
case
'
RUNNING
'
:
environment
.
setStatus
(
'
RUNNING
'
);
break
;
break
;
case
'
COMPLETED
'
:
case
'
COMPLETED
'
:
case
'
SUCCEEDED
'
:
case
'
SUCCEEDED
'
:
environment
.
set
Final
Status
(
'
SUCCEEDED
'
);
environment
.
setStatus
(
'
SUCCEEDED
'
);
break
;
break
;
case
'
FAILED
'
:
case
'
FAILED
'
:
environment
.
set
FinalStatus
(
'
FAILED
'
);
environment
.
set
Status
(
newStatus
.
toUpperCase
()
as
EnvironmentStatus
);
break
;
break
;
case
'
STOPPED
'
:
case
'
STOPPED
'
:
case
'
STOPPING
'
:
case
'
STOPPING
'
:
environment
.
set
Final
Status
(
'
USER_CANCELED
'
);
environment
.
setStatus
(
'
USER_CANCELED
'
);
break
;
break
;
default
:
default
:
environment
.
set
Final
Status
(
'
UNKNOWN
'
);
environment
.
setStatus
(
'
UNKNOWN
'
);
}
}
});
});
}
}
...
@@ -120,7 +112,7 @@ export class AMLEnvironmentService extends EnvironmentService {
...
@@ -120,7 +112,7 @@ export class AMLEnvironmentService extends EnvironmentService {
const
amlEnvironment
:
AMLEnvironmentInformation
=
environment
as
AMLEnvironmentInformation
;
const
amlEnvironment
:
AMLEnvironmentInformation
=
environment
as
AMLEnvironmentInformation
;
const
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
this
.
experimentId
,
"
environment-temp
"
);
const
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
this
.
experimentId
,
"
environment-temp
"
);
environment
.
command
=
`import os\nos.system('
${
amlEnvironment
.
command
}
')`
;
environment
.
command
=
`import os\nos.system('
${
amlEnvironment
.
command
}
')`
;
await
fs
.
promises
.
writeFile
(
path
.
join
(
environmentLocalTempFolder
,
'
nni_script.py
'
),
amlEnvironment
.
command
,{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
environmentLocalTempFolder
,
'
nni_script.py
'
),
amlEnvironment
.
command
,
{
encoding
:
'
utf8
'
});
const
amlClient
=
new
AMLClient
(
const
amlClient
=
new
AMLClient
(
this
.
amlClusterConfig
.
subscriptionId
,
this
.
amlClusterConfig
.
subscriptionId
,
this
.
amlClusterConfig
.
resourceGroup
,
this
.
amlClusterConfig
.
resourceGroup
,
...
...
src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
View file @
143c6615
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
yaml
from
'
js-yaml
'
;
import
*
as
request
from
'
request
'
;
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
...
@@ -15,7 +16,6 @@ import { NNIPAIK8STrialConfig } from '../../pai/paiK8S/paiK8SConfig';
...
@@ -15,7 +16,6 @@ import { NNIPAIK8STrialConfig } from '../../pai/paiK8S/paiK8SConfig';
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
StorageService
}
from
'
../storageService
'
;
import
{
StorageService
}
from
'
../storageService
'
;
const
yaml
=
require
(
'
js-yaml
'
);
/**
/**
* Collector PAI jobs info from PAI cluster, and update pai job status locally
* Collector PAI jobs info from PAI cluster, and update pai job status locally
...
@@ -40,6 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -40,6 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
}
}
public
get
environmentMaintenceLoopInterval
():
number
{
return
5000
;
}
public
get
hasStorageService
():
boolean
{
public
get
hasStorageService
():
boolean
{
return
true
;
return
true
;
}
}
...
@@ -72,6 +76,16 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -72,6 +76,16 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
}
if
(
this
.
paiClusterConfig
.
gpuNum
===
undefined
)
{
this
.
paiClusterConfig
.
gpuNum
=
this
.
paiTrialConfig
.
gpuNum
;
}
if
(
this
.
paiClusterConfig
.
cpuNum
===
undefined
)
{
this
.
paiClusterConfig
.
cpuNum
=
this
.
paiTrialConfig
.
cpuNum
;
}
if
(
this
.
paiClusterConfig
.
memoryMB
===
undefined
)
{
this
.
paiClusterConfig
.
memoryMB
=
this
.
paiTrialConfig
.
memoryMB
;
}
break
;
break
;
}
}
default
:
default
:
...
@@ -111,37 +125,35 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -111,37 +125,35 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
});
});
environments
.
forEach
((
environment
)
=>
{
environments
.
forEach
((
environment
)
=>
{
if
(
jobInfos
.
has
(
environment
.
job
Id
))
{
if
(
jobInfos
.
has
(
environment
.
env
Id
))
{
const
jobResponse
=
jobInfos
.
get
(
environment
.
job
Id
);
const
jobResponse
=
jobInfos
.
get
(
environment
.
env
Id
);
if
(
jobResponse
&&
jobResponse
.
state
)
{
if
(
jobResponse
&&
jobResponse
.
state
)
{
const
oldEnvironmentStatus
=
environment
.
status
;
const
oldEnvironmentStatus
=
environment
.
status
;
switch
(
jobResponse
.
state
)
{
switch
(
jobResponse
.
state
)
{
case
'
RUNNING
'
:
case
'
RUNNING
'
:
case
'
WAITING
'
:
case
'
WAITING
'
:
// RUNNING status is set by runner, and ignore waiting status
break
;
case
'
SUCCEEDED
'
:
case
'
SUCCEEDED
'
:
case
'
FAILED
'
:
case
'
FAILED
'
:
environment
.
set
Final
Status
(
jobResponse
.
state
);
environment
.
setStatus
(
jobResponse
.
state
);
break
;
break
;
case
'
STOPPED
'
:
case
'
STOPPED
'
:
case
'
STOPPING
'
:
case
'
STOPPING
'
:
environment
.
set
Final
Status
(
'
USER_CANCELED
'
);
environment
.
setStatus
(
'
USER_CANCELED
'
);
break
;
break
;
default
:
default
:
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
job
Id
}
returns unknown state
${
jobResponse
.
state
}
.`
);
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
env
Id
}
returns unknown state
${
jobResponse
.
state
}
.`
);
environment
.
set
Final
Status
(
'
UNKNOWN
'
);
environment
.
setStatus
(
'
UNKNOWN
'
);
}
}
if
(
oldEnvironmentStatus
!==
environment
.
status
)
{
if
(
oldEnvironmentStatus
!==
environment
.
status
)
{
this
.
log
.
debug
(
`OpenPAI: job
${
environment
.
job
Id
}
change status
${
oldEnvironmentStatus
}
to
${
environment
.
status
}
due to job is
${
jobResponse
.
state
}
.`
)
this
.
log
.
debug
(
`OpenPAI: job
${
environment
.
env
Id
}
change status
${
oldEnvironmentStatus
}
to
${
environment
.
status
}
due to job is
${
jobResponse
.
state
}
.`
)
}
}
}
else
{
}
else
{
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
job
Id
}
has no state returned. body:
${
JSON
.
stringify
(
jobResponse
)}
`
);
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
env
Id
}
has no state returned. body:
${
JSON
.
stringify
(
jobResponse
)}
`
);
// some error happens, and mark this environment
// some error happens, and mark this environment
environment
.
status
=
'
FAILED
'
;
environment
.
status
=
'
FAILED
'
;
}
}
}
else
{
}
else
{
this
.
log
.
error
(
`OpenPAI job
${
environment
.
job
Id
}
is not found in job list.`
);
this
.
log
.
error
(
`OpenPAI job
${
environment
.
env
Id
}
is not found in job list.`
);
environment
.
status
=
'
UNKNOWN
'
;
environment
.
status
=
'
UNKNOWN
'
;
}
}
});
});
...
@@ -169,8 +181,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -169,8 +181,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
// Step 1. Prepare PAI job configuration
// Step 1. Prepare PAI job configuration
const
environmentRoot
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
`
;
const
environmentRoot
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
`
;
environment
.
runnerWorkingFolder
=
`
${
environmentRoot
}
/envs/
${
environment
.
id
}
`
;
environment
.
runnerWorkingFolder
=
`
${
environmentRoot
}
/envs/
${
environment
.
id
}
`
;
environment
.
command
=
`cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
environment
.
command
=
`cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
;
environment
.
trackingUrl
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterConfig
.
userName
}
&jobName=
${
environment
.
jobId
}
`
environment
.
trackingUrl
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterConfig
.
userName
}
&jobName=
${
environment
.
envId
}
`
;
environment
.
useActiveGpu
=
this
.
paiClusterConfig
.
useActiveGpu
;
environment
.
maxTrialNumberPerGpu
=
this
.
paiClusterConfig
.
maxTrialNumPerGpu
;
// Step 2. Generate Job Configuration in yaml format
// Step 2. Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
environment
);
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
environment
);
...
@@ -189,7 +203,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -189,7 +203,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
request
(
submitJobRequest
,
(
error
,
response
,
body
)
=>
{
request
(
submitJobRequest
,
(
error
,
response
,
body
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`start environment
${
environment
.
job
Id
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
body
}
`
;
`start environment
${
environment
.
env
Id
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
body
}
`
;
this
.
log
.
error
(
errorMessage
);
this
.
log
.
error
(
errorMessage
);
environment
.
status
=
'
FAILED
'
;
environment
.
status
=
'
FAILED
'
;
...
@@ -211,7 +225,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -211,7 +225,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
const
stopJobRequest
:
request
.
Options
=
{
const
stopJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs/
${
this
.
paiClusterConfig
.
userName
}
~
${
environment
.
job
Id
}
/executionType`
,
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs/
${
this
.
paiClusterConfig
.
userName
}
~
${
environment
.
env
Id
}
/executionType`
,
method
:
'
PUT
'
,
method
:
'
PUT
'
,
json
:
true
,
json
:
true
,
body
:
{
value
:
'
STOP
'
},
body
:
{
value
:
'
STOP
'
},
...
@@ -222,17 +236,17 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -222,17 +236,17 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
};
};
this
.
log
.
debug
(
`stopping OpenPAI environment
${
environment
.
job
Id
}
,
${
stopJobRequest
.
uri
}
`
);
this
.
log
.
debug
(
`stopping OpenPAI environment
${
environment
.
env
Id
}
,
${
stopJobRequest
.
uri
}
`
);
try
{
try
{
request
(
stopJobRequest
,
(
error
,
response
,
_body
)
=>
{
request
(
stopJobRequest
,
(
error
,
response
,
_body
)
=>
{
try
{
try
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
(
response
&&
response
.
statusCode
>=
400
))
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
(
response
&&
response
.
statusCode
>=
400
))
{
this
.
log
.
error
(
`OpenPAI: stop job
${
environment
.
job
Id
}
failed with
${
response
.
statusCode
}
\n
${
error
}
`
);
this
.
log
.
error
(
`OpenPAI: stop job
${
environment
.
env
Id
}
failed with
${
response
.
statusCode
}
\n
${
error
}
`
);
deferred
.
reject
((
error
!==
undefined
&&
error
!==
null
)
?
error
:
deferred
.
reject
((
error
!==
undefined
&&
error
!==
null
)
?
error
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
}
else
{
}
else
{
this
.
log
.
info
(
`OpenPAI job
${
environment
.
job
Id
}
stopped.`
);
this
.
log
.
info
(
`OpenPAI job
${
environment
.
env
Id
}
stopped.`
);
}
}
deferred
.
resolve
();
deferred
.
resolve
();
}
catch
(
error
)
{
}
catch
(
error
)
{
...
@@ -265,7 +279,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -265,7 +279,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if
(
this
.
paiTrialConfig
===
undefined
)
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
const
jobName
=
environment
.
job
Id
;
const
jobName
=
environment
.
env
Id
;
let
nniJobConfig
:
any
=
undefined
;
let
nniJobConfig
:
any
=
undefined
;
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
...
@@ -284,7 +298,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -284,7 +298,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
environment
.
nodeCount
+=
instanceCount
;
environment
.
nodeCount
+=
instanceCount
;
}
}
// Each taskRole will generate new command in NNI's command format
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
// Each command will be formatted to NNI style
for
(
const
taskRoleName
in
nniJobConfig
.
taskRoles
)
{
for
(
const
taskRoleName
in
nniJobConfig
.
taskRoles
)
{
...
@@ -298,6 +311,19 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -298,6 +311,19 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
}
else
{
}
else
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
gpuNum
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster gpuNum is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
cpuNum
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster cpuNum is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
memoryMB
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster memoryMB is not initialized
'
);
}
nniJobConfig
=
{
nniJobConfig
=
{
protocolVersion
:
2
,
protocolVersion
:
2
,
name
:
jobName
,
name
:
jobName
,
...
@@ -320,9 +346,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
...
@@ -320,9 +346,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
taskRetryCount
:
0
,
taskRetryCount
:
0
,
dockerImage
:
'
docker_image_0
'
,
dockerImage
:
'
docker_image_0
'
,
resourcePerInstance
:
{
resourcePerInstance
:
{
gpu
:
this
.
pai
Trial
Config
.
gpuNum
,
gpu
:
this
.
pai
Cluster
Config
.
gpuNum
,
cpu
:
this
.
pai
Trial
Config
.
cpuNum
,
cpu
:
this
.
pai
Cluster
Config
.
cpuNum
,
memoryMB
:
this
.
pai
Trial
Config
.
memoryMB
memoryMB
:
this
.
pai
Cluster
Config
.
memoryMB
},
},
commands
:
[
commands
:
[
environment
.
command
environment
.
command
...
...
src/nni_manager/training_service/reusable/gpuScheduler.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
EnvironmentInformation
}
from
'
./environment
'
;
import
{
TrialDetail
}
from
'
./trial
'
;
type
SCHEDULE_POLICY_NAME
=
'
random
'
|
'
round-robin
'
;
export
class
GpuSchedulerSetting
{
public
useActiveGpu
:
boolean
=
false
;
public
maxTrialNumberPerGpu
:
number
=
1
;
}
export
type
GpuScheduleResult
=
{
resultType
:
ScheduleResultType
;
environment
:
EnvironmentInformation
|
undefined
;
gpuIndices
:
GPUInfo
[]
|
undefined
;
};
/**
* A simple GPU scheduler implementation
*/
export
class
GpuScheduler
{
// private readonly machineExecutorMap: Set<TrialDetail>;
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
policyName
:
SCHEDULE_POLICY_NAME
=
'
round-robin
'
;
private
defaultSetting
:
GpuSchedulerSetting
;
private
roundRobinIndex
:
number
=
0
;
/**
* Constructor
* @param environments map from remote machine to executor
*/
constructor
(
gpuSchedulerSetting
:
GpuSchedulerSetting
|
undefined
=
undefined
)
{
if
(
undefined
===
gpuSchedulerSetting
)
{
gpuSchedulerSetting
=
new
GpuSchedulerSetting
();
}
this
.
defaultSetting
=
gpuSchedulerSetting
;
}
public
setSettings
(
gpuSchedulerSetting
:
GpuSchedulerSetting
):
void
{
this
.
defaultSetting
=
gpuSchedulerSetting
;
}
/**
* Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number
*/
public
scheduleMachine
(
environments
:
EnvironmentInformation
[],
requiredGPUNum
:
number
|
undefined
,
trialDetail
:
TrialDetail
):
GpuScheduleResult
{
if
(
requiredGPUNum
===
undefined
)
{
requiredGPUNum
=
0
;
}
assert
(
requiredGPUNum
>=
0
);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const
eligibleEnvironments
:
EnvironmentInformation
[]
=
environments
.
filter
((
environment
:
EnvironmentInformation
)
=>
environment
.
defaultGpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
environment
.
defaultGpuSummary
.
gpuCount
>=
requiredGPUNum
));
if
(
eligibleEnvironments
.
length
===
0
)
{
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
return
({
resultType
:
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
,
gpuIndices
:
undefined
,
environment
:
undefined
,
});
}
// Step 2: Allocate Host/GPU for specified trial job
// Currenty the requireGPUNum parameter for all trial jobs are identical.
if
(
requiredGPUNum
>
0
)
{
// Trial job requires GPU
const
result
:
GpuScheduleResult
|
undefined
=
this
.
scheduleGPUHost
(
environments
,
requiredGPUNum
,
trialDetail
);
if
(
result
!==
undefined
)
{
return
result
;
}
}
else
{
// Trail job does not need GPU
const
allocatedRm
:
EnvironmentInformation
=
this
.
selectMachine
(
environments
,
environments
);
return
this
.
allocateHost
(
requiredGPUNum
,
allocatedRm
,
[],
trialDetail
);
}
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
gpuIndices
:
undefined
,
environment
:
undefined
,
};
}
/**
* remove the job's gpu reversion
*/
public
removeGpuReservation
(
trial
:
TrialDetail
):
void
{
if
(
trial
.
environment
!==
undefined
&&
trial
.
environment
.
defaultGpuSummary
!==
undefined
&&
trial
.
assignedGpus
!==
undefined
&&
trial
.
assignedGpus
.
length
>
0
)
{
for
(
const
gpuInfo
of
trial
.
assignedGpus
)
{
const
defaultGpuSummary
=
trial
.
environment
.
defaultGpuSummary
;
const
num
:
number
|
undefined
=
defaultGpuSummary
.
assignedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
!==
undefined
)
{
if
(
num
===
1
)
{
defaultGpuSummary
.
assignedGpuIndexMap
.
delete
(
gpuInfo
.
index
);
}
else
{
defaultGpuSummary
.
assignedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
);
}
}
}
}
}
private
scheduleGPUHost
(
environments
:
EnvironmentInformation
[],
requiredGPUNumber
:
number
,
trial
:
TrialDetail
):
GpuScheduleResult
|
undefined
{
const
totalResourceMap
:
Map
<
EnvironmentInformation
,
GPUInfo
[]
>
=
this
.
gpuResourceDetection
(
environments
);
const
qualifiedEnvironments
:
EnvironmentInformation
[]
=
[];
totalResourceMap
.
forEach
((
gpuInfos
:
GPUInfo
[],
environment
:
EnvironmentInformation
)
=>
{
if
(
gpuInfos
!==
undefined
&&
gpuInfos
.
length
>=
requiredGPUNumber
)
{
qualifiedEnvironments
.
push
(
environment
);
}
});
if
(
qualifiedEnvironments
.
length
>
0
)
{
const
allocatedEnvironment
:
EnvironmentInformation
=
this
.
selectMachine
(
qualifiedEnvironments
,
environments
);
const
gpuInfos
:
GPUInfo
[]
|
undefined
=
totalResourceMap
.
get
(
allocatedEnvironment
);
if
(
gpuInfos
!==
undefined
)
{
// should always true
return
this
.
allocateHost
(
requiredGPUNumber
,
allocatedEnvironment
,
gpuInfos
,
trial
);
}
else
{
assert
(
false
,
'
gpuInfos is undefined
'
);
}
}
}
/**
* Detect available GPU resource for an environment
* @returns Available GPUs on environments
*/
private
gpuResourceDetection
(
environments
:
EnvironmentInformation
[]):
Map
<
EnvironmentInformation
,
GPUInfo
[]
>
{
const
totalResourceMap
:
Map
<
EnvironmentInformation
,
GPUInfo
[]
>
=
new
Map
<
EnvironmentInformation
,
GPUInfo
[]
>
();
environments
.
forEach
((
environment
:
EnvironmentInformation
)
=>
{
// Assgin totoal GPU count as init available GPU number
if
(
environment
.
defaultGpuSummary
!==
undefined
)
{
const
defaultGpuSummary
=
environment
.
defaultGpuSummary
;
const
availableGPUs
:
GPUInfo
[]
=
[];
const
designatedGpuIndices
:
Set
<
number
>
=
new
Set
<
number
>
(
environment
.
usableGpus
);
if
(
designatedGpuIndices
.
size
>
0
)
{
for
(
const
gpuIndex
of
designatedGpuIndices
)
{
if
(
gpuIndex
>=
environment
.
defaultGpuSummary
.
gpuCount
)
{
throw
new
Error
(
`Specified GPU index not found:
${
gpuIndex
}
`
);
}
}
}
if
(
undefined
!==
defaultGpuSummary
.
gpuInfos
)
{
defaultGpuSummary
.
gpuInfos
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
// or trial number on a GPU reach max number,
// We should NOT allocate this GPU
// if users set useActiveGpu, use the gpu whether there is another activeProcess
if
(
designatedGpuIndices
.
size
===
0
||
designatedGpuIndices
.
has
(
gpuInfo
.
index
))
{
if
(
defaultGpuSummary
.
assignedGpuIndexMap
!==
undefined
)
{
const
num
:
number
|
undefined
=
defaultGpuSummary
.
assignedGpuIndexMap
.
get
(
gpuInfo
.
index
);
const
maxTrialNumberPerGpu
:
number
=
environment
.
maxTrialNumberPerGpu
?
environment
.
maxTrialNumberPerGpu
:
this
.
defaultSetting
.
maxTrialNumberPerGpu
;
const
useActiveGpu
:
boolean
=
environment
.
useActiveGpu
?
environment
.
useActiveGpu
:
this
.
defaultSetting
.
useActiveGpu
;
if
((
num
===
undefined
&&
(
!
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
useActiveGpu
))
||
(
num
!==
undefined
&&
num
<
maxTrialNumberPerGpu
))
{
availableGPUs
.
push
(
gpuInfo
);
}
}
else
{
throw
new
Error
(
`occupiedGpuIndexMap is undefined!`
);
}
}
});
}
totalResourceMap
.
set
(
environment
,
availableGPUs
);
}
});
return
totalResourceMap
;
}
private
selectMachine
(
qualifiedEnvironments
:
EnvironmentInformation
[],
allEnvironments
:
EnvironmentInformation
[]):
EnvironmentInformation
{
assert
(
qualifiedEnvironments
!==
undefined
&&
qualifiedEnvironments
.
length
>
0
);
if
(
this
.
policyName
===
'
random
'
)
{
return
randomSelect
(
qualifiedEnvironments
);
}
else
if
(
this
.
policyName
===
'
round-robin
'
)
{
return
this
.
roundRobinSelect
(
qualifiedEnvironments
,
allEnvironments
);
}
else
{
throw
new
Error
(
`Unsupported schedule policy:
${
this
.
policyName
}
`
);
}
}
private
roundRobinSelect
(
qualifiedEnvironments
:
EnvironmentInformation
[],
allEnvironments
:
EnvironmentInformation
[]):
EnvironmentInformation
{
while
(
!
qualifiedEnvironments
.
includes
(
allEnvironments
[
this
.
roundRobinIndex
%
allEnvironments
.
length
]))
{
this
.
roundRobinIndex
++
;
}
return
allEnvironments
[
this
.
roundRobinIndex
++
%
allEnvironments
.
length
];
}
private
selectGPUsForTrial
(
gpuInfos
:
GPUInfo
[],
requiredGPUNum
:
number
):
GPUInfo
[]
{
// Sequentially allocate GPUs
return
gpuInfos
.
slice
(
0
,
requiredGPUNum
);
}
private
allocateHost
(
requiredGPUNum
:
number
,
environment
:
EnvironmentInformation
,
gpuInfos
:
GPUInfo
[],
trialDetails
:
TrialDetail
):
GpuScheduleResult
{
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
const
defaultGpuSummary
=
environment
.
defaultGpuSummary
;
if
(
undefined
===
defaultGpuSummary
)
{
throw
new
Error
(
`Environment
${
environment
.
id
}
defaultGpuSummary shouldn't be undefined!`
);
}
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
let
num
:
number
|
undefined
=
defaultGpuSummary
.
assignedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
num
=
0
;
}
defaultGpuSummary
.
assignedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
+
1
);
});
trialDetails
.
assignedGpus
=
allocatedGPUs
;
return
{
resultType
:
ScheduleResultType
.
SUCCEED
,
environment
:
environment
,
gpuIndices
:
allocatedGPUs
,
};
}
}
src/nni_manager/training_service/reusable/storageService.ts
View file @
143c6615
...
@@ -83,7 +83,7 @@ export abstract class StorageService {
...
@@ -83,7 +83,7 @@ export abstract class StorageService {
localPath
=
this
.
expandPath
(
false
,
localPath
);
localPath
=
this
.
expandPath
(
false
,
localPath
);
remotePath
=
this
.
expandPath
(
true
,
remotePath
);
remotePath
=
this
.
expandPath
(
true
,
remotePath
);
this
.
logger
.
debug
(
`copy remotePath:
${
remotePath
}
to localPath:
${
localPath
}
`
);
this
.
logger
.
debug
(
`copy remotePath:
${
remotePath
}
to localPath:
${
localPath
}
`
);
return
await
this
.
internalCopy
(
localPath
,
remote
Path
,
true
,
true
,
false
);
return
await
this
.
internalCopy
(
remotePath
,
local
Path
,
true
,
true
,
false
);
}
}
public
async
removeDirectory
(
remotePath
:
string
,
isRecursive
:
boolean
):
Promise
<
void
>
{
public
async
removeDirectory
(
remotePath
:
string
,
isRecursive
:
boolean
):
Promise
<
void
>
{
...
@@ -151,7 +151,7 @@ export abstract class StorageService {
...
@@ -151,7 +151,7 @@ export abstract class StorageService {
localPath
=
this
.
expandPath
(
false
,
localPath
);
localPath
=
this
.
expandPath
(
false
,
localPath
);
remotePath
=
this
.
expandPath
(
true
,
remotePath
);
remotePath
=
this
.
expandPath
(
true
,
remotePath
);
this
.
logger
.
debug
(
`copy file remotePath:
${
remotePath
}
to localPath:
${
localPath
}
`
);
this
.
logger
.
debug
(
`copy file remotePath:
${
remotePath
}
to localPath:
${
localPath
}
`
);
await
this
.
internalCopy
(
localPath
,
remote
Path
,
false
,
true
,
false
);
await
this
.
internalCopy
(
remotePath
,
local
Path
,
false
,
true
,
false
);
}
}
public
async
removeFile
(
remotePath
:
string
):
Promise
<
void
>
{
public
async
removeFile
(
remotePath
:
string
):
Promise
<
void
>
{
...
...
src/nni_manager/training_service/reusable/storages/mountedStorageService.ts
View file @
143c6615
...
@@ -17,12 +17,12 @@ export class MountedStorageService extends StorageService {
...
@@ -17,12 +17,12 @@ export class MountedStorageService extends StorageService {
if
(
isRecursive
)
{
if
(
isRecursive
)
{
const
children
=
await
fs
.
promises
.
readdir
(
path
);
const
children
=
await
fs
.
promises
.
readdir
(
path
);
for
(
const
file
of
children
)
{
for
(
const
file
of
children
)
{
const
stat
=
await
fs
.
promises
.
lstat
(
file
);
const
filePath
=
this
.
internalJoin
(
path
,
file
);
this
.
internalRemove
(
file
,
stat
.
isDirectory
(),
isRecursive
);
const
stat
=
await
fs
.
promises
.
lstat
(
filePath
);
await
this
.
internalRemove
(
filePath
,
stat
.
isDirectory
(),
isRecursive
);
}
}
}
else
{
await
fs
.
promises
.
rmdir
(
path
);
}
}
await
fs
.
promises
.
rmdir
(
path
);
}
else
{
}
else
{
await
fs
.
promises
.
unlink
(
path
);
await
fs
.
promises
.
unlink
(
path
);
}
}
...
@@ -98,7 +98,7 @@ export class MountedStorageService extends StorageService {
...
@@ -98,7 +98,7 @@ export class MountedStorageService extends StorageService {
{
{
encoding
:
"
utf8
"
,
encoding
:
"
utf8
"
,
start
:
current
,
start
:
current
,
end
:
readLength
+
current
,
end
:
readLength
+
current
-
1
,
}).
on
(
"
data
"
,
(
data
)
=>
{
}).
on
(
"
data
"
,
(
data
)
=>
{
result
+=
data
;
result
+=
data
;
}).
on
(
"
end
"
,
()
=>
{
}).
on
(
"
end
"
,
()
=>
{
...
...
src/nni_manager/training_service/reusable/test/mountedStorageService.test.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
chai
from
'
chai
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
getLogger
,
Logger
}
from
"
../../../common/log
"
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../../common/utils
'
;
import
{
MountedStorageService
}
from
"
../storages/mountedStorageService
"
;
import
chaiAsPromised
=
require
(
"
chai-as-promised
"
);
async
function
remove
(
removedPath
:
string
,
isDirectory
:
boolean
,
isRecursive
:
boolean
):
Promise
<
void
>
{
if
(
isDirectory
)
{
if
(
isRecursive
)
{
const
children
=
await
fs
.
promises
.
readdir
(
removedPath
);
for
(
const
fileName
of
children
)
{
const
filePath
=
path
.
join
(
removedPath
,
fileName
);
const
stat
=
await
fs
.
promises
.
lstat
(
filePath
);
await
remove
(
filePath
,
stat
.
isDirectory
(),
isRecursive
);
}
}
await
fs
.
promises
.
rmdir
(
removedPath
);
}
else
{
await
fs
.
promises
.
unlink
(
removedPath
);
}
}
describe
(
'
Unit Test for MountedStorageService
'
,
()
=>
{
let
service
:
MountedStorageService
;
let
log
:
Logger
;
let
localPath
=
"
reusableut/local
"
;
let
mountedPath
=
"
reusableut/mounted
"
;
const
testPath
=
"
testpath
"
;
const
testFileName
=
"
testfile.txt
"
;
let
localCopiedPath
:
string
;
let
localFileName
:
string
;
let
mountedFileName
:
string
;
before
(()
=>
{
chai
.
should
();
chai
.
use
(
chaiAsPromised
);
prepareUnitTest
();
log
=
getLogger
();
const
testRoot
=
path
.
dirname
(
__filename
);
localPath
=
path
.
join
(
testRoot
,
localPath
);
mountedPath
=
path
.
join
(
testRoot
,
mountedPath
);
service
=
new
MountedStorageService
();
service
.
initialize
(
localPath
,
mountedPath
);
localCopiedPath
=
path
.
join
(
localPath
,
testPath
);
localFileName
=
path
.
join
(
localCopiedPath
,
testFileName
);
mountedFileName
=
path
.
join
(
testPath
,
testFileName
);
});
after
(()
=>
{
cleanupUnitTest
();
});
beforeEach
(
async
()
=>
{
if
(
!
fs
.
existsSync
(
localPath
))
{
await
fs
.
promises
.
mkdir
(
localPath
,
{
recursive
:
true
});
}
if
(
!
fs
.
existsSync
(
mountedPath
))
{
await
fs
.
promises
.
mkdir
(
mountedPath
,
{
recursive
:
true
});
}
log
.
info
(
`localFileName:
${
localFileName
}
`
);
await
fs
.
promises
.
mkdir
(
localCopiedPath
,
{
recursive
:
true
});
await
fs
.
promises
.
writeFile
(
localFileName
,
"
hello world
"
);
});
afterEach
(
async
()
=>
{
const
testRootPath
=
path
.
normalize
(
`
${
localPath
}
/../../reusableut`
);
await
remove
(
testRootPath
,
true
,
true
);
});
it
(
'
copyAndRename
'
,
async
()
=>
{
await
service
.
copyDirectory
(
localCopiedPath
,
"
.
"
);
chai
.
expect
(
fs
.
existsSync
(
mountedPath
));
const
newName
=
`
${
testFileName
}
new`
;
await
service
.
rename
(
mountedFileName
,
newName
);
chai
.
assert
.
isFalse
(
fs
.
existsSync
(
testPath
));
const
newTestPath
=
`
${
mountedFileName
}
new`
;
chai
.
assert
.
isTrue
(
await
service
.
exists
(
newTestPath
));
await
service
.
copyFileBack
(
newTestPath
,
"
.
"
);
const
localNewFileName
=
`
${
localPath
}
/
${
newName
}
`
;
chai
.
assert
.
isTrue
(
fs
.
existsSync
(
localNewFileName
));
fs
.
unlinkSync
(
`
${
localFileName
}
`
);
fs
.
rmdirSync
(
`
${
localPath
}
/
${
testPath
}
`
);
await
service
.
copyDirectoryBack
(
`
${
mountedPath
}
/
${
testPath
}
`
,
`.`
);
const
localNewName
=
`
${
localFileName
}
new`
;
chai
.
assert
.
isTrue
(
fs
.
existsSync
(
localNewName
));
})
it
(
'
FileContentTest
'
,
async
()
=>
{
const
savedFileName
=
"
savedfile.txt
"
;
await
service
.
save
(
"
01234
"
,
savedFileName
);
chai
.
expect
(
fs
.
existsSync
(
savedFileName
));
let
content
=
await
service
.
readFileContent
(
savedFileName
,
0
,
-
1
);
chai
.
assert
.
equal
(
content
,
"
01234
"
);
await
service
.
save
(
"
56789
"
,
savedFileName
,
true
);
content
=
await
service
.
readFileContent
(
savedFileName
,
0
,
-
1
);
chai
.
assert
.
equal
(
content
,
"
0123456789
"
);
content
=
await
service
.
readFileContent
(
savedFileName
,
-
1
,
1
);
chai
.
assert
.
equal
(
content
,
"
0
"
);
content
=
await
service
.
readFileContent
(
savedFileName
,
5
,
1
);
chai
.
assert
.
equal
(
content
,
"
5
"
);
content
=
await
service
.
readFileContent
(
savedFileName
,
5
,
-
1
);
chai
.
assert
.
equal
(
content
,
"
56789
"
);
});
});
src/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
0 → 100644
View file @
143c6615
This diff is collapsed.
Click to expand it.
src/nni_manager/training_service/reusable/test/utCommandChannel.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
encodeCommand
}
from
"
../../../core/ipcInterface
"
;
import
{
Command
,
CommandChannel
,
RunnerConnection
}
from
"
../commandChannel
"
;
import
{
Channel
,
EnvironmentInformation
}
from
"
../environment
"
;
class
UtRunnerConnection
extends
RunnerConnection
{
}
export
class
UtCommandChannel
extends
CommandChannel
{
private
readonly
receivedCommands
:
Command
[]
=
[];
public
get
channelName
():
Channel
{
return
"
ut
"
;
}
public
async
testSendCommandToTrialDispatcher
(
environment
:
EnvironmentInformation
,
commandType
:
string
,
commandData
:
any
)
{
const
content
=
encodeCommand
(
commandType
,
JSON
.
stringify
(
commandData
));
this
.
log
.
debug
(
`UtCommandChannel: env
${
environment
.
id
}
send test command
${
content
}
`
);
this
.
handleCommand
(
environment
,
content
.
toString
(
"
utf8
"
));
}
public
async
testReceiveCommandFromTrialDispatcher
():
Promise
<
Command
|
undefined
>
{
return
this
.
receivedCommands
.
shift
();
}
public
async
config
(
_key
:
string
,
value
:
any
):
Promise
<
void
>
{
// do nothing
}
public
async
start
():
Promise
<
void
>
{
// do nothing
}
public
async
stop
():
Promise
<
void
>
{
// do nothing
}
public
async
run
():
Promise
<
void
>
{
// do nothing
}
protected
async
sendCommandInternal
(
environment
:
EnvironmentInformation
,
message
:
string
):
Promise
<
void
>
{
const
parsedCommands
=
this
.
parseCommands
(
message
);
for
(
const
parsedCommand
of
parsedCommands
)
{
const
command
=
new
Command
(
environment
,
parsedCommand
[
0
],
parsedCommand
[
1
]);
this
.
receivedCommands
.
push
(
command
);
}
}
protected
createRunnerConnection
(
environment
:
EnvironmentInformation
):
RunnerConnection
{
// do nothing
return
new
UtRunnerConnection
(
environment
);
}
}
src/nni_manager/training_service/reusable/test/utEnvironmentService.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
EnvironmentInformation
,
EnvironmentService
,
EnvironmentStatus
}
from
"
../environment
"
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
CommandChannel
}
from
"
../commandChannel
"
;
import
{
UtCommandChannel
}
from
"
./utCommandChannel
"
;
export
class
UtEnvironmentService
extends
EnvironmentService
{
private
commandChannel
:
UtCommandChannel
|
undefined
;
private
allEnvironments
=
new
Map
<
string
,
EnvironmentInformation
>
();
private
hasMoreEnvironmentsInternal
=
true
;
constructor
()
{
super
();
}
public
get
hasStorageService
():
boolean
{
// storage service is tested by integration testing.
return
false
;
}
public
get
environmentMaintenceLoopInterval
():
number
{
return
1
;
}
public
testSetEnvironmentStatus
(
environment
:
EnvironmentInformation
,
newStatus
:
EnvironmentStatus
):
void
{
environment
.
status
=
newStatus
;
}
public
testReset
():
void
{
this
.
allEnvironments
.
clear
();
}
public
testGetEnvironments
():
Map
<
string
,
EnvironmentInformation
>
{
return
this
.
allEnvironments
;
}
public
testGetCommandChannel
():
UtCommandChannel
{
if
(
this
.
commandChannel
===
undefined
)
{
throw
new
Error
(
`command channel shouldn't be undefined.`
);
}
return
this
.
commandChannel
;
}
public
testSetNoMoreEnvironment
(
hasMore
:
boolean
):
void
{
this
.
hasMoreEnvironmentsInternal
=
hasMore
;
}
public
get
hasMoreEnvironments
():
boolean
{
return
this
.
hasMoreEnvironmentsInternal
;
}
public
createCommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
this
.
commandChannel
=
new
UtCommandChannel
(
commandEmitter
)
return
this
.
commandChannel
;
}
public
async
config
(
_key
:
string
,
_value
:
string
):
Promise
<
void
>
{
// do nothing
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
// do nothing
}
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
!
this
.
allEnvironments
.
has
(
environment
.
id
))
{
this
.
allEnvironments
.
set
(
environment
.
id
,
environment
);
environment
.
status
=
"
WAITING
"
;
}
}
public
async
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
environment
.
status
=
"
USER_CANCELED
"
;
}
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment