Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
143c6615
Unverified
Commit
143c6615
authored
Jul 30, 2020
by
Chi Song
Committed by
GitHub
Jul 30, 2020
Browse files
Reusable environment support GPU scheduler, add test cases and refactoring. (#2627)
parent
8a20c348
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1449 additions
and
144 deletions
+1449
-144
src/nni_manager/common/utils.ts
src/nni_manager/common/utils.ts
+4
-3
src/nni_manager/package.json
src/nni_manager/package.json
+1
-0
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+5
-0
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+24
-0
src/nni_manager/training_service/common/trialConfig.ts
src/nni_manager/training_service/common/trialConfig.ts
+4
-0
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+13
-2
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+12
-14
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+1
-24
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+4
-5
src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
...r/training_service/reusable/channels/amlCommandChannel.ts
+3
-8
src/nni_manager/training_service/reusable/environment.ts
src/nni_manager/training_service/reusable/environment.ts
+96
-29
src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
...ng_service/reusable/environments/amlEnvironmentService.ts
+20
-28
src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
...ervice/reusable/environments/openPaiEnvironmentService.ts
+50
-24
src/nni_manager/training_service/reusable/gpuScheduler.ts
src/nni_manager/training_service/reusable/gpuScheduler.ts
+235
-0
src/nni_manager/training_service/reusable/storageService.ts
src/nni_manager/training_service/reusable/storageService.ts
+2
-2
src/nni_manager/training_service/reusable/storages/mountedStorageService.ts
...aining_service/reusable/storages/mountedStorageService.ts
+5
-5
src/nni_manager/training_service/reusable/test/mountedStorageService.test.ts
...ining_service/reusable/test/mountedStorageService.test.ts
+125
-0
src/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
...er/training_service/reusable/test/trialDispatcher.test.ts
+712
-0
src/nni_manager/training_service/reusable/test/utCommandChannel.ts
...anager/training_service/reusable/test/utCommandChannel.ts
+57
-0
src/nni_manager/training_service/reusable/test/utEnvironmentService.ts
...er/training_service/reusable/test/utEnvironmentService.ts
+76
-0
No files found.
src/nni_manager/common/utils.ts
View file @
143c6615
...
...
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
return
cachedipv4Address
;
}
if
(
os
.
networkInterfaces
().
eth0
)
{
for
(
const
item
of
os
.
networkInterfaces
().
eth0
)
{
const
networkInterfaces
=
os
.
networkInterfaces
();
if
(
networkInterfaces
.
eth0
)
{
for
(
const
item
of
networkInterfaces
.
eth0
)
{
if
(
item
.
family
===
'
IPv4
'
)
{
cachedipv4Address
=
item
.
address
;
return
cachedipv4Address
;
}
}
}
else
{
throw
Error
(
'
getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.
'
);
throw
Error
(
`
getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.
Please specify NNI manager IP in config.`
);
}
throw
Error
(
'
getIPV4Address() failed because no valid IPv4 address found.
'
)
...
...
src/nni_manager/package.json
View file @
143c6615
...
...
@@ -39,6 +39,7 @@
"@types/express"
:
"^4.16.0"
,
"@types/glob"
:
"^7.1.1"
,
"@types/js-base64"
:
"^2.3.1"
,
"@types/js-yaml"
:
"^3.12.5"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
...
...
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
143c6615
...
...
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token
:
joi
.
string
().
min
(
1
),
host
:
joi
.
string
().
min
(
1
).
required
(),
reuse
:
joi
.
boolean
(),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
gpuNum
:
joi
.
number
().
min
(
1
),
maxTrialNumPerGpu
:
joi
.
number
(),
useActiveGpu
:
joi
.
boolean
(),
}),
kubeflow_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
operator
:
joi
.
string
().
min
(
1
).
required
(),
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
143c6615
...
...
@@ -3,6 +3,17 @@
'
use strict
'
;
/**
 * Outcome of a GPU scheduling attempt.
 * Members keep their implicit numeric values (0, 1, 2), written out explicitly here.
 */
export enum ScheduleResultType {
    // Scheduling succeeded.
    SUCCEED = 0,

    // Not enough GPUs are available right now; the request may succeed later.
    TMP_NO_AVAILABLE_GPU = 1,

    // The requirement exceeds the total GPU number, so it can never be matched.
    REQUIRE_EXCEED_TOTAL = 2
}
/**
* GPU Information class
* Representing the dynamic and static information retrieved from Nvidia-smi
...
...
@@ -52,6 +63,19 @@ export class GPUSummary {
}
}
/**
 * Parse a comma-separated list of GPU indices (for example `"0,1,3"`) into a set.
 *
 * @param gpuIndices comma-separated GPU indices; `undefined` means "not specified".
 * @returns the set of parsed indices, or `undefined` when `gpuIndices` is `undefined`.
 * @throws Error when `gpuIndices` is specified but contains no index, or contains a
 *         token that cannot be parsed as a base-10 integer.
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }

    // ''.split(',') yields [''] (length 1), so blank tokens have to be dropped
    // before the emptiness check; otherwise the check can never fire.
    const tokens: string[] = gpuIndices
        .split(',')
        .map((x: string) => x.trim())
        .filter((x: string) => x.length > 0);
    if (tokens.length === 0) {
        throw new Error('gpuIndices can not be empty if specified.');
    }

    const indices: number[] = tokens.map((x: string) => parseInt(x, 10));

    // A NaN entry would silently never match any real GPU index, so reject it up front.
    const invalidTokens: string[] = tokens.filter((_x: string, i: number) => Number.isNaN(indices[i]));
    if (invalidTokens.length > 0) {
        throw new Error(`gpuIndices contains invalid GPU index: ${invalidTokens.join(', ')}`);
    }

    return new Set(indices);
}
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
...
...
src/nni_manager/training_service/common/trialConfig.ts
View file @
143c6615
...
...
@@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
// This flag is only used by unit tests for now.
// in future, all environments should be reusable, and this can be configurable by user.
public
reuseEnvironment
:
boolean
|
undefined
=
true
;
/**
* Constructor
* @param command Trial command
...
...
src/nni_manager/training_service/pai/paiConfig.ts
View file @
143c6615
...
...
@@ -3,7 +3,7 @@
'
use strict
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
export
class
PAIClusterConfig
{
public
readonly
userName
:
string
;
...
...
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
public
readonly
token
?:
string
;
public
readonly
reuse
?:
boolean
;
public
cpuNum
?:
number
;
public
memoryMB
?:
number
;
public
gpuNum
?:
number
;
public
useActiveGpu
?:
boolean
;
public
maxTrialNumPerGpu
?:
number
;
/**
* Constructor
* @param userName User name of PAI Cluster
...
...
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials
*/
constructor
(
userName
:
string
,
host
:
string
,
passWord
?:
string
,
token
?:
string
,
reuse
?:
boolean
)
{
constructor
(
userName
:
string
,
host
:
string
,
passWord
?:
string
,
token
?:
string
,
reuse
?:
boolean
,
cpuNum
?:
number
,
memoryMB
?:
number
,
gpuNum
?:
number
)
{
this
.
userName
=
userName
;
this
.
passWord
=
passWord
;
this
.
host
=
host
;
this
.
token
=
token
;
this
.
reuse
=
reuse
;
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
gpuNum
=
gpuNum
;
}
}
...
...
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
143c6615
...
...
@@ -6,10 +6,8 @@
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
ExecutorManager
}
from
'
./remoteMachineData
'
;
import
{
GPUInfo
,
parseGpuIndices
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
ExecutorManager
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
type
SCHEDULE_POLICY_NAME
=
'
random
'
|
'
round-robin
'
;
...
...
@@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number
*/
public
scheduleMachine
(
requiredGPUNum
:
number
|
undefined
,
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
if
(
requiredGPUNum
===
undefined
)
{
if
(
requiredGPUNum
===
undefined
)
{
requiredGPUNum
=
0
;
}
assert
(
requiredGPUNum
>=
0
);
...
...
@@ -48,7 +46,7 @@ export class GPUScheduler {
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const
eligibleRM
:
RemoteMachineMeta
[]
=
allRMs
.
filter
((
rmMeta
:
RemoteMachineMeta
)
=>
rmMeta
.
gpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
rmMeta
.
gpuSummary
.
gpuCount
>=
requiredGPUNum
));
rmMeta
.
gpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
rmMeta
.
gpuSummary
.
gpuCount
>=
requiredGPUNum
));
if
(
eligibleRM
.
length
===
0
)
{
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
...
...
@@ -75,8 +73,8 @@ export class GPUScheduler {
this
.
log
.
warning
(
`Scheduler: trialJob id
${
trialJobDetail
.
id
}
, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `
);
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
scheduleInfo
:
undefined
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
scheduleInfo
:
undefined
};
}
...
...
@@ -159,7 +157,7 @@ export class GPUScheduler {
const
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
const
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
availableGPUs
.
push
(
gpuInfo
);
}
}
else
{
...
...
@@ -200,7 +198,7 @@ export class GPUScheduler {
}
private
allocateHost
(
requiredGPUNum
:
number
,
rmMeta
:
RemoteMachineMeta
,
gpuInfos
:
GPUInfo
[],
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
gpuInfos
:
GPUInfo
[],
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
...
...
@@ -222,10 +220,10 @@ export class GPUScheduler {
scheduleInfo
:
{
rmMeta
:
rmMeta
,
cudaVisibleDevice
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
})
.
join
(
'
,
'
)
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
})
.
join
(
'
,
'
)
}
};
}
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
143c6615
...
...
@@ -4,7 +4,7 @@
'
use strict
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
ShellExecutor
}
from
'
./shellExecutor
'
;
/**
...
...
@@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public
readonly
useActiveGpu
?:
boolean
=
false
;
}
export
function
parseGpuIndices
(
gpuIndices
?:
string
):
Set
<
number
>
|
undefined
{
if
(
gpuIndices
!==
undefined
)
{
const
indices
:
number
[]
=
gpuIndices
.
split
(
'
,
'
)
.
map
((
x
:
string
)
=>
parseInt
(
x
,
10
));
if
(
indices
.
length
>
0
)
{
return
new
Set
(
indices
);
}
else
{
throw
new
Error
(
'
gpuIndices can not be empty if specified.
'
);
}
}
}
/**
* The execution result for command executed on remote machine
*/
...
...
@@ -168,14 +156,3 @@ export class ExecutorManager {
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cudaVisibleDevice
:
string
};
export
enum
ScheduleResultType
{
// Schedule succeeded
SUCCEED
,
// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU
,
// Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
143c6615
...
...
@@ -7,6 +7,7 @@ import * as assert from 'assert';
import
{
EventEmitter
}
from
'
events
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
...
...
@@ -22,18 +23,16 @@ import {
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
ExecutorManager
ExecutorManager
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
ShellExecutor
}
from
'
training_service/remote_machine/shellExecutor
'
;
/**
* Training Service implementation for Remote Machine (Linux)
...
...
src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts
View file @
143c6615
...
...
@@ -3,7 +3,6 @@
'
use strict
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
delay
}
from
"
../../../common/utils
"
;
import
{
AMLEnvironmentInformation
}
from
'
../aml/amlConfig
'
;
import
{
CommandChannel
,
RunnerConnection
}
from
"
../commandChannel
"
;
...
...
@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
export
class
AMLCommandChannel
extends
CommandChannel
{
private
stopping
:
boolean
=
false
;
private
sendQueues
:
[
EnvironmentInformation
,
string
][]
=
[];
private
readonly
NNI_METRICS_PATTERN
:
string
=
`NNISDK_MEb'(?<metrics>.*?)'`
;
public
constructor
(
commandEmitter
:
EventEmitter
)
{
super
(
commandEmitter
);
}
public
get
channelName
():
Channel
{
return
"
aml
"
;
}
...
...
@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
const
messages
=
command
[
'
trial_runner
'
];
if
(
messages
)
{
if
(
messages
instanceof
Object
&&
currentMessageIndex
<
messages
.
length
-
1
)
{
for
(
let
index
=
currentMessageIndex
+
1
;
index
<
messages
.
length
;
index
++
)
{
for
(
let
index
=
currentMessageIndex
+
1
;
index
<
messages
.
length
;
index
++
)
{
this
.
handleCommand
(
runnerConnection
.
environment
,
messages
[
index
]);
}
currentMessageIndex
=
messages
.
length
-
1
;
}
else
if
(
currentMessageIndex
===
-
1
){
}
else
if
(
currentMessageIndex
===
-
1
)
{
this
.
handleCommand
(
runnerConnection
.
environment
,
messages
);
currentMessageIndex
+=
1
;
}
...
...
src/nni_manager/training_service/reusable/environment.ts
View file @
143c6615
...
...
@@ -3,10 +3,10 @@
'
use strict
'
;
import
{
GPUSummary
}
from
"
training_service/common/gpuData
"
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
getLogger
,
Logger
}
from
"
../../common/log
"
;
import
{
TrialJobStatus
}
from
"
../../common/trainingService
"
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
GPUInfo
}
from
"
../../training_service/common/gpuData
"
;
import
{
WebCommandChannel
}
from
"
./channels/webCommandChannel
"
;
import
{
CommandChannel
}
from
"
./commandChannel
"
;
...
...
@@ -14,24 +14,50 @@ import { CommandChannel } from "./commandChannel";
export
type
EnvironmentStatus
=
'
UNKNOWN
'
|
'
WAITING
'
|
'
RUNNING
'
|
'
SUCCEEDED
'
|
'
FAILED
'
|
'
USER_CANCELED
'
;
export
type
Channel
=
"
web
"
|
"
file
"
|
"
aml
"
|
"
ut
"
;
/**
 * GPU summary data together with the per-GPU assignment bookkeeping.
 */
export class TrialGpuSummary {
    /** GPU count on the machine. */
    public gpuCount: number;
    /** Timestamp at which the GPU summary data was queried. */
    public timestamp: string;
    /** Information for each GPU card. */
    public gpuInfos: GPUInfo[];
    /** GPU assigned status; starts out empty. */
    public assignedGpuIndexMap: Map<number, number> = new Map();

    /**
     * @param gpuCount GPU count on the machine
     * @param timestamp timestamp at which the summary was queried
     * @param gpuInfos information for each GPU card
     */
    constructor(gpuCount: number, timestamp: string, gpuInfos: GPUInfo[]) {
        this.gpuCount = gpuCount;
        this.timestamp = timestamp;
        this.gpuInfos = gpuInfos;
    }
}
export
class
EnvironmentInformation
{
// node id is 5 chars, so won't conflict.
private
readonly
defaultNodeId
=
"
default
"
;
private
log
:
Logger
;
// NNI environment ID
public
id
:
string
;
// training platform unique job ID.
public
jobId
:
string
;
// training platform job friendly name, in case it's different with job ID.
public
jobName
:
string
;
private
isNoGpuWarned
:
boolean
=
false
;
// key states
// true: environment is ready to run trial.
public
isIdle
:
boolean
=
false
;
// true: environment is running, waiting, or unknown.
public
isAlive
:
boolean
=
true
;
// true: Runner is initialized, and can receive trials.
public
isRunnerReady
:
boolean
=
false
;
// don't set status in environment directly, use setFinalState function to set a final state.
public
status
:
EnvironmentStatus
=
"
UNKNOWN
"
;
// true: environment is ready to run trial.
public
runningTrialCount
:
number
=
0
;
// uses to count how many trial runs on this environment.
// it can be used in many scenarios, but for now, it uses for reusable.
public
assignedTrialCount
:
number
=
0
;
// NNI environment ID
public
id
:
string
;
// training platform unique job ID.
public
envId
:
string
;
// training platform job friendly name, in case it's different with job ID.
public
name
:
string
;
public
trackingUrl
:
string
=
""
;
public
workingFolder
:
string
=
""
;
public
runnerWorkingFolder
:
string
=
""
;
...
...
@@ -40,41 +66,82 @@ export class EnvironmentInformation {
// it's used to aggregate node status for multiple node trial
public
nodes
:
Map
<
string
,
NodeInfomation
>
;
public
gpuSummar
y
:
Map
<
string
,
GPU
Summary
>
=
new
Map
<
string
,
GPU
Summary
>
();
public
gpuSummar
ies
:
Map
<
string
,
TrialGpu
Summary
>
=
new
Map
<
string
,
TrialGpu
Summary
>
();
constructor
(
id
:
string
,
jobName
:
string
,
jobId
?:
string
)
{
// user can specify which gpus can be used by NNI.
// it's usable for sharable environment like remote machine.
public
usableGpus
?:
number
[];
// user can specify how to use GPU resource for an environment, like local and remote.
public
maxTrialNumberPerGpu
?:
number
;
public
useActiveGpu
?:
boolean
;
constructor
(
id
:
string
,
name
:
string
,
envId
?:
string
)
{
this
.
log
=
getLogger
();
this
.
id
=
id
;
this
.
jobN
ame
=
jobN
ame
;
this
.
job
Id
=
job
Id
?
job
Id
:
jobN
ame
;
this
.
n
ame
=
n
ame
;
this
.
env
Id
=
env
Id
?
env
Id
:
n
ame
;
this
.
nodes
=
new
Map
<
string
,
NodeInfomation
>
();
}
public
setFinalStatus
(
status
:
EnvironmentStatus
):
void
{
switch
(
status
)
{
case
'
WAITING
'
:
case
'
SUCCEEDED
'
:
case
'
FAILED
'
:
case
'
USER_CANCELED
'
:
this
.
status
=
status
;
break
;
default
:
this
.
log
.
error
(
`Environment: job
${
this
.
jobId
}
set an invalid final state
${
status
}
.`
);
break
;
/**
 * Set the environment status.
 * Nothing happens when the status is unchanged; otherwise the transition is
 * logged at info level before the new value is stored.
 *
 * @param status the status to switch to
 */
public setStatus(status: EnvironmentStatus): void {
    if (this.status === status) {
        return;
    }
    const previousStatus = this.status;
    this.log.info(`EnvironmentInformation: ${this.envId} change status from ${previousStatus} to ${status}.`);
    this.status = status;
}
/**
 * Store or refresh the GPU summary of one node.
 *
 * The first summary seen for a node is stored as-is, after its
 * `assignedGpuIndexMap` is reset to an empty map. Later summaries only refresh
 * `gpuCount`, `timestamp` and `gpuInfos` on the stored object, so the stored
 * `assignedGpuIndexMap` is kept.
 *
 * @param nodeId node the summary belongs to; null/undefined falls back to the default node id
 * @param newGpuSummary latest GPU summary for that node
 */
public setGpuSummary(nodeId: string, newGpuSummary: TrialGpuSummary): void {
    const targetNodeId = (nodeId === null || nodeId === undefined) ? this.defaultNodeId : nodeId;
    const storedSummary = this.gpuSummaries.get(targetNodeId);

    if (storedSummary !== undefined) {
        // Known node: refresh the queried data in place.
        storedSummary.gpuCount = newGpuSummary.gpuCount;
        storedSummary.timestamp = newGpuSummary.timestamp;
        storedSummary.gpuInfos = newGpuSummary.gpuInfos;
        return;
    }

    // First summary for this node: start with no GPU assigned.
    newGpuSummary.assignedGpuIndexMap = new Map<number, number>();
    this.gpuSummaries.set(targetNodeId, newGpuSummary);
}
/**
 * GPU summary stored under the default node id, or `undefined` when there is none.
 *
 * When none is found, a warning is logged once; the warning is re-armed as soon
 * as a default summary is found again.
 *
 * @returns the default node's GPU summary, or `undefined`
 */
public get defaultGpuSummary(): TrialGpuSummary | undefined {
    const gpuSummary = this.gpuSummaries.get(this.defaultNodeId);
    if (gpuSummary === undefined) {
        if (false === this.isNoGpuWarned) {
            // JSON.stringify renders a Map as "{}", which would hide the very data this
            // warning is meant to show; serialize every Map as its list of entries instead.
            const currentGpuInfo = JSON.stringify(
                this.gpuSummaries,
                (_key: string, value: unknown) => value instanceof Map ? Array.from(value.entries()) : value
            );
            this.log.warning(`EnvironmentInformation: ${this.envId} no default gpu found. current gpu info ${currentGpuInfo}`);
            this.isNoGpuWarned = true;
        }
    } else {
        this.isNoGpuWarned = false;
    }
    return gpuSummary;
}
}
export
abstract
class
EnvironmentService
{
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
getCommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
// It depends on environment pressure and settings
// for example, OpenPAI relies on API calls, and there is a limit on call frequency, so it needs to be bigger.
public
get
environmentMaintenceLoopInterval
():
number
{
return
5000
;
}
// it's needed in two scenarios
// 1. remote machine has fixed number, so it can return false, when all environment are assigned.
// 2. If there are consistent error on requested environments, for example, authentication failure on platform.
public
get
hasMoreEnvironments
():
boolean
{
return
true
;
}
public
createCommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
return
new
WebCommandChannel
(
commandEmitter
);
}
...
...
@@ -101,7 +168,7 @@ export class RunnerSettings {
public
nniManagerVersion
:
string
=
""
;
public
logCollection
:
string
=
"
none
"
;
public
command
:
string
=
""
;
public
enableGpuCollector
:
boolean
=
fals
e
;
public
enableGpuCollector
:
boolean
=
tru
e
;
// specify which communication channel is used by runner.
// supported channel includes: rest, storage, aml
...
...
src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
View file @
143c6615
...
...
@@ -3,24 +3,20 @@
'
use strict
'
;
import
{
EventEmitter
}
from
"
events
"
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getExperimentRootDir
}
from
'
../../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
AMLClusterConfig
,
AMLTrialConfig
}
from
'
../aml/amlConfig
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
AMLEnvironmentInformation
}
from
'
../aml/amlConfig
'
;
import
{
AMLClient
}
from
'
../aml/amlClient
'
;
import
{
NNIManagerIpConfig
,
}
from
'
../../../common/trainingService
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
getExperimentRootDir
}
from
'
../../../common/utils
'
;
import
{
AMLClient
}
from
'
../aml/amlClient
'
;
import
{
AMLClusterConfig
,
AMLEnvironmentInformation
,
AMLTrialConfig
}
from
'
../aml/amlConfig
'
;
import
{
AMLCommandChannel
}
from
'
../channels/amlCommandChannel
'
;
import
{
CommandChannel
}
from
"
../commandChannel
"
;
import
{
E
ventEmitter
}
from
"
ev
ent
s
"
;
import
{
E
nvironmentInformation
,
EnvironmentService
,
EnvironmentStatus
}
from
'
../environm
ent
'
;
/**
...
...
@@ -28,17 +24,11 @@ import { EventEmitter } from "events";
*/
@
component
.
Singleton
export
class
AMLEnvironmentService
extends
EnvironmentService
{
private
readonly
log
:
Logger
=
getLogger
();
public
amlClusterConfig
:
AMLClusterConfig
|
undefined
;
public
amlTrialConfig
:
AMLTrialConfig
|
undefined
;
private
amlJobConfig
:
any
;
private
stopping
:
boolean
=
false
;
private
versionCheck
:
boolean
=
true
;
private
isMultiPhase
:
boolean
=
false
;
private
nniVersion
?:
string
;
private
experimentId
:
string
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
experimentRootDir
:
string
;
constructor
()
{
...
...
@@ -51,7 +41,7 @@ export class AMLEnvironmentService extends EnvironmentService {
return
false
;
}
public
get
CommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
public
create
CommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
return
new
AMLCommandChannel
(
commandEmitter
);
}
...
...
@@ -83,29 +73,31 @@ export class AMLEnvironmentService extends EnvironmentService {
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
environments
.
forEach
(
async
(
environment
)
=>
{
const
amlClient
=
(
environment
as
AMLEnvironmentInformation
).
amlClient
;
if
(
!
amlClient
)
{
throw
new
Error
(
'
AML client not initialized!
'
);
if
(
!
amlClient
)
{
throw
new
Error
(
'
AML client not initialized!
'
);
}
const
s
tatus
=
await
amlClient
.
updateStatus
(
environment
.
status
);
switch
(
s
tatus
.
toUpperCase
())
{
const
newS
tatus
=
await
amlClient
.
updateStatus
(
environment
.
status
);
switch
(
newS
tatus
.
toUpperCase
())
{
case
'
WAITING
'
:
case
'
RUNNING
'
:
case
'
QUEUED
'
:
// RUNNING status is set by runner, and ignore waiting status
environment
.
setStatus
(
'
WAITING
'
);
break
;
case
'
RUNNING
'
:
environment
.
setStatus
(
'
RUNNING
'
);
break
;
case
'
COMPLETED
'
:
case
'
SUCCEEDED
'
:
environment
.
set
Final
Status
(
'
SUCCEEDED
'
);
environment
.
setStatus
(
'
SUCCEEDED
'
);
break
;
case
'
FAILED
'
:
environment
.
set
FinalStatus
(
'
FAILED
'
);
environment
.
set
Status
(
newStatus
.
toUpperCase
()
as
EnvironmentStatus
);
break
;
case
'
STOPPED
'
:
case
'
STOPPING
'
:
environment
.
set
Final
Status
(
'
USER_CANCELED
'
);
environment
.
setStatus
(
'
USER_CANCELED
'
);
break
;
default
:
environment
.
set
Final
Status
(
'
UNKNOWN
'
);
environment
.
setStatus
(
'
UNKNOWN
'
);
}
});
}
...
...
@@ -120,7 +112,7 @@ export class AMLEnvironmentService extends EnvironmentService {
const
amlEnvironment
:
AMLEnvironmentInformation
=
environment
as
AMLEnvironmentInformation
;
const
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
this
.
experimentId
,
"
environment-temp
"
);
environment
.
command
=
`import os\nos.system('
${
amlEnvironment
.
command
}
')`
;
await
fs
.
promises
.
writeFile
(
path
.
join
(
environmentLocalTempFolder
,
'
nni_script.py
'
),
amlEnvironment
.
command
,{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
environmentLocalTempFolder
,
'
nni_script.py
'
),
amlEnvironment
.
command
,
{
encoding
:
'
utf8
'
});
const
amlClient
=
new
AMLClient
(
this
.
amlClusterConfig
.
subscriptionId
,
this
.
amlClusterConfig
.
resourceGroup
,
...
...
src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
View file @
143c6615
...
...
@@ -4,6 +4,7 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
yaml
from
'
js-yaml
'
;
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../../../common/component
'
;
...
...
@@ -15,7 +16,6 @@ import { NNIPAIK8STrialConfig } from '../../pai/paiK8S/paiK8SConfig';
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
StorageService
}
from
'
../storageService
'
;
const
yaml
=
require
(
'
js-yaml
'
);
/**
* Collector PAI jobs info from PAI cluster, and update pai job status locally
...
...
@@ -40,6 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
this
.
experimentId
=
getExperimentId
();
}
public
get
environmentMaintenceLoopInterval
():
number
{
return
5000
;
}
public
get
hasStorageService
():
boolean
{
return
true
;
}
...
...
@@ -72,6 +76,16 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
if
(
this
.
paiClusterConfig
.
gpuNum
===
undefined
)
{
this
.
paiClusterConfig
.
gpuNum
=
this
.
paiTrialConfig
.
gpuNum
;
}
if
(
this
.
paiClusterConfig
.
cpuNum
===
undefined
)
{
this
.
paiClusterConfig
.
cpuNum
=
this
.
paiTrialConfig
.
cpuNum
;
}
if
(
this
.
paiClusterConfig
.
memoryMB
===
undefined
)
{
this
.
paiClusterConfig
.
memoryMB
=
this
.
paiTrialConfig
.
memoryMB
;
}
break
;
}
default
:
...
...
@@ -111,37 +125,35 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
});
environments
.
forEach
((
environment
)
=>
{
if
(
jobInfos
.
has
(
environment
.
job
Id
))
{
const
jobResponse
=
jobInfos
.
get
(
environment
.
job
Id
);
if
(
jobInfos
.
has
(
environment
.
env
Id
))
{
const
jobResponse
=
jobInfos
.
get
(
environment
.
env
Id
);
if
(
jobResponse
&&
jobResponse
.
state
)
{
const
oldEnvironmentStatus
=
environment
.
status
;
switch
(
jobResponse
.
state
)
{
case
'
RUNNING
'
:
case
'
WAITING
'
:
// RUNNING status is set by runner, and ignore waiting status
break
;
case
'
SUCCEEDED
'
:
case
'
FAILED
'
:
environment
.
set
Final
Status
(
jobResponse
.
state
);
environment
.
setStatus
(
jobResponse
.
state
);
break
;
case
'
STOPPED
'
:
case
'
STOPPING
'
:
environment
.
set
Final
Status
(
'
USER_CANCELED
'
);
environment
.
setStatus
(
'
USER_CANCELED
'
);
break
;
default
:
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
job
Id
}
returns unknown state
${
jobResponse
.
state
}
.`
);
environment
.
set
Final
Status
(
'
UNKNOWN
'
);
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
env
Id
}
returns unknown state
${
jobResponse
.
state
}
.`
);
environment
.
setStatus
(
'
UNKNOWN
'
);
}
if
(
oldEnvironmentStatus
!==
environment
.
status
)
{
this
.
log
.
debug
(
`OpenPAI: job
${
environment
.
job
Id
}
change status
${
oldEnvironmentStatus
}
to
${
environment
.
status
}
due to job is
${
jobResponse
.
state
}
.`
)
this
.
log
.
debug
(
`OpenPAI: job
${
environment
.
env
Id
}
change status
${
oldEnvironmentStatus
}
to
${
environment
.
status
}
due to job is
${
jobResponse
.
state
}
.`
)
}
}
else
{
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
job
Id
}
has no state returned. body:
${
JSON
.
stringify
(
jobResponse
)}
`
);
this
.
log
.
error
(
`OpenPAI: job
${
environment
.
env
Id
}
has no state returned. body:
${
JSON
.
stringify
(
jobResponse
)}
`
);
// some error happens, and mark this environment
environment
.
status
=
'
FAILED
'
;
}
}
else
{
this
.
log
.
error
(
`OpenPAI job
${
environment
.
job
Id
}
is not found in job list.`
);
this
.
log
.
error
(
`OpenPAI job
${
environment
.
env
Id
}
is not found in job list.`
);
environment
.
status
=
'
UNKNOWN
'
;
}
});
...
...
@@ -169,8 +181,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
// Step 1. Prepare PAI job configuration
const
environmentRoot
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
`
;
environment
.
runnerWorkingFolder
=
`
${
environmentRoot
}
/envs/
${
environment
.
id
}
`
;
environment
.
command
=
`cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
environment
.
trackingUrl
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterConfig
.
userName
}
&jobName=
${
environment
.
jobId
}
`
environment
.
command
=
`cd
${
environmentRoot
}
&&
${
environment
.
command
}
`
;
environment
.
trackingUrl
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterConfig
.
userName
}
&jobName=
${
environment
.
envId
}
`
;
environment
.
useActiveGpu
=
this
.
paiClusterConfig
.
useActiveGpu
;
environment
.
maxTrialNumberPerGpu
=
this
.
paiClusterConfig
.
maxTrialNumPerGpu
;
// Step 2. Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
environment
);
...
...
@@ -189,7 +203,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
request
(
submitJobRequest
,
(
error
,
response
,
body
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`start environment
${
environment
.
job
Id
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
body
}
`
;
`start environment
${
environment
.
env
Id
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
body
}
`
;
this
.
log
.
error
(
errorMessage
);
environment
.
status
=
'
FAILED
'
;
...
...
@@ -211,7 +225,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
const
stopJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs/
${
this
.
paiClusterConfig
.
userName
}
~
${
environment
.
job
Id
}
/executionType`
,
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs/
${
this
.
paiClusterConfig
.
userName
}
~
${
environment
.
env
Id
}
/executionType`
,
method
:
'
PUT
'
,
json
:
true
,
body
:
{
value
:
'
STOP
'
},
...
...
@@ -222,17 +236,17 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
};
this
.
log
.
debug
(
`stopping OpenPAI environment
${
environment
.
job
Id
}
,
${
stopJobRequest
.
uri
}
`
);
this
.
log
.
debug
(
`stopping OpenPAI environment
${
environment
.
env
Id
}
,
${
stopJobRequest
.
uri
}
`
);
try
{
request
(
stopJobRequest
,
(
error
,
response
,
_body
)
=>
{
try
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
(
response
&&
response
.
statusCode
>=
400
))
{
this
.
log
.
error
(
`OpenPAI: stop job
${
environment
.
job
Id
}
failed with
${
response
.
statusCode
}
\n
${
error
}
`
);
this
.
log
.
error
(
`OpenPAI: stop job
${
environment
.
env
Id
}
failed with
${
response
.
statusCode
}
\n
${
error
}
`
);
deferred
.
reject
((
error
!==
undefined
&&
error
!==
null
)
?
error
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
}
else
{
this
.
log
.
info
(
`OpenPAI job
${
environment
.
job
Id
}
stopped.`
);
this
.
log
.
info
(
`OpenPAI job
${
environment
.
env
Id
}
stopped.`
);
}
deferred
.
resolve
();
}
catch
(
error
)
{
...
...
@@ -265,7 +279,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
jobName
=
environment
.
job
Id
;
const
jobName
=
environment
.
env
Id
;
let
nniJobConfig
:
any
=
undefined
;
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
...
...
@@ -284,7 +298,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
environment
.
nodeCount
+=
instanceCount
;
}
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
for
(
const
taskRoleName
in
nniJobConfig
.
taskRoles
)
{
...
...
@@ -298,6 +311,19 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}
}
else
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
gpuNum
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster gpuNum is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
cpuNum
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster cpuNum is not initialized
'
);
}
if
(
this
.
paiClusterConfig
.
memoryMB
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster memoryMB is not initialized
'
);
}
nniJobConfig
=
{
protocolVersion
:
2
,
name
:
jobName
,
...
...
@@ -320,9 +346,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
taskRetryCount
:
0
,
dockerImage
:
'
docker_image_0
'
,
resourcePerInstance
:
{
gpu
:
this
.
pai
Trial
Config
.
gpuNum
,
cpu
:
this
.
pai
Trial
Config
.
cpuNum
,
memoryMB
:
this
.
pai
Trial
Config
.
memoryMB
gpu
:
this
.
pai
Cluster
Config
.
gpuNum
,
cpu
:
this
.
pai
Cluster
Config
.
cpuNum
,
memoryMB
:
this
.
pai
Cluster
Config
.
memoryMB
},
commands
:
[
environment
.
command
...
...
src/nni_manager/training_service/reusable/gpuScheduler.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
EnvironmentInformation
}
from
'
./environment
'
;
import
{
TrialDetail
}
from
'
./trial
'
;
/**
 * Names of the policies that can be used to pick one environment
 * out of several qualified candidates.
 */
type SCHEDULE_POLICY_NAME =
    | 'round-robin'
    | 'random';
/**
 * Scheduler-wide defaults used by GpuScheduler when an environment
 * does not provide its own (truthy) value for the same option.
 */
export class GpuSchedulerSetting {
    /** Whether a GPU that already has active processes may still be assigned to a trial. */
    public useActiveGpu = false;
    /** Upper bound of trials that may share a single GPU. */
    public maxTrialNumberPerGpu = 1;
}
/**
 * Outcome of one scheduling attempt made by GpuScheduler.
 */
export type GpuScheduleResult = {
    /** Whether scheduling succeeded, or why it did not. */
    resultType: ScheduleResultType;
    /** Environment selected for the trial; undefined when nothing was scheduled. */
    environment: undefined | EnvironmentInformation;
    /** GPUs assigned to the trial on that environment; undefined when nothing was scheduled. */
    gpuIndices: undefined | GPUInfo[];
};
/**
 * A simple GPU scheduler for reusable environments.
 *
 * It selects an environment for a trial (round-robin by default) and records
 * which GPUs of that environment are assigned to the trial. The number of trials
 * sharing each GPU is tracked in `defaultGpuSummary.assignedGpuIndexMap`.
 */
export class GpuScheduler {
    private readonly log: Logger = getLogger();
    private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin';
    private defaultSetting: GpuSchedulerSetting;
    // Monotonically increasing cursor; taken modulo the environment count on use.
    private roundRobinIndex: number = 0;

    /**
     * Constructor
     * @param gpuSchedulerSetting default settings used when an environment does not
     *        specify its own; a fresh GpuSchedulerSetting is used when omitted
     */
    constructor(gpuSchedulerSetting: GpuSchedulerSetting | undefined = undefined) {
        if (undefined === gpuSchedulerSetting) {
            gpuSchedulerSetting = new GpuSchedulerSetting();
        }
        this.defaultSetting = gpuSchedulerSetting;
    }

    /**
     * Replace the default settings.
     * @param gpuSchedulerSetting new default settings
     */
    public setSettings(gpuSchedulerSetting: GpuSchedulerSetting): void {
        this.defaultSetting = gpuSchedulerSetting;
    }

    /**
     * Schedule an environment according to the constraints (requiredGPUNum)
     * @param environments candidate environments
     * @param requiredGPUNum required GPU number; undefined is treated as 0
     * @param trialDetail trial to be scheduled; its assignedGpus is set on success
     * @returns SUCCEED with the chosen environment and GPUs,
     *          REQUIRE_EXCEED_TOTAL when no environment could ever satisfy the request,
     *          or TMP_NO_AVAILABLE_GPU when no environment currently has enough free GPUs
     */
    public scheduleMachine(environments: EnvironmentInformation[], requiredGPUNum: number | undefined, trialDetail: TrialDetail): GpuScheduleResult {
        const gpuNum: number = requiredGPUNum === undefined ? 0 : requiredGPUNum;
        assert(gpuNum >= 0);

        // Step 1: Check that the required GPU number does not exceed the total GPU number of every environment.
        // Environments without a GPU summary are kept as candidates.
        const eligibleEnvironments: EnvironmentInformation[] = environments.filter((environment: EnvironmentInformation) =>
            environment.defaultGpuSummary === undefined || gpuNum === 0 || environment.defaultGpuSummary.gpuCount >= gpuNum);
        if (eligibleEnvironments.length === 0) {
            // If the required gpu number exceeds the upper limit of all environments' GPU number
            // Return REQUIRE_EXCEED_TOTAL directly
            return ({
                resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL,
                gpuIndices: undefined,
                environment: undefined,
            });
        }

        // Step 2: Allocate Host/GPU for specified trial job
        // Currently the requiredGPUNum parameter for all trial jobs is identical.
        if (gpuNum > 0) {
            // Trial job requires GPU
            const result: GpuScheduleResult | undefined = this.scheduleGPUHost(environments, gpuNum, trialDetail);
            if (result !== undefined) {
                return result;
            }
        } else {
            // Trial job does not need GPU
            const allocatedRm: EnvironmentInformation = this.selectMachine(environments, environments);

            return this.allocateHost(gpuNum, allocatedRm, [], trialDetail);
        }

        return {
            resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
            gpuIndices: undefined,
            environment: undefined,
        };
    }

    /**
     * Remove the trial's GPU reservation: decrease the per-GPU trial count of every
     * GPU assigned to the trial, deleting the entry when the count reaches zero.
     * @param trial trial whose assigned GPUs are released
     */
    public removeGpuReservation(trial: TrialDetail): void {
        if (trial.environment !== undefined &&
            trial.environment.defaultGpuSummary !== undefined &&
            trial.assignedGpus !== undefined &&
            trial.assignedGpus.length > 0) {
            const defaultGpuSummary = trial.environment.defaultGpuSummary;
            for (const gpuInfo of trial.assignedGpus) {
                const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
                if (num !== undefined) {
                    if (num === 1) {
                        defaultGpuSummary.assignedGpuIndexMap.delete(gpuInfo.index);
                    } else {
                        defaultGpuSummary.assignedGpuIndexMap.set(gpuInfo.index, num - 1);
                    }
                }
            }
        }
    }

    /**
     * Try to place a GPU trial on an environment that currently has enough available GPUs.
     * @returns the schedule result, or undefined when no environment qualifies right now
     */
    private scheduleGPUHost(environments: EnvironmentInformation[], requiredGPUNumber: number, trial: TrialDetail): GpuScheduleResult | undefined {
        const totalResourceMap: Map<EnvironmentInformation, GPUInfo[]> = this.gpuResourceDetection(environments);
        const qualifiedEnvironments: EnvironmentInformation[] = [];
        totalResourceMap.forEach((gpuInfos: GPUInfo[], environment: EnvironmentInformation) => {
            if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNumber) {
                qualifiedEnvironments.push(environment);
            }
        });
        if (qualifiedEnvironments.length > 0) {
            const allocatedEnvironment: EnvironmentInformation = this.selectMachine(qualifiedEnvironments, environments);
            const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedEnvironment);
            if (gpuInfos !== undefined) { // should always be true
                return this.allocateHost(requiredGPUNumber, allocatedEnvironment, gpuInfos, trial);
            } else {
                assert(false, 'gpuInfos is undefined');
            }
        }
    }

    /**
     * Detect available GPU resource for environments.
     * Environments without a GPU summary are not included in the result.
     * @returns Available GPUs on environments
     * @throws Error when a designated GPU index is not smaller than the environment's GPU count,
     *         or when the GPU summary has no assignedGpuIndexMap
     */
    private gpuResourceDetection(environments: EnvironmentInformation[]): Map<EnvironmentInformation, GPUInfo[]> {
        const totalResourceMap: Map<EnvironmentInformation, GPUInfo[]> = new Map<EnvironmentInformation, GPUInfo[]>();
        environments.forEach((environment: EnvironmentInformation) => {
            if (environment.defaultGpuSummary !== undefined) {
                const defaultGpuSummary = environment.defaultGpuSummary;
                const availableGPUs: GPUInfo[] = [];
                // An empty set means every GPU of the environment may be used.
                const designatedGpuIndices: Set<number> = new Set<number>(environment.usableGpus);
                if (designatedGpuIndices.size > 0) {
                    for (const gpuIndex of designatedGpuIndices) {
                        if (gpuIndex >= defaultGpuSummary.gpuCount) {
                            throw new Error(`Specified GPU index not found: ${gpuIndex}`);
                        }
                    }
                }

                if (undefined !== defaultGpuSummary.gpuInfos) {
                    defaultGpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
                        // if the GPU has active process, OR be reserved by a job,
                        // or index not in the designated GPU indices,
                        // or trial number on a GPU reach max number,
                        // We should NOT allocate this GPU
                        // if users set useActiveGpu, use the gpu whether there is another activeProcess
                        if (designatedGpuIndices.size === 0 || designatedGpuIndices.has(gpuInfo.index)) {
                            if (defaultGpuSummary.assignedGpuIndexMap !== undefined) {
                                const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
                                // Environment-level values win when truthy; otherwise fall back to the scheduler defaults.
                                const maxTrialNumberPerGpu: number = environment.maxTrialNumberPerGpu ? environment.maxTrialNumberPerGpu : this.defaultSetting.maxTrialNumberPerGpu;
                                const useActiveGpu: boolean = environment.useActiveGpu ? environment.useActiveGpu : this.defaultSetting.useActiveGpu;
                                const unassignedAndUsable: boolean = num === undefined && (useActiveGpu || gpuInfo.activeProcessNum === 0);
                                const hasSpareTrialSlot: boolean = num !== undefined && num < maxTrialNumberPerGpu;
                                if (unassignedAndUsable || hasSpareTrialSlot) {
                                    availableGPUs.push(gpuInfo);
                                }
                            } else {
                                throw new Error(`occupiedGpuIndexMap is undefined!`);
                            }
                        }
                    });
                }
                totalResourceMap.set(environment, availableGPUs);
            }
        });

        return totalResourceMap;
    }

    /**
     * Pick one of the qualified environments according to the configured policy.
     * @param qualifiedEnvironments non-empty list of environments that can host the trial
     * @param allEnvironments full environment list, used as the round-robin ring
     */
    private selectMachine(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
        assert(qualifiedEnvironments !== undefined && qualifiedEnvironments.length > 0);

        if (this.policyName === 'random') {
            return randomSelect(qualifiedEnvironments);
        } else if (this.policyName === 'round-robin') {
            return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
        } else {
            throw new Error(`Unsupported schedule policy: ${this.policyName}`);
        }
    }

    /**
     * Walk the ring of all environments starting at the round-robin cursor and return
     * the first one that is qualified, leaving the cursor just after it.
     * @throws Error when no qualified environment is found after one full pass
     *         (the search is bounded so that it can never loop forever)
     */
    private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
        const total: number = allEnvironments.length;
        for (let step = 0; step < total; step++) {
            const candidate: EnvironmentInformation = allEnvironments[this.roundRobinIndex % total];
            this.roundRobinIndex++;
            if (qualifiedEnvironments.includes(candidate)) {
                return candidate;
            }
        }

        throw new Error(`No qualified environment found in the environment list for round-robin selection.`);
    }

    /**
     * Choose the GPUs for a trial out of the available ones.
     */
    private selectGPUsForTrial(gpuInfos: GPUInfo[], requiredGPUNum: number): GPUInfo[] {
        // Sequentially allocate GPUs
        return gpuInfos.slice(0, requiredGPUNum);
    }

    /**
     * Assign GPUs of the given environment to the trial and update the per-GPU trial counts.
     * @param requiredGPUNum number of GPUs to assign
     * @param environment environment that hosts the trial
     * @param gpuInfos GPUs currently available on the environment
     * @param trialDetails trial that receives the GPUs in its assignedGpus field
     * @throws Error when GPUs must be assigned but the environment has no GPU summary
     */
    private allocateHost(requiredGPUNum: number, environment: EnvironmentInformation,
        gpuInfos: GPUInfo[], trialDetails: TrialDetail): GpuScheduleResult {
        assert(gpuInfos.length >= requiredGPUNum);
        const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);

        // The GPU summary is only needed for bookkeeping of assigned GPUs. A trial without GPUs
        // may be placed on an environment that has no GPU summary, which scheduleMachine
        // deliberately accepts as eligible.
        if (allocatedGPUs.length > 0) {
            const defaultGpuSummary = environment.defaultGpuSummary;
            if (undefined === defaultGpuSummary) {
                throw new Error(`Environment ${environment.id} defaultGpuSummary shouldn't be undefined!`);
            }

            for (const gpuInfo of allocatedGPUs) {
                let num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
                if (num === undefined) {
                    num = 0;
                }
                defaultGpuSummary.assignedGpuIndexMap.set(gpuInfo.index, num + 1);
            }
        }
        trialDetails.assignedGpus = allocatedGPUs;

        return {
            resultType: ScheduleResultType.SUCCEED,
            environment: environment,
            gpuIndices: allocatedGPUs,
        };
    }
}
src/nni_manager/training_service/reusable/storageService.ts
View file @
143c6615
...
...
@@ -83,7 +83,7 @@ export abstract class StorageService {
localPath
=
this
.
expandPath
(
false
,
localPath
);
remotePath
=
this
.
expandPath
(
true
,
remotePath
);
this
.
logger
.
debug
(
`copy remotePath:
${
remotePath
}
to localPath:
${
localPath
}
`
);
return
await
this
.
internalCopy
(
localPath
,
remote
Path
,
true
,
true
,
false
);
return
await
this
.
internalCopy
(
remotePath
,
local
Path
,
true
,
true
,
false
);
}
public
async
removeDirectory
(
remotePath
:
string
,
isRecursive
:
boolean
):
Promise
<
void
>
{
...
...
@@ -151,7 +151,7 @@ export abstract class StorageService {
localPath
=
this
.
expandPath
(
false
,
localPath
);
remotePath
=
this
.
expandPath
(
true
,
remotePath
);
this
.
logger
.
debug
(
`copy file remotePath:
${
remotePath
}
to localPath:
${
localPath
}
`
);
await
this
.
internalCopy
(
localPath
,
remote
Path
,
false
,
true
,
false
);
await
this
.
internalCopy
(
remotePath
,
local
Path
,
false
,
true
,
false
);
}
public
async
removeFile
(
remotePath
:
string
):
Promise
<
void
>
{
...
...
src/nni_manager/training_service/reusable/storages/mountedStorageService.ts
View file @
143c6615
...
...
@@ -17,12 +17,12 @@ export class MountedStorageService extends StorageService {
if
(
isRecursive
)
{
const
children
=
await
fs
.
promises
.
readdir
(
path
);
for
(
const
file
of
children
)
{
const
stat
=
await
fs
.
promises
.
lstat
(
file
);
this
.
internalRemove
(
file
,
stat
.
isDirectory
(),
isRecursive
);
const
filePath
=
this
.
internalJoin
(
path
,
file
);
const
stat
=
await
fs
.
promises
.
lstat
(
filePath
);
await
this
.
internalRemove
(
filePath
,
stat
.
isDirectory
(),
isRecursive
);
}
}
else
{
await
fs
.
promises
.
rmdir
(
path
);
}
await
fs
.
promises
.
rmdir
(
path
);
}
else
{
await
fs
.
promises
.
unlink
(
path
);
}
...
...
@@ -98,7 +98,7 @@ export class MountedStorageService extends StorageService {
{
encoding
:
"
utf8
"
,
start
:
current
,
end
:
readLength
+
current
,
end
:
readLength
+
current
-
1
,
}).
on
(
"
data
"
,
(
data
)
=>
{
result
+=
data
;
}).
on
(
"
end
"
,
()
=>
{
...
...
src/nni_manager/training_service/reusable/test/mountedStorageService.test.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
chai
from
'
chai
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
getLogger
,
Logger
}
from
"
../../../common/log
"
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../../common/utils
'
;
import
{
MountedStorageService
}
from
"
../storages/mountedStorageService
"
;
import
chaiAsPromised
=
require
(
"
chai-as-promised
"
);
/**
 * Test helper that deletes a file or a directory from the local file system.
 * @param removedPath path of the file or directory to delete
 * @param isDirectory true to treat removedPath as a directory, false to unlink it as a file
 * @param isRecursive when removing a directory, delete its content first
 */
async function remove(removedPath: string, isDirectory: boolean, isRecursive: boolean): Promise<void> {
    if (!isDirectory) {
        await fs.promises.unlink(removedPath);
        return;
    }

    if (isRecursive) {
        // Empty the directory entry by entry before removing the directory itself.
        const entryNames = await fs.promises.readdir(removedPath);
        for (const entryName of entryNames) {
            const entryPath = path.join(removedPath, entryName);
            const entryStat = await fs.promises.lstat(entryPath);
            await remove(entryPath, entryStat.isDirectory(), isRecursive);
        }
    }
    await fs.promises.rmdir(removedPath);
}
// Unit tests of MountedStorageService against two folders created next to this test file:
// "reusableut/local" plays the local side, "reusableut/mounted" the mounted storage.
describe('Unit Test for MountedStorageService', () => {

    let service: MountedStorageService;
    let log: Logger;
    // Both paths are turned into absolute paths in before().
    let localPath = "reusableut/local";
    let mountedPath = "reusableut/mounted";

    const testPath = "testpath";
    const testFileName = "testfile.txt";
    let localCopiedPath: string;
    let localFileName: string;
    // Path of the test file relative to the storage root.
    let mountedFileName: string;

    before(() => {
        chai.should();
        chai.use(chaiAsPromised);
        prepareUnitTest();
        log = getLogger();

        const testRoot = path.dirname(__filename);
        localPath = path.join(testRoot, localPath);
        mountedPath = path.join(testRoot, mountedPath);
        service = new MountedStorageService();
        service.initialize(localPath, mountedPath);

        localCopiedPath = path.join(localPath, testPath);
        localFileName = path.join(localCopiedPath, testFileName);
        mountedFileName = path.join(testPath, testFileName);
    });

    after(() => {
        cleanupUnitTest();
    });

    beforeEach(async () => {
        if (!fs.existsSync(localPath)) {
            await fs.promises.mkdir(localPath, { recursive: true });
        }
        if (!fs.existsSync(mountedPath)) {
            await fs.promises.mkdir(mountedPath, { recursive: true });
        }
        log.info(`localFileName: ${localFileName}`);

        await fs.promises.mkdir(localCopiedPath, { recursive: true });
        await fs.promises.writeFile(localFileName, "hello world");
    });

    afterEach(async () => {
        // Remove the whole "reusableut" folder so every test starts from a clean state.
        const testRootPath = path.normalize(`${localPath}/../../reusableut`);
        await remove(testRootPath, true, true);
    });

    it('copyAndRename', async () => {
        await service.copyDirectory(localCopiedPath, ".");
        // A bare chai.expect(...) without a matcher asserts nothing; assert explicitly.
        chai.assert.isTrue(fs.existsSync(mountedPath));

        const newName = `${testFileName}new`;
        await service.rename(mountedFileName, newName);
        // NOTE(review): testPath is a relative path, so this checks the current working
        // directory rather than the storage; presumably the old file name was meant - confirm.
        chai.assert.isFalse(fs.existsSync(testPath));
        const newTestPath = `${mountedFileName}new`;
        chai.assert.isTrue(await service.exists(newTestPath));

        await service.copyFileBack(newTestPath, ".");
        const localNewFileName = `${localPath}/${newName}`;
        chai.assert.isTrue(fs.existsSync(localNewFileName));

        fs.unlinkSync(`${localFileName}`);
        fs.rmdirSync(`${localPath}/${testPath}`);
        await service.copyDirectoryBack(`${mountedPath}/${testPath}`, `.`);
        const localNewName = `${localFileName}new`;
        chai.assert.isTrue(fs.existsSync(localNewName));
    });

    it('FileContentTest', async () => {
        const savedFileName = "savedfile.txt";
        await service.save("01234", savedFileName);
        // Check through the service so the relative name is resolved the same way as in save();
        // the original bare chai.expect(...) asserted nothing.
        chai.assert.isTrue(await service.exists(savedFileName));

        let content = await service.readFileContent(savedFileName, 0, -1);
        chai.assert.equal(content, "01234");

        await service.save("56789", savedFileName, true);
        content = await service.readFileContent(savedFileName, 0, -1);
        chai.assert.equal(content, "0123456789");

        content = await service.readFileContent(savedFileName, -1, 1);
        chai.assert.equal(content, "0");

        content = await service.readFileContent(savedFileName, 5, 1);
        chai.assert.equal(content, "5");

        content = await service.readFileContent(savedFileName, 5, -1);
        chai.assert.equal(content, "56789");
    });
});
src/nni_manager/training_service/reusable/test/trialDispatcher.test.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
*
as
chai
from
'
chai
'
;
import
*
as
path
from
'
path
'
;
import
{
Scope
}
from
"
typescript-ioc
"
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getLogger
,
Logger
}
from
"
../../../common/log
"
;
import
{
TrialJobApplicationForm
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
INITIALIZED
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
SEND_TRIAL_JOB_PARAMETER
,
TRIAL_END
,
GPU_INFO
}
from
'
../../../core/commands
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../../training_service/common/trialConfigMetadataKey
'
;
import
{
Command
}
from
'
../commandChannel
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
"
../environment
"
;
import
{
TrialDetail
}
from
'
../trial
'
;
import
{
TrialDispatcher
}
from
"
../trialDispatcher
"
;
import
{
UtCommandChannel
}
from
'
./utCommandChannel
'
;
import
{
UtEnvironmentService
}
from
"
./utEnvironmentService
"
;
import
chaiAsPromised
=
require
(
"
chai-as-promised
"
);
import
{
promises
}
from
'
fs
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
NNIErrorNames
,
NNIError
,
MethodNotImplementedError
}
from
'
../../../common/errors
'
;
/**
 * Builds a trial job application form for the tests.
 * @param content value serialized into the hyper parameters; `{ "test": 1 }` is used when omitted
 * @returns a form with sequenceId 0 whose hyper parameters hold the JSON text of content at index 0
 */
function createTrialForm(content: any = undefined): TrialJobApplicationForm {
    const payload = content === undefined ? { "test": 1 } : content;
    const form = {
        sequenceId: 0,
        hyperParameters: {
            value: JSON.stringify(payload),
            index: 0
        }
    };
    return form;
}
/**
 * Polls callback until it yields a value other than undefined or the wait budget is used up.
 * The budget is counted in polling intervals, not measured with a clock.
 * @param callback producer of the awaited value; undefined means "not ready yet"
 * @param waitMs total wait budget
 * @param interval delay between two polls, also the amount taken off the budget per poll
 * @param throwError throw on timeout instead of returning undefined
 * @returns the first defined value, or undefined on timeout when throwError is false
 */
async function waitResult<TResult>(callback: () => Promise<TResult | undefined>, waitMs: number = 1000, interval: number = 1, throwError: boolean = false): Promise<TResult | undefined> {
    for (let remainingMs = waitMs; remainingMs > 0; remainingMs -= interval) {
        const value = await callback();
        if (value !== undefined) {
            return value;
        }
        await delay(interval);
    }

    if (!throwError) {
        return undefined;
    }
    // Include the callback source so the failing wait can be identified.
    throw new Error(`wait result timeout!\n${callback.toString()}`);
}
/**
 * Same as waitResult, but a timeout is always reported as an error,
 * so the returned value is never undefined.
 * @param callback producer of the awaited value; undefined means "not ready yet"
 * @param waitMs total wait budget
 * @param interval delay between two polls
 */
async function waitResultMust<TResult>(callback: () => Promise<TResult | undefined>, waitMs: number = 1000, interval: number = 1): Promise<TResult> {
    const outcome = await waitResult(callback, waitMs, interval, true);
    if (outcome !== undefined) {
        return outcome;
    }
    // waitResult should have thrown already; this keeps the return type free of undefined.
    throw new Error(`wait result timeout!`);
}
/**
 * Submits a trial built from the default test form to the dispatcher.
 * @param trialDispatcher dispatcher under test
 * @returns the trial detail returned by the dispatcher
 */
async function newTrial(trialDispatcher: TrialDispatcher): Promise<TrialDetail> {
    return await trialDispatcher.submitTrialJob(createTrialForm());
}
/**
 * Builds a fake GPU summary for a node in which every GPU is idle.
 * @param gpuCount number of GPUs to report; values of 0 or less produce an empty list
 * @param nodeId id of the reporting node, or undefined
 * @returns object with `gpuInfos` (index 0..gpuCount-1, activeProcessNum 0),
 *          `gpuCount` and `node`
 */
function newGpuInfo(gpuCount: number = 2, nodeId: string | undefined = undefined): any {
    const gpuInfos: { index: number; activeProcessNum: number }[] = [];
    for (let index = 0; index < gpuCount; index++) {
        gpuInfos.push({
            index: index,
            activeProcessNum: 0,
        });
    }

    return {
        gpuInfos: gpuInfos,
        gpuCount: gpuInfos.length,
        node: nodeId
    };
}
/**
 * Waits for the next command the dispatcher sends through the test channel and
 * checks that it is the NEW_TRIAL_JOB command of the given trial.
 * @param commandChannel test command channel
 * @param trialDetail trial expected to be started
 * @returns the received command
 */
async function verifyTrialRunning(commandChannel: UtCommandChannel, trialDetail: TrialDetail): Promise<Command> {
    const received = await waitResultMust<Command>(async () => {
        return await commandChannel.testReceiveCommandFromTrialDispatcher();
    });

    const commandType = received.command;
    const trialId = received.data["trialId"];
    chai.assert.equal(commandType, NEW_TRIAL_JOB, "verifyTrialRunning command type");
    chai.assert.equal(trialId, trialDetail.id, "verifyTrialRunning trialDetail.id should be equal.");
    return received;
}
/**
 * Reports the end of a trial to the dispatcher and checks the resulting trial status.
 * @param commandChannel test command channel
 * @param trialDetail trial that ends; it must already have an environment
 * @param returnCode exit code to report; 0 expects status SUCCEEDED, anything else FAILED
 */
async function verifyTrialResult(commandChannel: UtCommandChannel, trialDetail: TrialDetail, returnCode: number = 0): Promise<void> {
    const environment = trialDetail.environment;
    if (environment === undefined) {
        throw new Error(`environment shouldn't be undefined.`);
    }

    const trialEndPayload = {
        trial: trialDetail.id,
        code: returnCode,
        timestamp: Date.now(),
    };
    await commandChannel.testSendCommandToTrialDispatcher(environment, TRIAL_END, trialEndPayload);

    // Wait until the dispatcher has moved the trial out of RUNNING.
    await waitResultMust<boolean>(async () => {
        return trialDetail.status === 'RUNNING' ? undefined : true;
    });

    const succeeded = returnCode === 0;
    const expectedStatus: TrialJobStatus = succeeded ? 'SUCCEEDED' : 'FAILED';
    const message = succeeded ? "trial should be succeeded" : "trial should be failed";
    chai.assert.equal<TrialJobStatus>(trialDetail.status, expectedStatus, message);
}
/**
 * Waits until the environment service holds waitCount environments, takes the first one
 * not seen before, simulates its runner(s) becoming ready and sets it to RUNNING.
 * @param waitCount number of environments the service must hold
 * @param previousEnvironments environments already handled; the new one is added to it
 * @param environmentService test environment service
 * @param commandChannel test command channel
 * @param gpuCount GPUs reported per node; no GPU_INFO command is sent when 0 or less
 * @param nodeCount number of nodes of the environment; node ids are only generated when above 1
 * @param callback optional hook invoked after the commands are sent and before the status change
 * @returns the environment, once its runner is reported ready
 */
async function waitEnvironment(waitCount: number,
    previousEnvironments: Map<string, EnvironmentInformation>,
    environmentService: UtEnvironmentService, commandChannel: UtCommandChannel,
    gpuCount: number = 2, nodeCount: number = 1,
    callback: ((environment: EnvironmentInformation) => Promise<void>) | undefined = undefined): Promise<EnvironmentInformation> {
    const requestedEnvironment = await waitResultMust<EnvironmentInformation>(async () => {
        const current = environmentService.testGetEnvironments();
        if (current.size !== waitCount) {
            return undefined;
        }
        for (const [id, candidate] of current) {
            if (previousEnvironments.has(id)) {
                continue;
            }
            previousEnvironments.set(id, candidate);
            return candidate;
        }
        return undefined;
    });

    if (requestedEnvironment === undefined) {
        throw new Error(`waitRequestEnvironment is not defined.`);
    }

    requestedEnvironment.nodeCount = nodeCount;
    // A single-node environment reports without a node id.
    const nodeIds: (string | undefined)[] = nodeCount > 1
        ? Array.from({ length: nodeCount }, () => uniqueString(5))
        : [undefined];

    for (const nodeId of nodeIds) {
        // set runner is ready.
        await commandChannel.testSendCommandToTrialDispatcher(requestedEnvironment, INITIALIZED, { node: nodeId });

        if (gpuCount > 0) {
            const gpuSummary = newGpuInfo(gpuCount, nodeId);
            await commandChannel.testSendCommandToTrialDispatcher(requestedEnvironment, GPU_INFO, gpuSummary);
        }
    }

    if (callback) {
        await callback(requestedEnvironment);
    }

    // set env to running
    environmentService.testSetEnvironmentStatus(requestedEnvironment, 'RUNNING');

    await waitResultMust<boolean>(async () => {
        return requestedEnvironment.isRunnerReady ? true : undefined;
    });

    return requestedEnvironment;
}
describe
(
'
Unit Test for TrialDispatcher
'
,
()
=>
{
let
trialRunPromise
:
Promise
<
void
>
;
let
trialDispatcher
:
TrialDispatcher
;
let
commandChannel
:
UtCommandChannel
;
let
environmentService
:
UtEnvironmentService
;
let
log
:
Logger
;
let
previousEnvironments
:
Map
<
string
,
EnvironmentInformation
>
=
new
Map
<
string
,
EnvironmentInformation
>
();
const
currentDir
=
path
.
dirname
(
__filename
);
before
(()
=>
{
chai
.
should
();
chai
.
use
(
chaiAsPromised
);
prepareUnitTest
();
log
=
getLogger
();
});
after
(()
=>
{
cleanupUnitTest
();
});
beforeEach
(
async
()
=>
{
const
trialConfig
=
{
codeDir
:
currentDir
,
command
:
"
echo
"
,
}
const
nniManagerIpConfig
=
{
nniManagerIp
:
"
127.0.0.1
"
,
}
trialDispatcher
=
new
TrialDispatcher
();
component
.
Container
.
bind
(
EnvironmentService
)
.
to
(
UtEnvironmentService
)
.
scope
(
Scope
.
Singleton
);
await
trialDispatcher
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
JSON
.
stringify
(
trialConfig
));
await
trialDispatcher
.
setClusterMetadata
(
TrialConfigMetadataKey
.
NNI_MANAGER_IP
,
JSON
.
stringify
(
nniManagerIpConfig
));
trialRunPromise
=
trialDispatcher
.
run
();
environmentService
=
component
.
get
(
EnvironmentService
)
as
UtEnvironmentService
;
commandChannel
=
environmentService
.
testGetCommandChannel
();
});
afterEach
(
async
()
=>
{
previousEnvironments
.
clear
();
await
trialDispatcher
.
cleanUp
();
environmentService
.
testReset
();
await
trialRunPromise
;
});
it
(
'
reuse env
'
,
async
()
=>
{
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
0
);
trialDetail
=
await
newTrial
(
trialDispatcher
);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
-
1
);
chai
.
assert
.
equal
(
environmentService
.
testGetEnvironments
().
size
,
1
,
"
as env reused, so only 1 env should be here.
"
);
const
trials
=
await
trialDispatcher
.
listTrialJobs
();
chai
.
assert
.
equal
(
trials
.
length
,
2
,
"
there should be 2 trials
"
);
});
it
(
'
not reusable env
'
,
async
()
=>
{
trialDispatcher
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
JSON
.
stringify
({
reuseEnvironment
:
false
,
codeDir
:
currentDir
,
}));
let
trialDetail
=
await
newTrial
(
trialDispatcher
);
let
environment
=
await
waitEnvironment
(
1
,
previousEnvironments
,
environmentService
,
commandChannel
);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
0
);
await
waitResultMust
<
true
>
(
async
()
=>
{
return
environment
.
status
===
'
USER_CANCELED
'
?
true
:
undefined
;
});
trialDetail
=
await
newTrial
(
trialDispatcher
);
await
waitEnvironment
(
2
,
previousEnvironments
,
environmentService
,
commandChannel
);
await
verifyTrialRunning
(
commandChannel
,
trialDetail
);
await
verifyTrialResult
(
commandChannel
,
trialDetail
,
-
1
);
await
waitResultMust
<
true
>
(
async
()
=>
{
return
environment
.
status
===
'
USER_CANCELED
'
?
true
:
undefined
;
});
chai
.
assert
.
equal
(
environmentService
.
testGetEnvironments
().
size
,
2
,
"
as env not reused, so only 2 envs should be here.
"
);
const
trials
=
await
trialDispatcher
.
listTrialJobs
();
chai
.
assert
.
equal
(
trials
.
length
,
2
,
"
there should be 2 trials
"
);
});
// When the environment service reports that no further environments are available,
// a second trial must still run and the environment count must stay at 1.
it('no more env', async () => {
    const trialDetail1 = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);

    // set to no more environment
    // (the argument is the new value of `hasMoreEnvironments`, so `false` means "no more").
    environmentService.testSetNoMoreEnvironment(false);

    const trialDetail2 = await newTrial(trialDispatcher);

    await verifyTrialRunning(commandChannel, trialDetail1);
    await verifyTrialResult(commandChannel, trialDetail1, 0);
    await verifyTrialRunning(commandChannel, trialDetail2);
    await verifyTrialResult(commandChannel, trialDetail2, -1);

    // The previous message claimed the env was "not reused", which is the opposite of what
    // this assertion proves.
    chai.assert.equal(environmentService.testGetEnvironments().size, 1, "as no more env is available, the env is reused, so only 1 env should be here.");
    const trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 2, "there should be 2 trials");
});
// Two trials submitted back to back: two environments are expected, and both trials
// finish with exit code 0.
it('2trial2env', async () => {
    const firstTrial = await newTrial(trialDispatcher);
    const secondTrial = await newTrial(trialDispatcher);
    await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);

    // Drive each trial through "running" and then "finished", in submission order.
    for (const trial of [firstTrial, secondTrial]) {
        await verifyTrialRunning(commandChannel, trial);
        await verifyTrialResult(commandChannel, trial, 0);
    }

    const environmentCount = environmentService.testGetEnvironments().size;
    chai.assert.equal(environmentCount, 2, "2 envs should be here.");

    const listedTrials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(listedTrials.length, 2, "there should be 2 trials");
});
// Two trials occupy two environments; a third trial submitted afterwards must run
// without a third environment being created.
it('3trial2env', async () => {
    const trialDetail1 = await newTrial(trialDispatcher);
    const trialDetail2 = await newTrial(trialDispatcher);
    await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
    await verifyTrialRunning(commandChannel, trialDetail1);
    await verifyTrialResult(commandChannel, trialDetail1, 0);
    await verifyTrialRunning(commandChannel, trialDetail2);
    await verifyTrialResult(commandChannel, trialDetail2, 0);

    chai.assert.equal(environmentService.testGetEnvironments().size, 2, "2 envs should be here.");
    let trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 2, "there should be 2 trials");

    // Third trial: no waitEnvironment call, the environment count must stay at 2.
    const trialDetail3 = await newTrial(trialDispatcher);
    await verifyTrialRunning(commandChannel, trialDetail3);
    await verifyTrialResult(commandChannel, trialDetail3, 0);

    chai.assert.equal(environmentService.testGetEnvironments().size, 2, "2 envs should be here.");
    trials = await trialDispatcher.listTrialJobs();
    // The message now matches the expected value (it used to say "2 trials").
    chai.assert.equal(trials.length, 3, "there should be 3 trials");
});
// Cancels two trials, one after the other, and checks that each cancellation sends a
// KILL_TRIAL_JOB command carrying the trial id. The second argument of cancelTrialJob
// selects the final status asserted at the end: `false` -> 'USER_CANCELED',
// `true` -> 'EARLY_STOPPED'.
it('stop trial', async () => {
    // First trial: cancelled with isEarlyStopped = false.
    const trialDetail1 = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    await verifyTrialRunning(commandChannel, trialDetail1);
    await trialDispatcher.cancelTrialJob(trialDetail1.id, false);

    let command = await waitResultMust<Command>(async () => {
        return await commandChannel.testReceiveCommandFromTrialDispatcher();
    });
    chai.assert.equal(command.command, KILL_TRIAL_JOB);
    log.info(`command: ${JSON.stringify(command)}`);
    chai.assert.equal(command.data, trialDetail1.id);
    // Wait until the dispatcher has moved the trial out of RUNNING.
    await waitResultMust<boolean>(async () => {
        return trialDetail1.status !== 'RUNNING' ? true : undefined;
    });

    // Second trial: cancelled with isEarlyStopped = true. No new environment is awaited.
    const trialDetail2 = await newTrial(trialDispatcher);
    await verifyTrialRunning(commandChannel, trialDetail2);
    await trialDispatcher.cancelTrialJob(trialDetail2.id, true);

    command = await waitResultMust<Command>(async () => {
        return await commandChannel.testReceiveCommandFromTrialDispatcher();
    });
    chai.assert.equal(command.command, KILL_TRIAL_JOB);
    log.info(`command: ${JSON.stringify(command)}`);
    chai.assert.equal(command.data, trialDetail2.id);
    await waitResultMust<boolean>(async () => {
        return trialDetail2.status !== 'RUNNING' ? true : undefined;
    });

    // Messages corrected: two trials are involved, not one.
    chai.assert.equal(environmentService.testGetEnvironments().size, 1, "trials ran one after another, so one env");
    const trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 2, "there should be 2 stopped trials");

    let trial = await trialDispatcher.getTrialJob(trialDetail1.id);
    chai.assert.equal<TrialJobStatus>(trial.status, 'USER_CANCELED', `trial is canceled.`);
    trial = await trialDispatcher.getTrialJob(trialDetail2.id);
    chai.assert.equal<TrialJobStatus>(trial.status, 'EARLY_STOPPED', `trial is earlier stopped.`);
});
// Sends two parameter updates to one running trial and checks that each produces a
// SEND_TRIAL_JOB_PARAMETER command carrying the trial id and the serialized content.
it('multi phase', async () => {
    const trial = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    await verifyTrialRunning(commandChannel, trial);

    // The same update/verify sequence is run for `test: 2` and then `test: 3`.
    for (const phaseValue of [2, 3]) {
        const content = { test: phaseValue };
        await trialDispatcher.updateTrialJob(trial.id, createTrialForm(content));

        const received = await waitResultMust<Command>(
            () => commandChannel.testReceiveCommandFromTrialDispatcher());
        chai.assert.equal(received.command, SEND_TRIAL_JOB_PARAMETER);
        chai.assert.equal(received.data["trialId"], trial.id);
        chai.assert.equal(received.data.parameters.index, 0);
        chai.assert.equal(received.data.parameters.value, JSON.stringify(content));
    }

    await verifyTrialResult(commandChannel, trial, 0);

    const environmentCount = environmentService.testGetEnvironments().size;
    chai.assert.equal(environmentCount, 1, "only one trial, so one env");
    const listedTrials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(listedTrials.length, 1, "there should be 1 stopped trial only");
});
// Runs one trial in an environment requested with the two extra waitEnvironment
// arguments `2, 2` and expects the environment to report two nodes.
// NOTE(review): the extra arguments look like (gpu count, node count) - confirm against waitEnvironment.
it('multi node', async () => {
    const trial = await newTrial(trialDispatcher);
    const multiNodeEnvironment = await waitEnvironment(
        1, previousEnvironments, environmentService, commandChannel, 2, 2);
    log.debug(`environment ${JSON.stringify(multiNodeEnvironment)}`);

    await verifyTrialRunning(commandChannel, trial);
    await verifyTrialResult(commandChannel, trial, 0);
    chai.assert.equal(multiNodeEnvironment.nodes.size, 2);

    // After the result is reported, a KILL_TRIAL_JOB command is expected from the dispatcher.
    const received = await waitResultMust<Command>(
        () => commandChannel.testReceiveCommandFromTrialDispatcher());
    chai.assert.equal(received.command, KILL_TRIAL_JOB);

    const environmentCount = environmentService.testGetEnvironments().size;
    chai.assert.equal(environmentCount, 1, "only one trial, so one env");
    const listedTrials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(listedTrials.length, 1, "there should be 1 stopped trial only");
});
// Once the first environment is flagged 'SUCCEEDED', the next trial must be served by
// a second environment.
it('env timeout', async () => {
    const firstTrial = await newTrial(trialDispatcher);
    const firstEnvironment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    await verifyTrialRunning(commandChannel, firstTrial);
    await verifyTrialResult(commandChannel, firstTrial, 0);

    // Mark the environment as ended and wait until that status is observable.
    environmentService.testSetEnvironmentStatus(firstEnvironment, 'SUCCEEDED');
    await waitResultMust<boolean>(async () => {
        if (firstEnvironment.status === 'SUCCEEDED') {
            return true;
        }
        return undefined;
    });

    const secondTrial = await newTrial(trialDispatcher);
    await waitEnvironment(2, previousEnvironments, environmentService, commandChannel);
    await verifyTrialRunning(commandChannel, secondTrial);
    await verifyTrialResult(commandChannel, secondTrial, 0);

    chai.assert.equal(previousEnvironments.size, 2, "as an env timeout, so 2 envs should be here.");
    const listedTrials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(listedTrials.length, 2, "there should be 2 trials");
});
// If the environment fails while a trial is running in it, the trial must end up FAILED too.
it('env failed with trial', async () => {
    const trial = await newTrial(trialDispatcher);
    const hostEnvironment = await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    await verifyTrialRunning(commandChannel, trial);

    environmentService.testSetEnvironmentStatus(hostEnvironment, 'FAILED');

    // Wait for the environment status first, then for the trial status to follow.
    await waitResultMust<boolean>(async () => {
        if (hostEnvironment.status === 'FAILED') {
            return true;
        }
        return undefined;
    });
    await waitResultMust<boolean>(async () => {
        if (trial.status === 'FAILED') {
            return true;
        }
        return undefined;
    });

    chai.assert.equal<TrialJobStatus>(trial.status, 'FAILED', "env failed, so trial also failed.");
});
// No trial config is set in this test, so the command that starts the trial is
// expected to carry no "gpuIndices" field.
it('GPUScheduler disabled gpuNum === undefined', async () => {
    const trial = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);

    const startCommand = await verifyTrialRunning(commandChannel, trial);
    await verifyTrialResult(commandChannel, trial, 0);

    const assignedGpus = startCommand.data["gpuIndices"];
    chai.assert.equal(assignedGpus, undefined);
});
// With `gpuNum: 0` the command that starts the trial must carry an empty "gpuIndices" string.
it('GPUScheduler disabled gpuNum === 0', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 0,
        }));

    const trialDetail = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    const command = await verifyTrialRunning(commandChannel, trialDetail);
    await verifyTrialResult(commandChannel, trialDetail, 0);

    chai.assert.equal(command.data["gpuIndices"], "");
});
// With `gpuNum: 1` and the default waitEnvironment arguments, the trial must be
// assigned GPU index "0".
it('GPUScheduler enable no cluster gpu config', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 1,
        }));

    const trialDetail = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    const command = await verifyTrialRunning(commandChannel, trialDetail);
    await verifyTrialResult(commandChannel, trialDetail, 0);

    chai.assert.equal(command.data["gpuIndices"], "0");
});
// The trial config carries no `gpuNum` at all, so the command that starts the trial
// is expected to have no "gpuIndices" field.
it('GPUScheduler skipped no GPU info', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
        }));

    const trialDetail = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    const command = await verifyTrialRunning(commandChannel, trialDetail);
    await verifyTrialResult(commandChannel, trialDetail, 0);

    chai.assert.equal(command.data["gpuIndices"], undefined);
});
// With `gpuNum: 0` the command that starts the trial must carry an empty "gpuIndices" string.
// NOTE(review): despite the title, this body never requests a multi-node environment
// (compare the 'multi node' test, which passes `2, 2` to waitEnvironment); as written it
// repeats the 'gpuNum === 0' case. Confirm the intended scenario.
it('GPUScheduler disabled multi-node', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 0,
        }));

    const trialDetail = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    const command = await verifyTrialRunning(commandChannel, trialDetail);
    await verifyTrialResult(commandChannel, trialDetail, 0);

    chai.assert.equal(command.data["gpuIndices"], "");
});
// Two trials that each need one GPU must share a single environment and be assigned
// GPU "0" and GPU "1" respectively.
it('GPUScheduler enabled 2 gpus 2 trial', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 1,
        }));

    const trialDetail1 = await newTrial(trialDispatcher);
    const trialDetail2 = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);

    // Both trials must be running (each on its own GPU) before either one finishes.
    let command = await verifyTrialRunning(commandChannel, trialDetail1);
    chai.assert.equal(command.data["gpuIndices"], "0");
    command = await verifyTrialRunning(commandChannel, trialDetail2);
    chai.assert.equal(command.data["gpuIndices"], "1");

    await verifyTrialResult(commandChannel, trialDetail1, 0);
    await verifyTrialResult(commandChannel, trialDetail2, 0);

    chai.assert.equal(environmentService.testGetEnvironments().size, 1);
    const trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 2, "there should be 2 trials");
});
// Two trials that each need two GPUs must share one environment (requested with the extra
// waitEnvironment argument `4`) and be assigned GPUs "0,1" and "2,3" respectively.
it('GPUScheduler enabled 4 gpus 2 trial(need 2 gpus)', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 2,
        }));

    const trialDetail1 = await newTrial(trialDispatcher);
    const trialDetail2 = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 4);

    // Both trials must be running (on disjoint GPU pairs) before either one finishes.
    let command = await verifyTrialRunning(commandChannel, trialDetail1);
    chai.assert.equal(command.data["gpuIndices"], "0,1");
    command = await verifyTrialRunning(commandChannel, trialDetail2);
    chai.assert.equal(command.data["gpuIndices"], "2,3");

    await verifyTrialResult(commandChannel, trialDetail1, 0);
    await verifyTrialResult(commandChannel, trialDetail2, 0);

    chai.assert.equal(environmentService.testGetEnvironments().size, 1);
    const trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 2, "there should be 2 trials");
});
// The environment is requested with `4` but its `usableGpus` is restricted to [3], so the
// single trial must be assigned GPU "3".
it('GPUScheduler enabled use 4 gpus but only 1 usable(4)', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 1,
        }));

    const trialDetail = await newTrial(trialDispatcher);
    // The callback restricts which GPUs of the environment may be used.
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 4, 1, async (environment) => {
        environment.usableGpus = [3];
    });

    const command = await verifyTrialRunning(commandChannel, trialDetail);
    chai.assert.equal(command.data["gpuIndices"], "3");
    await verifyTrialResult(commandChannel, trialDetail, 0);

    chai.assert.equal(environmentService.testGetEnvironments().size, 1);
    const trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 1);
});
// Each environment is requested with the extra waitEnvironment argument `1`. While the first
// trial still holds GPU "0", a second trial must cause a second environment to be requested
// and must be assigned GPU "0" there.
it('GPUScheduler enabled TMP_NO_AVAILABLE_GPU, request new env', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 1,
        }));

    const trialDetail1 = await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 1);
    let command = await verifyTrialRunning(commandChannel, trialDetail1);
    chai.assert.equal(command.data["gpuIndices"], "0");

    // The first trial is still running here, so the second one needs a new environment.
    const trialDetail2 = await newTrial(trialDispatcher);
    await waitEnvironment(2, previousEnvironments, environmentService, commandChannel, 1);
    await verifyTrialResult(commandChannel, trialDetail1, 0);

    command = await verifyTrialRunning(commandChannel, trialDetail2);
    await verifyTrialResult(commandChannel, trialDetail2, 0);
    chai.assert.equal(command.data["gpuIndices"], "0");

    chai.assert.equal(environmentService.testGetEnvironments().size, 2, 'environments');
    const trials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(trials.length, 2, 'trials');
});
// A trial asking for 8 GPUs must make the dispatcher's run promise reject with an
// NNIError mentioning "REQUIRE_EXCEED_TOTAL".
it('GPUScheduler enabled REQUIRE_EXCEED_TOTAL, need fail', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 8,
        }));

    await newTrial(trialDispatcher);
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel);
    await chai.expect(trialRunPromise).rejectedWith(NNIError, "REQUIRE_EXCEED_TOTAL");

    // Swap the rejected promise for an already-resolved one.
    // NOTE(review): presumably so that shared teardown awaiting trialRunPromise does not
    // re-throw the rejection - confirm against the suite's afterEach hook.
    const deferred = new Deferred<void>();
    trialRunPromise = deferred.promise;
    deferred.resolve();
});
// Submits six one-GPU trials against two environments that each set
// `maxTrialNumberPerGpu = 2`. Only the first four trials are counted per GPU index; the
// count must come out as two trials on GPU "0" and two on GPU "1".
// NOTE(review): the title says "4 trials" although six are submitted; the title is left
// unchanged so test filters keep matching.
it('GPUScheduler enabled maxTrialNumberPerGpu=2, 4 trials, 2 gpus', async () => {
    // Awaited so that a failing configuration call fails this test instead of
    // being left as a floating promise.
    await trialDispatcher.setClusterMetadata(
        TrialConfigMetadataKey.TRIAL_CONFIG,
        JSON.stringify({
            reuseEnvironment: false,
            codeDir: currentDir,
            gpuNum: 1,
        }));

    const totalTrialCount = 6;
    // Number of leading trials whose GPU assignment is tallied below.
    const countedTrialCount = 4;

    const trials = [];
    // last two trials shouldn't be in first environment.
    for (let index = 0; index < totalTrialCount; index++) {
        const trial = await newTrial(trialDispatcher);
        trials.push(trial);
    }

    // Both environments are requested with the extra arguments `2, 1` and allow two trials per GPU.
    await waitEnvironment(1, previousEnvironments, environmentService, commandChannel, 2, 1, async (environment) => {
        environment.maxTrialNumberPerGpu = 2;
    });
    await waitEnvironment(2, previousEnvironments, environmentService, commandChannel, 2, 1, async (environment) => {
        environment.maxTrialNumberPerGpu = 2;
    });

    // gpuIndices string -> number of counted trials assigned to it.
    const gpuIndexMap = new Map<string, number>();
    for (let index = 0; index < totalTrialCount; index++) {
        const trial = trials[index];
        // Every trial must reach the running state, but only the first four are tallied.
        const command = await verifyTrialRunning(commandChannel, trial);
        if (index < countedTrialCount) {
            const gpuIndex = command.data["gpuIndices"];
            const trialNumbers = gpuIndexMap.get(gpuIndex);
            gpuIndexMap.set(gpuIndex, trialNumbers === undefined ? 1 : trialNumbers + 1);
        }
    }
    chai.assert.equal(gpuIndexMap.size, 2);
    chai.assert.equal(gpuIndexMap.get("0"), 2);
    chai.assert.equal(gpuIndexMap.get("1"), 2);

    for (let index = 0; index < totalTrialCount; index++) {
        const trial = trials[index];
        await verifyTrialResult(commandChannel, trial, 0);
    }

    chai.assert.equal(environmentService.testGetEnvironments().size, 2);
    const listedTrials = await trialDispatcher.listTrialJobs();
    chai.assert.equal(listedTrials.length, totalTrialCount);
});
});
src/nni_manager/training_service/reusable/test/utCommandChannel.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
encodeCommand
}
from
"
../../../core/ipcInterface
"
;
import
{
Command
,
CommandChannel
,
RunnerConnection
}
from
"
../commandChannel
"
;
import
{
Channel
,
EnvironmentInformation
}
from
"
../environment
"
;
/**
 * RunnerConnection subclass that adds no members of its own; UtCommandChannel
 * returns instances of it from createRunnerConnection.
 */
class UtRunnerConnection extends RunnerConnection {}
/**
 * Command channel for unit tests. Commands sent through the channel are kept in an
 * in-memory queue that tests drain with testReceiveCommandFromTrialDispatcher, and tests
 * can feed commands in with testSendCommandToTrialDispatcher. The lifecycle methods
 * (config / start / stop / run) are no-ops.
 */
export class UtCommandChannel extends CommandChannel {
    // Commands captured by sendCommandInternal, oldest first.
    private readonly receivedCommands: Command[] = [];

    /** Name under which this channel identifies itself. */
    public get channelName(): Channel {
        return "ut";
    }

    /**
     * Encodes a command and passes it to handleCommand for the given environment.
     *
     * @param environment environment the command is reported for
     * @param commandType command type passed to encodeCommand
     * @param commandData payload; serialized with JSON.stringify before encoding
     */
    public async testSendCommandToTrialDispatcher(environment: EnvironmentInformation, commandType: string, commandData: any) {
        const payload = JSON.stringify(commandData);
        const encoded = encodeCommand(commandType, payload);
        this.log.debug(`UtCommandChannel: env ${environment.id} send test command ${encoded}`);
        const text = encoded.toString("utf8");
        this.handleCommand(environment, text);
    }

    /**
     * Removes and returns the oldest captured command.
     *
     * @returns the command, or undefined when the queue is empty
     */
    public async testReceiveCommandFromTrialDispatcher(): Promise<Command | undefined> {
        const oldest = this.receivedCommands.shift();
        return oldest;
    }

    public async config(_key: string, value: any): Promise<void> {
        // do nothing
    }

    public async start(): Promise<void> {
        // do nothing
    }

    public async stop(): Promise<void> {
        // do nothing
    }

    public async run(): Promise<void> {
        // do nothing
    }

    /**
     * Parses the message with parseCommands and queues one Command per parsed entry
     * instead of sending anything.
     */
    protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise<void> {
        for (const parsed of this.parseCommands(message)) {
            // Each parsed entry supplies the two remaining Command constructor arguments.
            this.receivedCommands.push(new Command(environment, parsed[0], parsed[1]));
        }
    }

    /** Returns a fresh UtRunnerConnection for the environment. */
    protected createRunnerConnection(environment: EnvironmentInformation): RunnerConnection {
        // do nothing
        const connection = new UtRunnerConnection(environment);
        return connection;
    }
}
src/nni_manager/training_service/reusable/test/utEnvironmentService.ts
0 → 100644
View file @
143c6615
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
{
EnvironmentInformation
,
EnvironmentService
,
EnvironmentStatus
}
from
"
../environment
"
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
CommandChannel
}
from
"
../commandChannel
"
;
import
{
UtCommandChannel
}
from
"
./utCommandChannel
"
;
/**
 * Environment service for unit tests. It never starts anything real: started environments
 * are only recorded in a map, and the `test*` methods let a test inspect or change the
 * recorded state directly.
 */
export class UtEnvironmentService extends EnvironmentService {
    // Set by createCommandChannel; undefined until then.
    private commandChannel: UtCommandChannel | undefined;
    // Every environment passed to startEnvironment, keyed by environment id.
    private readonly allEnvironments = new Map<string, EnvironmentInformation>();
    // Value reported by the hasMoreEnvironments getter.
    private hasMoreEnvironmentsInternal = true;

    constructor() {
        super();
    }

    public get hasStorageService(): boolean {
        // storage service is tested by integration testing.
        return false;
    }

    // NOTE(review): the unit of this interval is defined by the base class, not visible here.
    public get environmentMaintenceLoopInterval(): number {
        return 1;
    }

    /** Overwrites the status of the given environment. */
    public testSetEnvironmentStatus(environment: EnvironmentInformation, newStatus: EnvironmentStatus): void {
        environment.status = newStatus;
    }

    /**
     * Restores the state tests can change: forgets all recorded environments and makes
     * hasMoreEnvironments report true again, so a test that called
     * testSetNoMoreEnvironment(false) cannot leak that setting into a later test
     * that reuses this instance.
     */
    public testReset(): void {
        this.allEnvironments.clear();
        this.hasMoreEnvironmentsInternal = true;
    }

    /** Returns the live map of recorded environments (not a copy). */
    public testGetEnvironments(): Map<string, EnvironmentInformation> {
        return this.allEnvironments;
    }

    /**
     * Returns the channel created by createCommandChannel.
     *
     * @throws Error when createCommandChannel has not been called yet
     */
    public testGetCommandChannel(): UtCommandChannel {
        if (this.commandChannel === undefined) {
            throw new Error(`command channel shouldn't be undefined.`);
        }
        return this.commandChannel;
    }

    /**
     * Sets the value reported by hasMoreEnvironments. Despite the method name, the argument
     * is that value itself: pass `false` to signal that no more environments are available.
     */
    public testSetNoMoreEnvironment(hasMore: boolean): void {
        this.hasMoreEnvironmentsInternal = hasMore;
    }

    public get hasMoreEnvironments(): boolean {
        return this.hasMoreEnvironmentsInternal;
    }

    /** Creates a UtCommandChannel and remembers it for testGetCommandChannel. */
    public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
        this.commandChannel = new UtCommandChannel(commandEmitter);
        return this.commandChannel;
    }

    public async config(_key: string, _value: string): Promise<void> {
        // do nothing
    }

    public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
        // do nothing
    }

    /**
     * Records the environment and sets its status to "WAITING". An environment whose id is
     * already recorded is left untouched.
     */
    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
        if (!this.allEnvironments.has(environment.id)) {
            this.allEnvironments.set(environment.id, environment);
            environment.status = "WAITING";
        }
    }

    /** Sets the environment status to "USER_CANCELED"; the environment stays recorded. */
    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
        environment.status = "USER_CANCELED";
    }
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment