Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
3bce6926
Unverified
Commit
3bce6926
authored
Jul 26, 2021
by
Zhenhua Han
Committed by
GitHub
Jul 26, 2021
Browse files
extend reusable training service to support placement constraint (#3897)
parent
b0de7c93
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
327 additions
and
59 deletions
+327
-59
nni/common/device.py
nni/common/device.py
+29
-0
nni/experiment/config/local.py
nni/experiment/config/local.py
+1
-0
nni/retiarii/execution/base.py
nni/retiarii/execution/base.py
+1
-0
nni/retiarii/execution/cgo_engine.py
nni/retiarii/execution/cgo_engine.py
+2
-1
nni/retiarii/experiment/pytorch.py
nni/retiarii/experiment/pytorch.py
+14
-1
nni/retiarii/integration.py
nni/retiarii/integration.py
+29
-2
nni/retiarii/integration_api.py
nni/retiarii/integration_api.py
+2
-2
nni/runtime/msg_dispatcher_base.py
nni/runtime/msg_dispatcher_base.py
+1
-1
test/retiarii_test/darts/.nniignore
test/retiarii_test/darts/.nniignore
+2
-0
test/retiarii_test/darts/test.py
test/retiarii_test/darts/test.py
+1
-1
test/retiarii_test/darts/test_training_service.py
test/retiarii_test/darts/test_training_service.py
+64
-0
ts/nni_manager/common/experimentConfig.ts
ts/nni_manager/common/experimentConfig.ts
+1
-0
ts/nni_manager/common/trainingService.ts
ts/nni_manager/common/trainingService.ts
+20
-1
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+18
-12
ts/nni_manager/test/training_service/remoteMachineTrainingService.test.ts
...est/training_service/remoteMachineTrainingService.test.ts
+4
-0
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+1
-0
ts/nni_manager/training_service/reusable/gpuScheduler.ts
ts/nni_manager/training_service/reusable/gpuScheduler.ts
+134
-35
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+3
-3
No files found.
nni/common/device.py
0 → 100644
View file @
3bce6926
from
dataclasses
import
dataclass
from
typing
import
Literal
@dataclass
class GPUDevice:
    """Identity and status of a single GPU on a (possibly remote) node.

    Two devices compare equal when they refer to the same physical GPU
    (same ``node_id`` and ``gpu_id``); ``status`` is deliberately excluded
    from equality, ordering and hashing so a device keeps its identity
    while its availability changes.
    """
    node_id: str
    gpu_id: int
    status: Literal['idle', 'busy', 'unknown'] = 'idle'

    def __eq__(self, o) -> bool:
        # Identity comparison only: node + GPU index; status is ignored.
        return self.node_id == o.node_id and self.gpu_id == o.gpu_id

    def __lt__(self, o) -> bool:
        # Sort by node first, then by GPU index within the node.
        if self.node_id != o.node_id:
            return self.node_id < o.node_id
        return self.gpu_id < o.gpu_id

    def __repr__(self) -> str:
        return "{Environment %s, GPU %d, Status %s}" % (self.node_id, self.gpu_id, self.status)

    def __hash__(self) -> int:
        # Bug fix: gpu_id is an int, so the original expression
        # `self.node_id + '_' + self.gpu_id` raised TypeError on every
        # hash() call; convert to str before concatenating. Must stay
        # consistent with __eq__ (status excluded).
        return hash(self.node_id + '_' + str(self.gpu_id))

    def set_status(self, status):
        """Update the availability status ('idle' / 'busy' / 'unknown')."""
        self.status = status
nni/experiment/config/local.py
View file @
3bce6926
...
...
@@ -12,6 +12,7 @@ __all__ = ['LocalConfig']
@
dataclass
(
init
=
False
)
class
LocalConfig
(
TrainingServiceConfig
):
platform
:
str
=
'local'
reuse_mode
:
bool
=
False
use_active_gpu
:
Optional
[
bool
]
=
None
max_trial_number_per_gpu
:
int
=
1
gpu_indices
:
Union
[
List
[
int
],
str
,
int
,
None
]
=
None
...
...
nni/retiarii/execution/base.py
View file @
3bce6926
...
...
@@ -14,6 +14,7 @@ from ..integration_api import send_trial, receive_trial_parameters, get_advisor
_logger
=
logging
.
getLogger
(
__name__
)
class
BaseGraphData
:
def
__init__
(
self
,
model_script
:
str
,
evaluator
:
Evaluator
)
->
None
:
self
.
model_script
=
model_script
...
...
nni/retiarii/execution/cgo_engine.py
View file @
3bce6926
...
...
@@ -17,7 +17,7 @@ _logger = logging.getLogger(__name__)
class
CGOExecutionEngine
(
AbstractExecutionEngine
):
def
__init__
(
self
,
n_model_per_graph
=
4
)
->
None
:
def
__init__
(
self
,
devices
=
None
,
n_model_per_graph
=
4
)
->
None
:
self
.
_listeners
:
List
[
AbstractGraphListener
]
=
[]
self
.
_running_models
:
Dict
[
int
,
Model
]
=
dict
()
self
.
logical_plan_counter
=
0
...
...
@@ -25,6 +25,7 @@ class CGOExecutionEngine(AbstractExecutionEngine):
self
.
_optimizers
=
[
DedupInputOptimizer
()]
self
.
_original_models
=
{}
self
.
_original_model_to_multi_model
=
{}
self
.
devices
=
[]
if
devices
is
None
else
devices
# register advisor callbacks
advisor
=
get_advisor
()
...
...
nni/retiarii/experiment/pytorch.py
View file @
3bce6926
...
...
@@ -25,6 +25,7 @@ from nni.experiment.config import util
from
nni.experiment.config.base
import
ConfigBase
,
PathLike
from
nni.experiment.pipe
import
Pipe
from
nni.tools.nnictl.command_utils
import
kill_command
from
nni.common.device
import
GPUDevice
from
..codegen
import
model_to_pytorch_script
from
..converter
import
convert_to_graph
...
...
@@ -193,13 +194,14 @@ class RetiariiExperiment(Experiment):
"""
atexit
.
register
(
self
.
stop
)
devices
=
self
.
_construct_devices
()
# we will probably need a execution engine factory to make this clean and elegant
if
self
.
config
.
execution_engine
==
'base'
:
from
..execution.base
import
BaseExecutionEngine
engine
=
BaseExecutionEngine
()
elif
self
.
config
.
execution_engine
==
'cgo'
:
from
..execution.cgo_engine
import
CGOExecutionEngine
engine
=
CGOExecutionEngine
()
engine
=
CGOExecutionEngine
(
devices
=
devices
)
elif
self
.
config
.
execution_engine
==
'py'
:
from
..execution.python
import
PurePythonExecutionEngine
engine
=
PurePythonExecutionEngine
()
...
...
@@ -241,6 +243,17 @@ class RetiariiExperiment(Experiment):
_logger
.
info
(
'Waiting for experiment to become DONE (you can ctrl+c if there is no running trial jobs)...'
)
exp_status_checker
.
join
()
def _construct_devices(self):
    """Build the list of GPUDevice objects known to this experiment.

    Remote training services expose a ``machine_list``; each machine
    contributes one device per configured GPU index, keyed by its host.
    Otherwise the training service is assumed to be local and its own
    ``gpu_indices`` are used with the fixed node id ``'local'``.

    Returns
    -------
    list
        GPUDevice instances; empty when no GPU indices are configured.
    """
    devices = []
    if hasattr(self.config.training_service, 'machine_list'):
        # Remote: one device per (machine, gpu index) pair.
        for machine in self.config.training_service.machine_list:
            # gpu_indices defaults to None when the user did not set it;
            # treat that as "no GPUs" instead of raising TypeError.
            for gpu_idx in machine.gpu_indices or []:
                devices.append(GPUDevice(machine.host, gpu_idx))
    else:
        # Local training service: same None guard as above.
        for gpu_idx in self.config.training_service.gpu_indices or []:
            devices.append(GPUDevice('local', gpu_idx))
    return devices
def
_create_dispatcher
(
self
):
return
self
.
_dispatcher
...
...
nni/retiarii/integration.py
View file @
3bce6926
...
...
@@ -68,7 +68,27 @@ class RetiariiAdvisor(MsgDispatcherBase):
self
.
handle_update_search_space
(
data
)
send
(
CommandType
.
Initialized
,
''
)
def
send_trial
(
self
,
parameters
):
def
_validate_placement_constraint
(
self
,
placement_constraint
):
if
placement_constraint
is
None
:
raise
ValueError
(
'placement_constraint is None'
)
if
not
'type'
in
placement_constraint
:
raise
ValueError
(
'placement_constraint must have `type`'
)
if
not
'gpus'
in
placement_constraint
:
raise
ValueError
(
'placement_constraint must have `gpus`'
)
if
placement_constraint
[
'type'
]
not
in
[
'None'
,
'GPUNumber'
,
'Device'
]:
raise
ValueError
(
'placement_constraint.type must be either `None`,. `GPUNumber` or `Device`'
)
if
placement_constraint
[
'type'
]
==
'None'
and
len
(
placement_constraint
[
'gpus'
])
>
0
:
raise
ValueError
(
'placement_constraint.gpus must be an empty list when type == None'
)
if
placement_constraint
[
'type'
]
==
'Device'
and
len
(
placement_constraint
[
'gpus'
])
!=
1
:
raise
ValueError
(
'placement_constraint.gpus must be a list of number (currently only support one host)'
)
if
placement_constraint
[
'type'
]
==
'Device'
:
for
e
in
placement_constraint
[
'gpus'
]:
if
not
isinstance
(
e
,
tuple
):
raise
ValueError
(
'placement_constraint.gpus must be a list of tuple when type == Device'
)
if
not
(
len
(
e
)
==
2
and
isinstance
(
e
[
0
],
str
)
and
isinstance
(
e
[
1
],
int
)):
raise
ValueError
(
'placement_constraint.gpus`s tuple must be (str, int)'
)
def
send_trial
(
self
,
parameters
,
placement_constraint
=
None
):
"""
Send parameters to NNI.
...
...
@@ -84,10 +104,17 @@ class RetiariiAdvisor(MsgDispatcherBase):
which will be used for identification in future.
"""
self
.
parameters_count
+=
1
if
placement_constraint
is
None
:
placement_constraint
=
{
'type'
:
'None'
,
'gpus'
:
[]
}
self
.
_validate_placement_constraint
(
placement_constraint
)
new_trial
=
{
'parameter_id'
:
self
.
parameters_count
,
'parameters'
:
parameters
,
'parameter_source'
:
'algorithm'
'parameter_source'
:
'algorithm'
,
'placement_constraint'
:
placement_constraint
}
_logger
.
debug
(
'New trial sent: %s'
,
new_trial
)
send
(
CommandType
.
NewTrialJob
,
json_dumps
(
new_trial
))
...
...
nni/retiarii/integration_api.py
View file @
3bce6926
...
...
@@ -27,12 +27,12 @@ def register_advisor(advisor: 'RetiariiAdvisor'):
_advisor
=
advisor
def send_trial(parameters: dict, placement_constraint=None) -> int:
    """
    Send a new trial. Executed on tuner end.
    Return a ID that is the unique identifier for this trial.

    Parameters
    ----------
    parameters : dict
        Hyper-parameters / graph payload for the new trial.
    placement_constraint : dict, optional
        Optional placement constraint forwarded to the advisor; None means
        no constraint (the advisor substitutes its default).
    """
    # NOTE: this span previously contained both the pre- and post-change
    # definitions (diff residue); this is the consolidated version that
    # forwards the placement constraint to the registered advisor.
    return get_advisor().send_trial(parameters, placement_constraint)
def
receive_trial_parameters
()
->
dict
:
...
...
nni/runtime/msg_dispatcher_base.py
View file @
3bce6926
...
...
@@ -242,4 +242,4 @@ class MsgDispatcherBase(Recoverable):
hyper_params: the string that is sent by message dispatcher during the creation of trials.
"""
raise
NotImplementedError
(
'handle_trial_end not implemented'
)
raise
NotImplementedError
(
'handle_trial_end not implemented'
)
\ No newline at end of file
test/retiarii_test/darts/.nniignore
0 → 100644
View file @
3bce6926
lightning_logs
data
\ No newline at end of file
test/retiarii_test/darts/test.py
View file @
3bce6926
...
...
@@ -45,4 +45,4 @@ if __name__ == '__main__':
exp_config
.
training_service
.
use_active_gpu
=
True
exp_config
.
training_service
.
gpu_indices
=
[
1
,
2
]
exp
.
run
(
exp_config
,
8081
)
exp
.
run
(
exp_config
,
8081
)
\ No newline at end of file
test/retiarii_test/darts/test_training_service.py
0 → 100644
View file @
3bce6926
import json
from nni.common.device import GPUDevice
import os
import sys
import torch
from pathlib import Path

import nni.retiarii.evaluator.pytorch.lightning as pl
import nni.retiarii.strategy as strategy
from nni.experiment import RemoteMachineConfig
from nni.retiarii import serialize
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from torchvision import transforms
from torchvision.datasets import CIFAR10

from darts_model import CNN

# Manual integration test: run a DARTS search on the reusable remote training
# service with placement-constraint support enabled (reuse_mode = True).
# NOTE(review): host/user/password/python_path are placeholders ('xxx') and
# must be replaced with a reachable machine before running.
if __name__ == '__main__':
    base_model = CNN(32, 3, 16, 10, 8)
    # Standard CIFAR-10 augmentation + normalization pipeline.
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    # serialize() wraps the dataset constructor so it can be re-instantiated
    # inside each trial process.
    train_dataset = serialize(CIFAR10, root='data/cifar10', train=True, download=True, transform=train_transform)
    test_dataset = serialize(CIFAR10, root='data/cifar10', train=False, download=True, transform=valid_transform)
    # Keep the run cheap: 1 epoch, 20% of the training batches.
    trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
                                max_epochs=1,
                                limit_train_batches=0.2)
    simple_strategy = strategy.Random()
    exp = RetiariiExperiment(base_model, trainer, [], simple_strategy)
    exp_config = RetiariiExeConfig('remote')
    exp_config.experiment_name = 'darts_search'
    exp_config.trial_concurrency = 2
    exp_config.max_trial_number = 10
    exp_config.trial_gpu_number = 1
    exp_config.training_service.use_active_gpu = True
    # reuse_mode routes trials through the reusable training service,
    # which is what this test exercises.
    exp_config.training_service.reuse_mode = True
    exp_config.training_service.gpu_indices = [0, 1, 2]
    # Single remote machine; credentials are placeholders.
    rm_conf = RemoteMachineConfig()
    rm_conf.host = '127.0.0.1'
    rm_conf.user = 'xxx'
    rm_conf.password = 'xxx'
    rm_conf.port = 22
    rm_conf.python_path = '/home/xxx/py38/bin'
    rm_conf.gpu_indices = [0, 1, 2]
    rm_conf.use_active_gpu = True
    rm_conf.max_trial_number_per_gpu = 3
    exp_config.training_service.machine_list = [rm_conf]
    exp_config.execution_engine = 'py'
    exp.run(exp_config, 8081)
ts/nni_manager/common/experimentConfig.ts
View file @
3bce6926
...
...
@@ -16,6 +16,7 @@ export interface TrainingServiceConfig {
export
interface
LocalConfig
extends
TrainingServiceConfig
{
platform
:
'
local
'
;
reuseMode
:
boolean
;
useActiveGpu
?:
boolean
;
maxTrialNumberPerGpu
:
number
;
gpuIndices
?:
number
[];
...
...
ts/nni_manager/common/trainingService.ts
View file @
3bce6926
...
...
@@ -18,12 +18,31 @@ interface HyperParameters {
readonly
index
:
number
;
}
// Kind of placement constraint a trial may carry:
//   'None'      - no constraint (scheduler uses the default GPU number);
//   'GPUNumber' - constrain by number of GPUs required;
//   'Device'    - pin the trial to specific (host, gpu-index) devices.
type PlacementConstraintType = 'None' | 'GPUNumber' | 'Device'

interface PlacementConstraint {
    readonly type: PlacementConstraintType;
    readonly gpus: Array<number> | Array<[string, number]>;
    /**
     * GPUNumber constraint is in form of Array<number>, e.g., [3] means it must be placed on a node of 3 GPUs
     *
     * Device constraint is in form of Array<[string,number]>, e.g., [('Node-0',1),('Node-1',0)] means it must be placed on
     * Node-0's GPU-1 and Node-1's GPU-0
     */
}

/**
 * define TrialJobApplicationForm
 */
interface TrialJobApplicationForm {
    readonly sequenceId: number;
    readonly hyperParameters: HyperParameters;
    // Optional: absent means no placement constraint for this trial.
    readonly placementConstraint?: PlacementConstraint;
}

// Wire format of the NEW_TRIAL_JOB command content sent by the Python-side
// dispatcher (snake_case keys come from the JSON produced by the advisor).
interface TrialCommandContent {
    readonly parameter_id: string;
    readonly parameters: string;
    readonly parameter_source: string;
    // Optional to stay compatible with tuners that predate placement constraints.
    readonly placement_constraint?: PlacementConstraint;
}
/**
...
...
@@ -101,5 +120,5 @@ class NNIManagerIpConfig {
export
{
TrainingService
,
TrainingServiceError
,
TrialJobStatus
,
TrialJobApplicationForm
,
TrainingServiceMetadata
,
TrialJobDetail
,
TrialJobMetric
,
HyperParameters
,
NNIManagerIpConfig
NNIManagerIpConfig
,
PlacementConstraint
,
TrialCommandContent
};
ts/nni_manager/core/nnimanager.ts
View file @
3bce6926
...
...
@@ -19,7 +19,7 @@ import { ExperimentConfig, toSeconds, toCudaVisibleDevices } from '../common/exp
import
{
ExperimentManager
}
from
'
../common/experimentManager
'
;
import
{
TensorboardManager
}
from
'
../common/tensorboardManager
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
,
TrialCommandContent
}
from
'
../common/trainingService
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
getTunerProc
,
getLogLevel
,
isAlive
,
killPid
}
from
'
../common/utils
'
;
import
{
...
...
@@ -178,7 +178,7 @@ class NNIManager implements Manager {
this
.
config
=
config
;
this
.
log
.
info
(
`Starting experiment:
${
this
.
experimentProfile
.
id
}
`
);
await
this
.
storeExperimentProfile
();
if
(
this
.
trainingService
===
undefined
)
{
this
.
log
.
info
(
'
Setup training service...
'
);
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
...
...
@@ -449,8 +449,12 @@ class NNIManager implements Manager {
if
(
!
platform
)
{
throw
new
Error
(
'
Cannot detect training service platform
'
);
}
const
reuseMode
=
Array
.
isArray
(
config
.
trainingService
)
||
(
config
.
trainingService
as
any
).
reuseMode
;
if
(
platform
===
'
local
'
)
{
if
(
reuseMode
)
{
const
module_
=
await
import
(
'
../training_service/reusable/routerTrainingService
'
);
return
await
module_
.
RouterTrainingService
.
construct
(
config
);
}
else
if
(
platform
===
'
local
'
)
{
const
module_
=
await
import
(
'
../training_service/local/localTrainingService
'
);
return
new
module_
.
LocalTrainingService
(
config
);
}
else
if
(
platform
===
'
frameworkcontroller
'
)
{
...
...
@@ -542,13 +546,13 @@ class NNIManager implements Manager {
private
async
stopTrialJobIfOverMaxDurationTimer
(
trialJobId
:
string
):
Promise
<
void
>
{
const
trialJobDetail
:
TrialJobDetail
|
undefined
=
this
.
trialJobs
.
get
(
trialJobId
);
if
(
undefined
!==
trialJobDetail
&&
if
(
undefined
!==
trialJobDetail
&&
trialJobDetail
.
status
===
'
RUNNING
'
&&
trialJobDetail
.
startTime
!==
undefined
){
const
isEarlyStopped
=
true
;
await
this
.
trainingService
.
cancelTrialJob
(
trialJobId
,
isEarlyStopped
);
this
.
log
.
info
(
`Trial job
${
trialJobId
}
has stoped because it is over maxTrialDuration.`
);
}
trialJobDetail
.
startTime
!==
undefined
)
{
const
isEarlyStopped
=
true
;
await
this
.
trainingService
.
cancelTrialJob
(
trialJobId
,
isEarlyStopped
);
this
.
log
.
info
(
`Trial job
${
trialJobId
}
has stoped because it is over maxTrialDuration.`
);
}
}
private
async
requestTrialJobsStatus
():
Promise
<
number
>
{
...
...
@@ -674,7 +678,7 @@ class NNIManager implements Manager {
this
.
currSubmittedTrialNum
++
;
this
.
log
.
info
(
'
submitTrialJob: form:
'
,
form
);
const
trialJobDetail
:
TrialJobDetail
=
await
this
.
trainingService
.
submitTrialJob
(
form
);
setTimeout
(
async
()
=>
this
.
stopTrialJobIfOverMaxDurationTimer
(
trialJobDetail
.
id
),
1000
*
this
.
maxTrialDuration
);
setTimeout
(
async
()
=>
this
.
stopTrialJobIfOverMaxDurationTimer
(
trialJobDetail
.
id
),
1000
*
this
.
maxTrialDuration
);
const
Snapshot
:
TrialJobDetail
=
Object
.
assign
({},
trialJobDetail
);
await
this
.
storeExperimentProfile
();
this
.
trialJobs
.
set
(
trialJobDetail
.
id
,
Snapshot
);
...
...
@@ -747,7 +751,7 @@ class NNIManager implements Manager {
private
async
onTrialJobMetrics
(
metric
:
TrialJobMetric
):
Promise
<
void
>
{
this
.
log
.
debug
(
'
NNIManager received trial job metrics:
'
,
metric
);
if
(
this
.
trialJobs
.
has
(
metric
.
id
)){
if
(
this
.
trialJobs
.
has
(
metric
.
id
))
{
await
this
.
dataStore
.
storeMetricData
(
metric
.
id
,
metric
.
data
);
if
(
this
.
dispatcher
===
undefined
)
{
throw
new
Error
(
'
Error: tuner has not been setup
'
);
...
...
@@ -796,12 +800,14 @@ class NNIManager implements Manager {
this
.
log
.
warning
(
'
It is not supposed to receive more trials after NO_MORE_TRIAL is set
'
);
this
.
setStatus
(
'
RUNNING
'
);
}
const
trialRequestContent
:
TrialCommandContent
=
JSON
.
parse
(
content
);
const
form
:
TrialJobApplicationForm
=
{
sequenceId
:
this
.
experimentProfile
.
nextSequenceId
++
,
hyperParameters
:
{
value
:
content
,
index
:
0
}
},
placementConstraint
:
trialRequestContent
.
placement_constraint
};
this
.
waitingTrials
.
push
(
form
);
break
;
...
...
ts/nni_manager/test/training_service/remoteMachineTrainingService.test.ts
View file @
3bce6926
...
...
@@ -125,6 +125,10 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
hyperParameters
:
{
value
:
'
mock hyperparameters
'
,
index
:
0
},
placementConstraint
:
{
type
:
"
None
"
,
gpus
:
[]
}
};
const
jobDetail
:
TrialJobDetail
=
await
remoteMachineTrainingService
.
submitTrialJob
(
form
);
...
...
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
3bce6926
...
...
@@ -248,6 +248,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
environment
.
command
=
await
this
.
getScript
(
environment
);
environment
.
useActiveGpu
=
rmMachineMeta
.
useActiveGpu
;
return
Promise
.
resolve
(
true
);
}
}
...
...
ts/nni_manager/training_service/reusable/gpuScheduler.ts
View file @
3bce6926
...
...
@@ -4,10 +4,12 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
PlacementConstraint
}
from
'
common/trainingService
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
ScheduleResultType
}
from
'
../common/gpuData
'
;
import
{
EnvironmentInformation
}
from
'
./environment
'
;
import
{
RemoteMachineEnvironmentInformation
}
from
'
./remote/remoteConfig
'
;
import
{
TrialDetail
}
from
'
./trial
'
;
type
SCHEDULE_POLICY_NAME
=
'
random
'
|
'
round-robin
'
|
'
recently-idle
'
;
...
...
@@ -51,46 +53,143 @@ export class GpuScheduler {
/**
* Schedule a machine according to the constraints (requiredGPUNum)
* @param
r
equiredGPUNum required GPU number
* @param
defaultR
equiredGPUNum
the default
required GPU number
when constraint.type === 'None'
*/
public
scheduleMachine
(
environments
:
EnvironmentInformation
[],
requiredGPUNum
:
number
|
undefined
,
trialDetail
:
TrialDetail
):
GpuScheduleResult
{
if
(
requiredGPUNum
===
undefined
)
{
requiredGPUNum
=
0
;
}
assert
(
requiredGPUNum
>=
0
);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const
eligibleEnvironments
:
EnvironmentInformation
[]
=
environments
.
filter
((
environment
:
EnvironmentInformation
)
=>
environment
.
defaultGpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
environment
.
defaultGpuSummary
.
gpuCount
>=
requiredGPUNum
));
if
(
eligibleEnvironments
.
length
===
0
)
{
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
return
({
resultType
:
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
,
public
scheduleMachine
(
environments
:
EnvironmentInformation
[],
constraint
:
PlacementConstraint
,
defaultRequiredGPUNum
:
number
|
undefined
,
trialDetail
:
TrialDetail
):
GpuScheduleResult
{
if
(
constraint
.
type
==
'
None
'
||
constraint
.
type
==
'
GPUNumber
'
)
{
let
requiredGPUNum
=
0
;
if
(
constraint
.
type
==
'
None
'
)
{
if
(
defaultRequiredGPUNum
===
undefined
)
{
requiredGPUNum
=
0
;
}
else
{
requiredGPUNum
=
defaultRequiredGPUNum
;
}
}
else
if
(
constraint
.
type
==
'
GPUNumber
'
)
{
const
gpus
=
constraint
.
gpus
as
Array
<
number
>
;
// TODO: remove the following constraint when supporting distributed trial
if
(
gpus
.
length
!=
1
)
{
throw
new
Error
(
"
Placement constraint of GPUNumber must have exactly one number.
"
);
}
requiredGPUNum
=
gpus
[
0
];
}
assert
(
requiredGPUNum
>=
0
);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const
eligibleEnvironments
:
EnvironmentInformation
[]
=
environments
.
filter
((
environment
:
EnvironmentInformation
)
=>
environment
.
defaultGpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
environment
.
defaultGpuSummary
.
gpuCount
>=
requiredGPUNum
));
if
(
eligibleEnvironments
.
length
===
0
)
{
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
return
({
resultType
:
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
,
gpuIndices
:
undefined
,
environment
:
undefined
,
});
}
// Step 2: Allocate Host/GPU for specified trial job
// Currenty the requireGPUNum parameter for all trial jobs are identical.
if
(
requiredGPUNum
>
0
)
{
// Trial job requires GPU
const
result
:
GpuScheduleResult
|
undefined
=
this
.
scheduleGPUHost
(
environments
,
requiredGPUNum
,
trialDetail
);
if
(
result
!==
undefined
)
{
return
result
;
}
}
else
{
// Trail job does not need GPU
const
allocatedRm
:
EnvironmentInformation
=
this
.
selectMachine
(
environments
,
environments
);
return
this
.
allocateHost
(
requiredGPUNum
,
allocatedRm
,
[],
trialDetail
);
}
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
gpuIndices
:
undefined
,
environment
:
undefined
,
});
}
};
}
else
{
assert
(
constraint
.
type
===
'
Device
'
)
if
(
constraint
.
gpus
.
length
==
0
)
{
throw
new
Error
(
"
Device constraint is used but no device is specified.
"
);
}
const
gpus
=
constraint
.
gpus
as
Array
<
[
string
,
number
]
>
;
const
selectedHost
=
gpus
[
0
][
0
];
// Step 2: Allocate Host/GPU for specified trial job
// Currenty the requireGPUNum parameter for all trial jobs are identical.
if
(
requiredGPUNum
>
0
)
{
// Trial job requires GPU
const
result
:
GpuScheduleResult
|
undefined
=
this
.
scheduleGPUHost
(
environments
,
requiredGPUNum
,
trialDetail
);
if
(
result
!==
undefined
)
{
return
result
;
const
hostsOfConstraint
:
Array
<
[
string
,
number
]
>
=
gpus
.
filter
((
gpuTuple
:
[
string
,
number
])
=>
gpuTuple
[
0
]
===
selectedHost
);
if
(
hostsOfConstraint
.
length
>
1
)
{
//TODO: remove this constraint when supporting multi-host placement
throw
new
Error
(
"
Device constraint does not support using multiple hosts
"
)
}
}
else
{
// Trail job does not need GPU
const
allocatedRm
:
EnvironmentInformation
=
this
.
selectMachine
(
environments
,
environments
);
if
(
environments
.
length
==
0
)
{
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
gpuIndices
:
undefined
,
environment
:
undefined
,
};
}
for
(
const
environment
of
environments
)
{
if
(
!
(
'
rmMachineMeta
'
in
environment
)){
//TODO: remove this constraint when supporting other training services
throw
new
Error
(
`Environment Device placement constraint only supports remote training service for now.`
);
}
}
//TODO:
const
eligibleEnvironments
:
EnvironmentInformation
[]
=
environments
.
filter
(
(
environment
:
EnvironmentInformation
)
=>
(
environment
as
RemoteMachineEnvironmentInformation
).
rmMachineMeta
!=
undefined
&&
(
environment
as
RemoteMachineEnvironmentInformation
).
rmMachineMeta
?.
host
==
selectedHost
);
if
(
eligibleEnvironments
.
length
===
0
)
{
throw
new
Error
(
`The the required host (host:
${
selectedHost
}
) is not found.`
);
}
const
selectedEnvironment
=
eligibleEnvironments
[
0
];
const
availableResources
=
this
.
gpuResourceDetection
([
selectedEnvironment
]);
const
selectedGPUs
:
Array
<
GPUInfo
>
=
[];
return
this
.
allocateHost
(
requiredGPUNum
,
allocatedRm
,
[],
trialDetail
);
}
if
(
selectedEnvironment
.
defaultGpuSummary
===
undefined
)
{
//GPU summary may not be ready, retry until it is ready
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
gpuIndices
:
undefined
,
environment
:
undefined
,
};
}
for
(
const
gpuTuple
of
gpus
)
{
const
gpuIdx
:
number
=
gpuTuple
[
1
];
if
(
gpuIdx
>=
selectedEnvironment
.
defaultGpuSummary
.
gpuCount
)
{
throw
new
Error
(
`The gpuIdx of placement constraint
${
gpuIdx
}
exceeds gpuCount of the host
${
selectedHost
}
`
);
}
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
gpuIndices
:
undefined
,
environment
:
undefined
,
};
if
(
availableResources
.
has
(
selectedEnvironment
))
{
for
(
const
gpuInfo
of
availableResources
.
get
(
selectedEnvironment
)
!
)
{
if
(
gpuInfo
.
index
===
gpuIdx
)
{
selectedGPUs
.
push
(
gpuInfo
);
}
}
}
}
if
(
selectedGPUs
.
length
===
constraint
.
gpus
.
length
)
{
for
(
const
gpuInfo
of
selectedGPUs
)
{
let
num
=
selectedEnvironment
.
defaultGpuSummary
?.
assignedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
num
=
0
;
}
selectedEnvironment
.
defaultGpuSummary
?.
assignedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
+
1
);
}
return
{
resultType
:
ScheduleResultType
.
SUCCEED
,
environment
:
selectedEnvironment
,
gpuIndices
:
selectedGPUs
,
};
}
else
{
return
{
resultType
:
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
,
gpuIndices
:
undefined
,
environment
:
undefined
,
};
}
}
}
/**
...
...
@@ -101,7 +200,7 @@ export class GpuScheduler {
trial
.
environment
.
defaultGpuSummary
!==
undefined
&&
trial
.
assignedGpus
!==
undefined
&&
trial
.
assignedGpus
.
length
>
0
)
{
for
(
const
gpuInfo
of
trial
.
assignedGpus
)
{
const
defaultGpuSummary
=
trial
.
environment
.
defaultGpuSummary
;
const
num
:
number
|
undefined
=
defaultGpuSummary
.
assignedGpuIndexMap
.
get
(
gpuInfo
.
index
);
...
...
@@ -197,7 +296,7 @@ export class GpuScheduler {
throw
new
Error
(
`Unsupported schedule policy:
${
this
.
policyName
}
`
);
}
}
// Select the environment which is idle most recently. If all environments are not idle, use round robin to select an environment.
private
recentlyIdleSelect
(
qualifiedEnvironments
:
EnvironmentInformation
[],
allEnvironments
:
EnvironmentInformation
[]):
EnvironmentInformation
{
const
now
=
Date
.
now
();
...
...
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
3bce6926
...
...
@@ -535,8 +535,8 @@ class TrialDispatcher implements TrainingService {
if
(
undefined
===
trial
)
{
throw
new
Error
(
`TrialDispatcher: waiting trial shouldn't be undefined!`
);
}
const
g
puNum
=
this
.
config
.
trialGpuNumber
;
const
result
=
this
.
gpuScheduler
.
scheduleMachine
(
reusableEnvironments
,
g
puNum
,
trial
);
const
defaultG
puNum
=
this
.
config
.
trialGpuNumber
;
const
result
=
this
.
gpuScheduler
.
scheduleMachine
(
reusableEnvironments
,
trial
.
form
.
placementConstraint
!
,
defaultG
puNum
,
trial
);
switch
(
result
.
resultType
)
{
case
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
:
{
...
...
@@ -546,7 +546,7 @@ class TrialDispatcher implements TrainingService {
waitingTrials
=
[];
this
.
isLoggedNoGpuAvailable
=
false
;
}
else
if
(
reusableEnvironments
.
length
>
0
)
{
const
errorMessage
:
string
=
`TrialDispatcher: REQUIRE_EXCEED_TOTAL Required GPU number
${
g
puNum
}
is too large, no machine can meet`
;
const
errorMessage
:
string
=
`TrialDispatcher: REQUIRE_EXCEED_TOTAL Required GPU number
${
defaultG
puNum
}
is too large, no machine can meet`
;
this
.
log
.
error
(
errorMessage
);
throw
new
NNIError
(
NNIErrorNames
.
RESOURCE_NOT_AVAILABLE
,
errorMessage
);
}
else
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment