OpenDAS / nni · Commits

Commit 1458312e (unverified), authored Oct 11, 2021 by Zhenhua Han, committed by GitHub on Oct 11, 2021

[Retiarii] Bugfix: wrong device placement and invalid CUDA ordinal when using CGO engine (#4086)
parent 6a6bdeed

Showing 8 changed files with 202 additions and 60 deletions (+202 -60)
nni/common/device.py                                          +50  -7
nni/retiarii/codegen/pytorch.py                               +34  -3
nni/retiarii/execution/cgo_engine.py                          +29  -26
nni/retiarii/execution/logical_optimizer/logical_plan.py      +52  -17
nni/retiarii/execution/logical_optimizer/opt_dedup_input.py   +7   -0
nni/retiarii/integration.py                                   +6   -2
nni/retiarii/operation_def/torch_op_def.py                    +22  -3
ts/nni_manager/training_service/reusable/gpuScheduler.ts      +2   -2
nni/common/device.py

@@ -2,6 +2,8 @@
 # Licensed under the MIT license.

 from dataclasses import dataclass
+from abc import ABC, abstractmethod
+
 try:
     from typing import Literal
 except ImportError:
...
@@ -9,23 +11,54 @@ except ImportError:
 @dataclass
-class GPUDevice:
+class Device(ABC):
     node_id: str
-    gpu_id: int
     status: Literal['idle', 'busy', 'unknown'] = 'idle'

+    def __eq__(self, o) -> bool:
+        if isinstance(self, type(o)):
+            return self.node_id == o.node_id
+        else:
+            return False
+
+    def __lt__(self, o) -> bool:
+        return self.node_id < o.node_id
+
+    def set_status(self, status):
+        self.status = status
+
+    def __repr__(self) -> str:
+        return "{Abstract Device %s, Status %s}" % (self.node_id, self.status)
+
+    @abstractmethod
+    def device_repr(self) -> str:
+        pass
+
+
+@dataclass
+class GPUDevice(Device):
+    gpu_id: str = -1
+
+    def __init__(self, node_id, gpu_id, status='idle'):
+        self.node_id = node_id
+        self.gpu_id = gpu_id
+        self.status = status
+
+    def __eq__(self, o: Device) -> bool:
+        if isinstance(o, GPUDevice):
+            return self.node_id == o.node_id and self.gpu_id == o.gpu_id
+        return False

-    def __lt__(self, o) -> bool:
+    def __lt__(self, o: Device) -> bool:
+        if self.node_id < o.node_id:
+            return True
+        elif self.node_id > o.node_id:
+            return False
+        else:
+            if isinstance(o, GPUDevice):
+                return self.gpu_id < o.gpu_id
+            else:
+                return True

     def __repr__(self) -> str:
         return "{Environment %s, GPU %d, Status %s}" % (self.node_id, self.gpu_id, self.status)
...
@@ -33,8 +66,18 @@ class GPUDevice:
     def __hash__(self) -> int:
         return hash(self.node_id + '_' + str(self.gpu_id))

     def set_status(self, status):
         self.status = status

+    def device_repr(self):
+        return f"cuda:{self.gpu_id}"
+
+
+@dataclass
+class CPUDevice(Device):
+    def __init__(self, node_id):
+        self.node_id = node_id
+        self.device = 'cpu'
+
+    def __repr__(self) -> str:
+        return "{CPU Device, NodeID %s, Status %s}" % (self.node_id, self.status)
+
+    def device_repr(self):
+        return "cpu"
nni/retiarii/codegen/pytorch.py

@@ -2,7 +2,10 @@
 # Licensed under the MIT license.

 import logging
-from typing import List, Tuple, Any
+from typing import Dict, List, Tuple, Any
+
+from nni.retiarii.operation_def.torch_op_def import ToDevice
+from nni.common.device import Device, GPUDevice

 from ..graph import IllegalGraphError, Edge, Graph, Node, Model
...
@@ -98,6 +101,24 @@ def _remove_prefix(names, graph_name):
     return names[len(graph_name):] if names.startswith(graph_name) else names


+def generate_cuda_mapping(placement: Dict[Node, Device]) -> Dict[Device, int]:
+    '''
+    Since CUDA_VISIBLE_DEVICES will be set to the list of real GPU ID,
+    we need to remap the GPU ID when generating code to match them correctly.
+    For example, when CUDA_VISIBLE_DEVICES="0,3", we need to use "cuda:0", "cuda:1" in the generated code.
+    '''
+    unique_devices = sorted(list(set([e for e in placement.values() if isinstance(e, GPUDevice)])))
+    node_gpu_cnt = {}
+    cuda_remapped_id = {}
+    for d in unique_devices:
+        if d.node_id not in node_gpu_cnt:
+            node_gpu_cnt[d.node_id] = 0
+        node_gpu_cnt[d.node_id] += 1
+        cuda_remapped_id[d] = node_gpu_cnt[d.node_id] - 1
+
+    return cuda_remapped_id
+
+
 def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str:
     nodes = graph.topo_sort()
...
@@ -105,8 +126,14 @@ def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str:
     # only need to generate code for module here
     import_pkgs = set()
     node_codes = []
+    cuda_remapped_id = None
+    if placement:
+        cuda_remapped_id = generate_cuda_mapping(placement)
     for node in nodes:
         if node.operation:
+            if placement and isinstance(node.operation, ToDevice):
+                node.operation.override_device_repr("cuda:%d" % cuda_remapped_id[node.operation.device])
             if node.operation.type == 'shared':
                 continue
             pkg_name = node.operation.get_import_pkg()
...
@@ -115,7 +142,11 @@ def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str:
             node_code = node.operation.to_init_code(_remove_prefix(node.name, graph_name))
             if node_code is not None:
                 if placement and node in placement and len(node_code) > 0:
-                    node_codes.append(f"{node_code}.to('{placement[node].device_repr()}')")
+                    if isinstance(placement[node], GPUDevice):
+                        device_repr = "cuda:%d" % cuda_remapped_id[placement[node]]
+                    else:
+                        device_repr = placement[node].device_repr()
+                    node_codes.append(f"{node_code}.to('{device_repr}')")
                 else:
                     node_codes.append(node_code)
...
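The renumbering performed by generate_cuda_mapping can be illustrated with a self-contained sketch (tuples stand in for NNI's GPUDevice objects; not the library code itself): GPUs assigned to a trial are sorted per worker and renumbered from zero, which is what CUDA expects once CUDA_VISIBLE_DEVICES hides all other devices.

# Stand-alone sketch: map physical GPU ids to the CUDA ordinals visible inside the trial.
def cuda_mapping_sketch(devices):
    devices = sorted(set(devices))        # deterministic order per worker
    seen_per_node = {}
    mapping = {}
    for node_id, gpu_id in devices:
        seen_per_node.setdefault(node_id, 0)
        mapping[(node_id, gpu_id)] = seen_per_node[node_id]
        seen_per_node[node_id] += 1
    return mapping

# A trial that was granted physical GPUs 0 and 3 on one worker:
print(cuda_mapping_sketch([("single_server", 0), ("single_server", 3)]))
# {('single_server', 0): 0, ('single_server', 3): 1}
# -> generated code uses "cuda:0" and "cuda:1", matching CUDA_VISIBLE_DEVICES="0,3"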
nni/retiarii/execution/cgo_engine.py

@@ -9,7 +9,7 @@ import time
 import threading
 from typing import Iterable, List, Dict, Tuple

-from nni.common.device import GPUDevice
+from nni.common.device import GPUDevice, Device

 from .interface import AbstractExecutionEngine, AbstractGraphListener, WorkerInfo
 from .. import codegen, utils
 from ..graph import Model, ModelStatus, MetricData, Node
...
@@ -33,9 +33,8 @@ class CGOExecutionEngine(AbstractExecutionEngine):
     Parameters
     ----------
-    devices : List[str] or List[GPUDevice]
+    devices : List[Device]
         Available devices for execution.
-        If a list of str is provided, it will build a list of GPUDevice in a server named ``single_server``
     max_concurrency : int
         The maximum number of trials to run concurrently.
     batch_waiting_time: int
...
@@ -43,14 +42,14 @@ class CGOExecutionEngine(AbstractExecutionEngine):
        The trials within one batch could apply cross-graph optimization.
     """

-    def __init__(self, devices: List[GPUDevice] = None,
+    def __init__(self, devices: List[Device] = None,
                  max_concurrency: int = None,
                  batch_waiting_time: int = 60,
                  ) -> None:
         self._listeners: List[AbstractGraphListener] = []
         self._running_models: Dict[int, Model] = dict()
         self.logical_plan_counter = 0
-        self.available_devices: List[GPUDevice] = []
+        self.available_devices: List[Device] = []
         self.max_concurrency: int = max_concurrency

         for device in devices:
             self.available_devices.append(device)
...
@@ -61,7 +60,7 @@ class CGOExecutionEngine(AbstractExecutionEngine):
         self._original_models = {}
         self._original_model_to_multi_model = {}
         self._trial_to_original_models = {}
-        self._trial_used_devices: Dict[int, List[GPUDevice]] = {}
+        self._trial_used_devices: Dict[int, List[Device]] = {}

         self._history: List[Model] = []
...
@@ -110,6 +109,15 @@ class CGOExecutionEngine(AbstractExecutionEngine):
             self._queue_lock.release()
             time.sleep(1)

+    def _extract_placement_constaint(self, placement_mapping: Dict[Node, Device]):
+        unique_gpus = sorted(list(set([e for e in placement_mapping.values() if isinstance(e, GPUDevice)])))
+        placement_constraint = None
+        if len(unique_gpus) > 0:
+            placement_constraint = {}
+            placement_constraint['type'] = 'Device'
+            placement_constraint['gpus'] = [(e.node_id, e.gpu_id) for e in unique_gpus]
+        return placement_constraint
+
     def _submit_models_in_batch(self, *models: List[Model]) -> None:
         _logger.info('%d models are submitted in batch', len(models))
         logical = self._build_logical(models)
...
@@ -120,9 +128,10 @@ class CGOExecutionEngine(AbstractExecutionEngine):
         phy_models_and_placements = self._assemble(logical)
         for model, placement, grouped_models in phy_models_and_placements:
             data = BaseGraphData(codegen.model_to_pytorch_script(model, placement=placement), model.evaluator)
-            trial_id = send_trial(data.dump())
+            placement_constraint = self._extract_placement_constaint(placement)
+            trial_id = send_trial(data.dump(), placement_constraint=placement_constraint)
             # unique non-cpu devices used by the trial
-            self._trial_used_devices[trial_id] = list([_ for _ in set(placement.values()) if isinstance(_, GPUDevice)])
+            self._trial_used_devices[trial_id] = list(set([_ for _ in placement.values() if isinstance(_, GPUDevice)]))

             # currently, it is impossible for search strategy to submit models more than the number of available devices
             for used_device in self._trial_used_devices[trial_id]:
...
@@ -139,14 +148,18 @@ class CGOExecutionEngine(AbstractExecutionEngine):
     def list_models(self) -> Iterable[Model]:
         return self._history

-    def _assemble(self, logical_plan: LogicalPlan) -> List[Tuple[Model, Dict[Node, GPUDevice], List[Model]]]:
+    def _assemble(self, logical_plan: LogicalPlan) -> List[Tuple[Model, Dict[Node, Device], List[Model]]]:
         """
         Return the assembled models as a list of tuple.
         Each tuple contains the assembled model, the device placement of graph nodes, and the original models.
         """
         # try to use the available_devices first so that it can be launched as early as possible
         # if free devices are not enough to assemble all models in one trial, try all devices
         if len(self.available_devices) > 0:
-            grouped_models: List[Dict[Model, GPUDevice]] = AssemblePolicy().group(logical_plan, self.available_devices)
+            grouped_models: List[Dict[Model, Device]] = AssemblePolicy().group(logical_plan, self.available_devices)

         if len(self.available_devices) == 0 or len(grouped_models) > 1:
-            grouped_models: List[Dict[Model, GPUDevice]] = AssemblePolicy().group(logical_plan, self.all_devices)
+            grouped_models: List[Dict[Model, Device]] = AssemblePolicy().group(logical_plan, self.all_devices)

         phy_models_and_placements = []
         for multi_model in grouped_models:
...
@@ -256,17 +269,7 @@ class CGOExecutionEngine(AbstractExecutionEngine):
             os.remove(file_name)


-def _remap_cuda_device(group_model: Dict[Model, GPUDevice]):
-    used_devices = {}
-    for m in group_model:
-        if group_model[m].node_id not in used_devices:
-            used_devices[group_model[m].node_id] = {}
-        if isinstance(group_model[m], GPUDevice):
-            if group_model[m].gpu_id not in used_devices[group_model[m].node_id]:
-                n_used_gpu_in_server = len(used_devices[group_model[m].node_id])
-                used_devices[group_model[m].node_id][group_model[m].gpu_id] = n_used_gpu_in_server
-            group_model[m].gpu_id = used_devices[group_model[m].node_id][group_model[m].gpu_id]
-    return group_model
-
-
 class AssemblePolicy:
...
@@ -282,7 +285,7 @@ class AssemblePolicy:
     @staticmethod
     def _check_graph_connectivity(model: Model,
-                                  group_model: Dict[Model, GPUDevice],
+                                  group_model: Dict[Model, Device],
                                   logical_plan: LogicalPlan) -> bool:
         for edge in logical_plan.logical_graph.edges:
             if AssemblePolicy._is_related_node(model, edge.head) or \
...
@@ -294,7 +297,7 @@ class AssemblePolicy:
         return False

     @staticmethod
-    def _check_evaluator(new_model: Model, group_model: Dict[Model, GPUDevice]) -> bool:
+    def _check_evaluator(new_model: Model, group_model: Dict[Model, Device]) -> bool:
         if not (isinstance(new_model.evaluator, Lightning)
                 and isinstance(new_model.evaluator.module, MultiModelSupervisedLearningModule)):
             return False
...
@@ -318,11 +321,11 @@ class AssemblePolicy:
             if len(group_model) > 0 and \
                 (AssemblePolicy._check_graph_connectivity(m, group_model, logical_plan) == False or
                     AssemblePolicy._check_evaluator(m, group_model) == False):
-                all_grouped_models.append(_remap_cuda_device(group_model))
+                all_grouped_models.append(group_model)
                 group_model = {}
             group_model[m] = available_devices[idx % len(available_devices)]
             if len(group_model) == len(available_devices) or \
                     idx == len(logical_plan.models) - 1:
-                all_grouped_models.append(_remap_cuda_device(group_model))
+                all_grouped_models.append(group_model)
                 group_model = {}
         return all_grouped_models
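To make the new _extract_placement_constaint step concrete, here is a hedged, self-contained sketch of the constraint it derives from a node placement (plain tuples stand in for NNI's GPUDevice/CPUDevice objects; the dict shape follows the diff above).

# Sketch: derive a trial's placement constraint from the devices used in its placement.
# Devices are modeled as ("gpu", node_id, gpu_id) or ("cpu", node_id) tuples here.
def extract_placement_constraint_sketch(placement_values):
    gpus = sorted({d for d in placement_values if d[0] == "gpu"})
    if not gpus:
        return None
    return {"type": "Device", "gpus": [(node_id, gpu_id) for _, node_id, gpu_id in gpus]}

placement = [("gpu", "worker-0", 1), ("gpu", "worker-0", 2), ("cpu", "worker-0")]
print(extract_placement_constraint_sketch(placement))
# {'type': 'Device', 'gpus': [('worker-0', 1), ('worker-0', 2)]}
# The nni manager can then reserve exactly these GPUs for the trial.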
nni/retiarii/execution/logical_optimizer/logical_plan.py

@@ -2,30 +2,39 @@
 # Licensed under the MIT license.

 import copy
-from typing import Dict, Tuple, Any, Union
+from typing import Dict, Tuple, Any

 from nni.retiarii.utils import uid
-from nni.common.device import GPUDevice
+from nni.common.device import Device, CPUDevice

 from ...graph import Cell, Edge, Graph, Model, Node
 from ...operation import Operation, _IOPseudoOperation


-class CPUDevice:
-    def __init__(self, node_id):
-        self.node_id = node_id
-        self.device = 'cpu'
-
-    def device_repr(self):
-        return "cpu"
-
-
 class AbstractLogicalNode(Node):
     def __init__(self, graph, node_id, name, operation, _internal=False):
         super().__init__(graph, node_id, name, operation, _internal=_internal)
         self.related_models = []

-    def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) -> Tuple[Node, GPUDevice]:
+    def assemble(self, multi_model_placement: Dict[Model, Device]) -> Tuple[Node, Device]:
+        """
+        Given a set of models to be formed in a physical model and their device placement,
+        this function replaces the logical node with an executable physical node for the physical model.
+
+        Parameters
+        ----------
+        multi_model_placement : dict
+            a dict of models and device placement.
+            These models will be assembled into the same physical model to run.
+
+        Returns
+        -------
+        node : Node
+            the physical node to replace the logical node in the physical model
+        placement : Device
+            the device placement of the returned physical node
+        """
         raise NotImplementedError

     def _fork_to(self, graph: Graph):
...
@@ -85,6 +94,11 @@ class LogicalGraph(Graph):
 class OriginNode(AbstractLogicalNode):
+    """
+    This is logical node representing the original node without any modification.
+    In assemble, just return the original node along with the physical placement given by multi_model_placement.
+    """
+
     def __init__(self, logical_graph: LogicalGraph, original_graph: Graph,
                  original_node: Node, name: str, operation, _internal=False):
...
@@ -92,7 +106,7 @@ class OriginNode(AbstractLogicalNode):
         self.original_graph = original_graph
         self.original_node = original_node

-    def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) -> Tuple[Node, GPUDevice]:
+    def assemble(self, multi_model_placement: Dict[Model, Device]) -> Tuple[Node, Device]:
         model_id = self.original_node.graph.model.model_id
         new_node = Node(self.original_node.graph, self.original_node.id,
                         f"M_{model_id}_" +
...
@@ -138,8 +152,27 @@ class LogicalPlan:
             new_tail = id_to_new_node[edge.tail.id]
             Edge((new_head, edge.head_slot), (new_tail, edge.tail_slot), _internal=True)._register()

-    def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) \
-            -> Tuple[Model, Dict[Node, Union[GPUDevice, CPUDevice]]]:
+    def assemble(self, multi_model_placement: Dict[Model, Device]) \
+            -> Tuple[Model, Dict[Node, Device]]:
+        """
+        Given a set of models to be formed in a physical model and their device placement,
+        this function replaces all the logical node in this LogicalPlan with executable physical nodes
+        for the physical model.
+
+        Parameters
+        ----------
+        multi_model_placement : dict
+            a dict of models and device placement.
+            These models will be assembled into the same physical model to run.
+
+        Returns
+        -------
+        phy_model : Model
+            the physical model formed by models in `multi_model_placement`
+            all logical node are replaced by physical nodes
+        node_placements : dict
+            the device placement of the nodes in `phy_model`
+        """
         phy_model = Model(_internal=True)
         phy_graph = self.lp_model.root_graph._fork_to(phy_model)
         phy_graph._rename_graph(phy_graph.name, "_model")
...
@@ -222,9 +255,10 @@ class LogicalPlan:
                 node.remove()

         # If two nodes are placed on different devices, use ToDevice op to copy the node
         # TODO: when copying one node to multiple devices, broadcast is more efficient than P2P communication
         existing_edges = phy_graph.edges.copy()
         # Avoid a node is copied multiple times on the same device
-        copied_op: Dict[Tuple(Node, Union[GPUDevice, CPUDevice]), Node] = {}
+        copied_op: Dict[Tuple(Node, Device), Node] = {}
         for edge in existing_edges:
             head_placement = node_placements[edge.head]
             tail_placement = node_placements[edge.tail]
...
@@ -238,11 +272,12 @@ class LogicalPlan:
                     dst_name = edge.head.name + "_to_" + edge.tail.name
-                    to_operation = Operation.new('ToDevice', {"device": tail_placement.device_repr(), "src": (
+                    to_operation = Operation.new('ToDevice', {"device": tail_placement, "src": (
                         edge.head.name, edge.head_slot), "dst": dst_name})
                     to_node = Node(phy_graph, uid(), dst_name, to_operation)._register()
                     Edge((edge.head, edge.head_slot), (to_node, None), _internal=True)._register()
                     copied_op[(edge.head, tail_placement)] = to_node
+                    node_placements[to_node] = head_placement
                 edge.head = to_node
                 edge.head_slot = None
...
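The cross-device copy logic above can be summarized with a small standalone sketch (hypothetical names; not the NNI implementation): whenever an edge crosses devices, a ToDevice copy node is inserted once per (source node, target device) pair and reused for later edges to the same device.

# Sketch: insert at most one device-copy node per (source node, target device) pair.
def insert_device_copies(edges, placement):
    """edges: list of (head, tail); placement: dict node -> device string."""
    copied = {}            # (head, target_device) -> name of the inserted copy node
    new_edges = []
    for head, tail in edges:
        src_dev, dst_dev = placement[head], placement[tail]
        if src_dev == dst_dev:
            new_edges.append((head, tail))
            continue
        key = (head, dst_dev)
        if key not in copied:
            copied[key] = f"{head}_to_{tail}"      # ToDevice node targeting dst_dev
            new_edges.append((head, copied[key]))
        new_edges.append((copied[key], tail))
    return new_edges

edges = [("conv", "head_a"), ("conv", "head_b")]
placement = {"conv": "cuda:0", "head_a": "cuda:1", "head_b": "cuda:1"}
print(insert_device_copies(edges, placement))
# [('conv', 'conv_to_head_a'), ('conv_to_head_a', 'head_a'), ('conv_to_head_a', 'head_b')]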
nni/retiarii/execution/logical_optimizer/opt_dedup_input.py

@@ -17,6 +17,13 @@ _supported_evaluators = [MultiModelSupervisedLearningModule]


 class DedupInputNode(AbstractLogicalNode):
+    """
+    This is logical node representing the node for deduplication.
+    In assemble, just return one copy of the original node when multiple models are assembled.
+    These models will share the result of once calculation.
+    """
+
     def __init__(self, logical_graph: LogicalGraph, node_id: int,
                  nodes_to_dedup: List[Node], _internal=False):
         super().__init__(logical_graph, node_id,
...
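The idea behind the deduplication described in that docstring can be sketched independently of NNI (hypothetical helper; a hashable description stands in for the real input node): several assembled models reuse one physical node for an identical input step, so the computation runs once and its output is shared.

def dedup_inputs(model_inputs):
    """model_inputs: dict mapping model id -> hashable description of its input node."""
    shared = {}       # input description -> model id owning the single physical node
    assignment = {}   # model id -> model id whose physical input node it reuses
    for model_id, desc in model_inputs.items():
        if desc not in shared:
            shared[desc] = model_id
        assignment[model_id] = shared[desc]
    return assignment

print(dedup_inputs({0: "imagenet-train", 1: "imagenet-train", 2: "cifar-train"}))
# {0: 0, 1: 0, 2: 2} -> models 0 and 1 share one input node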
nni/retiarii/integration.py

@@ -79,8 +79,12 @@ class RetiariiAdvisor(MsgDispatcherBase):
             raise ValueError('placement_constraint.type must be either `None`,. `GPUNumber` or `Device`')
         if placement_constraint['type'] == 'None' and len(placement_constraint['gpus']) > 0:
             raise ValueError('placement_constraint.gpus must be an empty list when type == None')
-        if placement_constraint['type'] == 'Device' and len(placement_constraint['gpus']) != 1:
-            raise ValueError('placement_constraint.gpus must be a list of number (currently only support one host)')
         if placement_constraint['type'] == 'GPUNumber':
             if len(placement_constraint['gpus']) != 1:
                 raise ValueError('placement_constraint.gpus currently only support one host when type == GPUNumber')
+            for e in placement_constraint['gpus']:
+                if not isinstance(e, int):
+                    raise ValueError('placement_constraint.gpus must be a list of number when type == GPUNumber')
+        if placement_constraint['type'] == 'Device':
+            for e in placement_constraint['gpus']:
+                if not isinstance(e, tuple):
...
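Putting the validation rules together, these dict shapes would pass the checks above (the values are illustrative; for 'Device' each entry is a (node_id, gpu_id) tuple as produced by _extract_placement_constaint, and 'GPUNumber' presumably carries the number of GPUs requested on a single host). Note that the old restriction of exactly one entry for 'Device' is removed by this commit, so a trial may now pin several GPUs on the same host; gpuScheduler.ts still rejects entries that span multiple hosts.

# Constraint shapes accepted by RetiariiAdvisor's validation (illustrative values):
no_constraint  = {'type': 'None', 'gpus': []}
gpu_number     = {'type': 'GPUNumber', 'gpus': [2]}
device_pinning = {'type': 'Device', 'gpus': [('worker-0', 1), ('worker-0', 2)]}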
nni/retiarii/operation_def/torch_op_def.py

@@ -494,17 +494,36 @@ class ToDevice(PyTorchOperation):
     def __init__(self, type_name: str, parameters: Dict[str, Any], _internal: bool = False):
         self.type = "ToDevice"
         self.device = parameters['device']
+        self.overridden_device_repr = None
         self.src = parameters['src']
         self.dst = parameters['dst']

+    def override_device_repr(self, device_repr):
+        # CUDA GPUDevice may remap GPU physical ID to CUDA ID. The device repr is different from GPUDevice.device_repr()
+        # override_device_repr will be called in pytorch.graph_to_pytorch_model to replace device_repr with the correct
+        # CUDA ID, e.g., when a job uses Physical GPU-1,2, its CUDA ID should be "cuda:0" and "cuda:1".
+        # self.device.device_repr() would return "cuda:1" and "cuda:2", but override_device_repr should be "cuda:0" and
+        # "cuda:1"
+        self.overridden_device_repr = device_repr
+
     def __repr__(self):
-        return f'to("{self.device}")'
+        if self.overridden_device_repr is None:
+            return f'to("{self.device.device_repr()}")'
+        else:
+            return f'to("{self.overridden_device_repr}")'

     def to_forward_code(self, field: str, output: str, inputs: List[str], inputs_value: List[Any]) -> str:
-        return f'{output} = {inputs[0]}.to("{self.device}")'
+        if self.overridden_device_repr is None:
+            forward_code = f'{output} = {inputs[0]}.to("{self.device.device_repr()}")'
+        else:
+            forward_code = f'{output} = {inputs[0]}.to("{self.overridden_device_repr}")'
+        return forward_code


 class AtenDet(PyTorchOperation):
     # for torch 1.9
     # NOTE: it is not included in the above aten ops, maybe because torch.det is alias for torch.linalg.det
     _ori_type_name = ['aten::linalg_det']

     def to_forward_code(self, field: str, output: str, inputs: List[str], inputs_value: List[Any] = None) -> str:
         return f'{output} = torch.det({inputs[0]})'
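A minimal stand-in (not the ToDevice class itself, only its override behavior) shows why the override matters: when a trial is granted physical GPU 1 and CUDA_VISIBLE_DEVICES hides the rest, emitting the physical ordinal would hit an invalid CUDA ordinal, while the overridden repr emits a valid one.

class ToDeviceSketch:
    """Stand-in for ToDevice, only to show how the override changes emitted code."""
    def __init__(self, device_repr):
        self.device_repr = device_repr            # e.g. "cuda:1" (physical ordinal)
        self.overridden_device_repr = None

    def override_device_repr(self, device_repr):
        self.overridden_device_repr = device_repr  # e.g. "cuda:0" (ordinal under CUDA_VISIBLE_DEVICES)

    def to_forward_code(self, output, inp):
        repr_ = self.overridden_device_repr or self.device_repr
        return f'{output} = {inp}.to("{repr_}")'

op = ToDeviceSketch("cuda:1")
print(op.to_forward_code("out", "x"))   # out = x.to("cuda:1")  -> fails if only GPU 1 is visible
op.override_device_repr("cuda:0")
print(op.to_forward_code("out", "x"))   # out = x.to("cuda:0")  -> valid CUDA ordinal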
ts/nni_manager/training_service/reusable/gpuScheduler.ts

@@ -115,8 +115,8 @@ export class GpuScheduler {
         const gpus = constraint.gpus as Array<[string, number]>;
         const selectedHost = gpus[0][0];
-        const hostsOfConstraint: Array<[string, number]> = gpus.filter((gpuTuple: [string, number]) => gpuTuple[0] === selectedHost);
-        if (hostsOfConstraint.length > 1) {
+        const differentHosts: Array<[string, number]> = gpus.filter((gpuTuple: [string, number]) => gpuTuple[0] !== selectedHost);
+        if (differentHosts.length >= 1) {
             //TODO: remove this constraint when supporting multi-host placement
             throw new Error("Device constraint does not support using multiple hosts")
         }
...