Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
70fcdda6
Unverified
Commit
70fcdda6
authored
Oct 25, 2021
by
Zhenhua Han
Committed by
GitHub
Oct 25, 2021
Browse files
CGO execution engine handles missing GPU indices in RemoteMachineConfig (#4270)
parent
e428db54
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
5 additions
and
4 deletions
+5
-4
docs/en_US/NAS/ExecutionEngines.rst
docs/en_US/NAS/ExecutionEngines.rst
+1
-0
nni/retiarii/experiment/pytorch.py
nni/retiarii/experiment/pytorch.py
+4
-4
No files found.
docs/en_US/NAS/ExecutionEngines.rst
View file @
70fcdda6
...
@@ -81,6 +81,7 @@ To enable CGO execution engine, you need to follow these steps:
...
@@ -81,6 +81,7 @@ To enable CGO execution engine, you need to follow these steps:
# ...
# ...
# server configuration in rm_conf
# server configuration in rm_conf
rm_conf.gpu_indices = [0, 1, 2, 3] # gpu_indices must be set in RemoteMachineConfig for CGO execution engine
config.training_service.machine_list = [rm_conf]
config.training_service.machine_list = [rm_conf]
exp.run(config, 8099)
exp.run(config, 8099)
...
...
nni/retiarii/experiment/pytorch.py
View file @
70fcdda6
...
@@ -219,7 +219,8 @@ class RetiariiExperiment(Experiment):
...
@@ -219,7 +219,8 @@ class RetiariiExperiment(Experiment):
elif
self
.
config
.
execution_engine
==
'cgo'
:
elif
self
.
config
.
execution_engine
==
'cgo'
:
from
..execution.cgo_engine
import
CGOExecutionEngine
from
..execution.cgo_engine
import
CGOExecutionEngine
# assert self.config.trial_gpu_number==1, "trial_gpu_number must be 1 to use CGOExecutionEngine"
assert
self
.
config
.
training_service
.
platform
==
'remote'
,
\
"CGO execution engine currently only supports remote training service"
assert
self
.
config
.
batch_waiting_time
is
not
None
assert
self
.
config
.
batch_waiting_time
is
not
None
devices
=
self
.
_construct_devices
()
devices
=
self
.
_construct_devices
()
engine
=
CGOExecutionEngine
(
devices
,
engine
=
CGOExecutionEngine
(
devices
,
...
@@ -273,11 +274,10 @@ class RetiariiExperiment(Experiment):
...
@@ -273,11 +274,10 @@ class RetiariiExperiment(Experiment):
devices
=
[]
devices
=
[]
if
hasattr
(
self
.
config
.
training_service
,
'machine_list'
):
if
hasattr
(
self
.
config
.
training_service
,
'machine_list'
):
for
machine
in
self
.
config
.
training_service
.
machine_list
:
for
machine
in
self
.
config
.
training_service
.
machine_list
:
assert
machine
.
gpu_indices
is
not
None
,
\
'gpu_indices must be set in RemoteMachineConfig for CGO execution engine'
for
gpu_idx
in
machine
.
gpu_indices
:
for
gpu_idx
in
machine
.
gpu_indices
:
devices
.
append
(
GPUDevice
(
machine
.
host
,
gpu_idx
))
devices
.
append
(
GPUDevice
(
machine
.
host
,
gpu_idx
))
else
:
for
gpu_idx
in
self
.
config
.
training_service
.
gpu_indices
:
devices
.
append
(
GPUDevice
(
'local'
,
gpu_idx
))
return
devices
return
devices
def
_create_dispatcher
(
self
):
def
_create_dispatcher
(
self
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment