Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
fairscale
Commits
b52041d9
Unverified
Commit
b52041d9
authored
Jan 20, 2021
by
Benjamin Lefaudeux
Committed by
GitHub
Jan 20, 2021
Browse files
[fix] MPI init for unit tests (#316)
* using a global variable to share the init filename across processes
parent
ce2f64f9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
6 additions
and
2 deletions
+6
-2
fairscale/utils/testing.py
fairscale/utils/testing.py
+6
-2
No files found.
fairscale/utils/testing.py
View file @
b52041d9
...
@@ -53,6 +53,8 @@ skip_if_single_gpu = pytest.mark.skipif(
...
@@ -53,6 +53,8 @@ skip_if_single_gpu = pytest.mark.skipif(
not
torch
.
cuda
.
is_available
()
or
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"multiple GPUs required"
not
torch
.
cuda
.
is_available
()
or
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"multiple GPUs required"
)
)
_
,
filename_mpi
=
tempfile
.
mkstemp
()
class
IdentityLayer
(
torch
.
nn
.
Module
):
class
IdentityLayer
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
size
:
int
,
scale
:
float
=
1.0
)
->
None
:
def
__init__
(
self
,
size
:
int
,
scale
:
float
=
1.0
)
->
None
:
...
@@ -241,10 +243,12 @@ def torch_spawn(world_sizes: Optional[List[int]] = None) -> Callable:
...
@@ -241,10 +243,12 @@ def torch_spawn(world_sizes: Optional[List[int]] = None) -> Callable:
error_queue
=
multiprocessing
.
get_context
(
"spawn"
).
SimpleQueue
()
error_queue
=
multiprocessing
.
get_context
(
"spawn"
).
SimpleQueue
()
if
"OMPI_COMM_WORLD_RANK"
in
os
.
environ
:
if
"OMPI_COMM_WORLD_RANK"
in
os
.
environ
:
global
filename_mpi
os
.
environ
[
"RANK"
]
=
os
.
environ
[
"OMPI_COMM_WORLD_RANK"
]
os
.
environ
[
"RANK"
]
=
os
.
environ
[
"OMPI_COMM_WORLD_RANK"
]
os
.
environ
[
"WORLD_SIZE"
]
=
os
.
environ
[
"OMPI_COMM_WORLD_SIZE"
]
os
.
environ
[
"WORLD_SIZE"
]
=
os
.
environ
[
"OMPI_COMM_WORLD_SIZE"
]
_
,
filename
=
tempfile
.
mkstemp
(
)
torch
.
distributed
.
init_process_group
(
"mpi"
,
init_method
=
f
"file://
{
filename_mpi
}
"
)
torch
.
distributed
.
init_process_group
(
"mpi"
,
init_method
=
f
"file://
{
filename
}
"
)
world_size
=
torch
.
distributed
.
get_world_size
()
world_size
=
torch
.
distributed
.
get_world_size
()
destroy_model_parallel
()
destroy_model_parallel
()
initialize_model_parallel
(
1
,
world_size
)
initialize_model_parallel
(
1
,
world_size
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment