Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
74983b36
Unverified
Commit
74983b36
authored
Mar 06, 2025
by
Nicolas Castet
Committed by
GitHub
Mar 06, 2025
Browse files
Fix UB with MPI init (#1538)
Signed-off-by: Nicolas Castet <ncastet@nvidia.com>
parent
bd278fff
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing 1 changed file with 4 additions and 2 deletions
+4
-2
transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
...common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+4
-2
No files found.
transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
View file @
74983b36
...
...
@@ -280,7 +280,7 @@ int create_communicator_grouped2(communicator **comm, int myrank, int numranks,
int
fd
;
volatile
uint32_t
abortFlag
=
0
;
IpcSocketHandle
ipcSock
=
{
0
};
uint64_t
opId
=
0xdeadcafe
b
000
+
(
*
comm
)
->
my_node
+
(
*
comm
)
->
ar2_firstgpu
;
uint64_t
opId
=
0xdeadcafe
0
000
+
(
*
comm
)
->
my_node
;
ipcSocketResult_t
ret
=
ipcSocketSuccess
;
IPCCHECK
(
ipcSocketInit
(
&
ipcSock
,
(
*
comm
)
->
ar2_nvrank
,
(
uint64_t
)
opId
,
&
abortFlag
));
(
*
comm
)
->
_barrier
((
*
comm
)
->
comm_world
);
...
...
@@ -416,6 +416,8 @@ int create_communicator_grouped2_mpi(communicator **comm, int pipegpus, int pipe
// find internode numbers and make internode communicator
NVTE_CHECK_CUDA
(
cudaFree
(
0
));
int
mynode
,
numnodes
;
mynode
=
myrank
/
numlocal
;
numnodes
=
numranks
/
numlocal
;
// finally call the abstracted constructor with MPI info
return
create_communicator_grouped2
(
comm
,
myrank
,
numranks
,
mylocal
,
numlocal
,
mynode
,
numnodes
,
...
...
@@ -549,7 +551,7 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
volatile
uint32_t
abortFlag
=
0
;
IpcSocketHandle
ipcSock
=
{
0
};
uint64_t
opId
=
0xdeadcafe
beef
+
comm
->
my_node
;
uint64_t
opId
=
0xdeadcafe
0000
+
comm
->
my_node
;
ipcSocketResult_t
ret
=
ipcSocketSuccess
;
// All-gather POSIX file descriptors across local ranks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment