Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
a4ac3320
Commit
a4ac3320
authored
Jul 07, 2025
by
lishen
Browse files
通过线程池实现ipcsocket,满足节点内通信 (Implement ipcsocket via a thread pool to support intra-node communication)
parent
d9d23f34
Changes
132
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1070 additions
and
8470 deletions
+1070
-8470
src/hardware/graph/connect.cc
src/hardware/graph/connect.cc
+0
-726
src/hardware/graph/devcomm.h
src/hardware/graph/devcomm.h
+0
-586
src/hardware/graph/graph.h
src/hardware/graph/graph.h
+0
-131
src/hardware/graph/paths.cc
src/hardware/graph/paths.cc
+0
-935
src/hardware/graph/rings.cc
src/hardware/graph/rings.cc
+0
-69
src/hardware/graph/rings.h
src/hardware/graph/rings.h
+0
-16
src/hardware/graph/rome_models.cc
src/hardware/graph/rome_models.cc
+0
-3325
src/hardware/graph/rome_models.h
src/hardware/graph/rome_models.h
+0
-21
src/hardware/graph/sccl_bfloat16.h
src/hardware/graph/sccl_bfloat16.h
+0
-188
src/hardware/graph/search.cc
src/hardware/graph/search.cc
+0
-1470
src/hardware/graph/trees.cc
src/hardware/graph/trees.cc
+0
-116
src/hardware/graph/tuning.cc
src/hardware/graph/tuning.cc
+0
-784
src/hardware/hardware_utils.cpp
src/hardware/hardware_utils.cpp
+10
-0
src/hardware/hardware_utils.h
src/hardware/hardware_utils.h
+7
-1
src/hardware/net/device/net_ib.h
src/hardware/net/device/net_ib.h
+0
-26
src/hardware/net/host/net_socket.h
src/hardware/net/host/net_socket.h
+0
-26
src/hardware/net/ipc_socket/ipc_socket.cpp
src/hardware/net/ipc_socket/ipc_socket.cpp
+781
-0
src/hardware/net/ipc_socket/ipc_socket.h
src/hardware/net/ipc_socket/ipc_socket.h
+107
-0
src/hardware/net/net.cpp
src/hardware/net/net.cpp
+147
-0
src/hardware/net/net.h
src/hardware/net/net.h
+18
-50
No files found.
src/hardware/graph/connect.cc
deleted
100644 → 0
View file @
d9d23f34
#include "comm.h"
#include "graph.h"
#include "trees.h"
#include "rings.h"
#include "topo.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
// Initialize per-channel ring/tree/collnet/nvls connectivity for this rank
// from the per-algorithm intra-node rank orderings in `graphs`, and record
// the per-channel boundary ranks into `topoRanks` for later inter-node
// stitching. Finally mirrors the first nChannels channels into the second
// half of the channel array when it fits.
scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks)
{
    int rank = comm->rank;
    int localRanks = comm->topo->nodes[GPU].count;
    int nChannels = comm->nChannels;

    for (int c = 0; c < nChannels; c++) {
        struct scclChannel* channel = comm->channels + c;

        // Reset all connectivity to "unset" (-1 / 0) before filling it in.
        channel->ring.prev = channel->ring.next = -1;
        channel->tree.up = -1;
        channel->collnetChain.up = -1;
        for (int i = 0; i < SCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
        for (int i = 0; i < SCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
        channel->collnetDirect.out = -1;
        channel->collnetDirect.headRank = -1;
        channel->collnetDirect.nHeads = 0;
        channel->collnetDirect.shift = 0;
        for (int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
        for (int i = 0; i < SCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;

        // Per-channel intra-node rank orderings for each algorithm.
        int* ringIntra = graphs[SCCL_ALGO_RING]->intra + c * localRanks;
        int* treeIntra = graphs[SCCL_ALGO_TREE]->intra + c * localRanks;
        int* collNetIntra = graphs[SCCL_ALGO_COLLNET_CHAIN]->intra + c * localRanks;
        int* nvlsIntra = graphs[SCCL_ALGO_NVLS]->intra + c * localRanks;

        for (int i = 0; i < localRanks; i++) {
            if (ringIntra[i] == rank) {
                // First/last ranks of the local ring segment are the
                // node-level receive/send endpoints for channel c.
                topoRanks->ringRecv[c] = ringIntra[0];
                topoRanks->ringSend[c] = ringIntra[localRanks - 1];
                channel->ring.prev = (i == 0) ? -1 : ringIntra[i - 1];
                channel->ring.next = (i == localRanks - 1) ? -1 : ringIntra[i + 1];
            }
            if (treeIntra[i] == rank) {
                int parentIndex = 0;
                // Which local slot talks to child trees depends on the tree pattern.
                int child0Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_TREE ? 0 : 1;
                int child1Index = graphs[SCCL_ALGO_TREE]->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
                topoRanks->treeToParent[c] = treeIntra[parentIndex];
                topoRanks->treeToChild0[c] = treeIntra[child0Index];
                topoRanks->treeToChild1[c] = treeIntra[child1Index];
                channel->tree.up = i == 0 ? -1 : treeIntra[i - 1];
                channel->tree.down[0] = i == localRanks - 1 ? -1 : treeIntra[i + 1];
            }
            if (collNetIntra[i] == rank) {
                // The local chain head points "up" at the virtual collnet
                // root (id == nRanks).
                channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i - 1];
                channel->collnetChain.down[0] = i == localRanks - 1 ? -1 : collNetIntra[i + 1];
            }
        }
        topoRanks->ringPrev[c] = channel->ring.prev;
        topoRanks->ringNext[c] = channel->ring.next;
        topoRanks->nvlsHeads[c] = nvlsIntra[0];
    }

    // Duplicate channels rings/trees into the second half, when it fits.
    struct scclChannel* channel0 = comm->channels;
    struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
    if (channel1) memcpy(channel1, channel0, nChannels * sizeof(struct scclChannel));
    return scclSuccess;
}
/**
 * @brief Return true when the decimal number `rank` appears as a standalone
 *        token inside s[start, end).
 *
 * The scanned text is a parenthesized tree description (e.g. "(0(1)(2))");
 * '(' and ')' act as separators and reset the number accumulator, so "11"
 * never matches rank 1.
 *
 * Fix: the original peeked at s[start + 1] without checking that the index
 * was still inside [start, end), reading one byte past the caller-supplied
 * range. The lookahead is now bounds-checked, which also guarantees the
 * final number in the range is compared against `rank`. (isdigit is also
 * fed an unsigned char to avoid UB on negative char values.)
 *
 * @param s     text buffer (not required to be NUL-terminated within range)
 * @param start first index to scan (inclusive)
 * @param end   one past the last index to scan (exclusive)
 * @param rank  decimal value to look for
 */
bool isRankHere(const char* s, int start, int end, int rank)
{
    if (end <= start || start < 0 || end < 0) return false;
    int num = 0;
    while (start < end) {
        char currChar = s[start];
        if (isdigit((unsigned char)currChar)) {
            num = num * 10 + (currChar - '0');
            // Keep accumulating while the next char (still in range) is a digit.
            if (start + 1 < end && isdigit((unsigned char)s[start + 1])) {
                start++;
                continue;
            }
        } else if (currChar == '(' || currChar == ')') {
            start++;
            num = 0; // separators reset the accumulator
            continue;
        }
        if (num == rank) return true;
        start++;
    }
    return false;
}
/**
 * @brief Rebuild per-channel tree up/down links from textual "tree base"
 *        descriptions stored in treeGraph->treeBase.
 *
 * Each treeBase[i] is a NUL-terminated parenthesized string such as
 * "(0(1)(2))" where numbers are ranks and nesting encodes parent/child
 * relations. Channel c uses description c % x, where x is the count of
 * non-empty descriptions. The parser walks down the string toward the
 * subtree containing comm->rank (using isRankHere to pick the branch),
 * then fills channel->tree.up and channel->tree.down[].
 *
 * Fix: removed the unused local `y` from the original.
 *
 * @return scclSuccess always (early-out when no tree base is present).
 */
scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph)
{
    // x = number of non-empty treeBase descriptions.
    int x = 0;
    for (int i = 0; treeGraph->treeBase[i][0] != 0; i++) {
        x = i + 1;
    }
    // Nothing to do if no tree description was provided (also protects the
    // `c % x` below from dividing by zero).
    if (treeGraph->treeBase[0][0] == 0) return scclSuccess;
    int nChannels = comm->nChannels;
    int localRanks = comm->topo->nodes[GPU].count;
    // new tree
    for (int c = 0; c < nChannels; c++) {
        int buff = c % x;
        // Local NUL-terminated copy of the description for this channel.
        char tempString[SCCL_TOPO_MAX_NODES * 4];
        int ko = 0;
        while (treeGraph->treeBase[buff][ko] != 0) {
            tempString[ko] = treeGraph->treeBase[buff][ko];
            ko++;
        }
        tempString[ko] = 0;
        int start = 0;
        int curRank = comm->rank;
        struct scclChannel* channel = comm->channels + c;
        int end = 0;
        while (tempString[end] != 0) end++;
        int parent = -1;
        // Walk down the string, constructing a number from continuous digits.
        while (start < end) {
            int num = 0, num_found = 0;
            start++;
            while (start < end && tempString[start] != '(' && tempString[start] != ')') {
                int num_here = (int)(tempString[start] - '0');
                num = num * 10 + num_here;
                start = start + 1;
                if (tempString[start] == '(' || tempString[start] == ')' || start == end) num_found = 1;
            }
            if (num_found != 0 && num == curRank) {
                // Found this rank: its parent is the last number seen above it.
                channel->tree.up = parent;
                int depth = 0;
                for (int childId = 0; childId < SCCL_MAX_TREE_ARITY; childId++) {
                    int or_start = start;
                    int child = -1;
                    channel->tree.down[childId] = -1;
                    if (or_start >= end - 1) continue;
                    // Read the child's rank: first number of the next subtree.
                    num = 0;
                    or_start++;
                    while (tempString[or_start] != 0 && tempString[or_start] != '(' && tempString[or_start] != ')') {
                        int num_here = (int)(tempString[or_start] - '0');
                        num = num * 10 + num_here;
                        or_start++;
                    }
                    child = num;
                    // Skip over the whole balanced subtree to find the next child start.
                    while (start < end) {
                        if (tempString[start] == '(')
                            depth++;
                        else if (tempString[start] == ')')
                            depth--;
                        if (depth == 0) break;
                        start++;
                    }
                    start++;
                    channel->tree.down[childId] = child;
                }
                break;
            } else {
                // Not this rank: remember it as a candidate parent and
                // descend into whichever child subtree contains curRank.
                parent = num;
                int start_c = start;
                int end_c = start_c;
                while (end_c < end) {
                    // Scan one balanced (...) subtree: [start_c, end_c].
                    int depth = 0;
                    while (end_c < end) {
                        if (tempString[end_c] == '(')
                            depth++;
                        else if (tempString[end_c] == ')')
                            depth--;
                        if (depth == 0) break;
                        end_c++;
                    }
                    if (isRankHere(tempString, start_c, end_c, curRank)) {
                        // Narrow the search window to this subtree.
                        start = start_c;
                        end = end_c;
                        break;
                    } else {
                        end_c++;
                        start_c = end_c;
                    }
                }
            }
        }
    }
    return scclSuccess;
}
// Stitch the per-node ring segments into global rings: for every channel,
// connect each node's receive endpoint to the previous node's send endpoint
// (and vice versa), filling the global prev/next tables and this rank's own
// channel ring links. The duplicated channel (if present) gets the same links.
static scclResult_t connectRings(struct scclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext)
{
    int nChannels = comm->nChannels;
    int nNodes = comm->nNodes;
    for (int c = 0; c < nChannels; c++) {
        // Per-channel slices of the gathered tables.
        int* recv = ringRecv + c * comm->nNodes;
        int* send = ringSend + c * comm->nNodes;
        int* prev = ringPrev + c * comm->nRanks;
        int* next = ringNext + c * comm->nRanks;
        struct scclChannel* channel0 = comm->channels + c;
        struct scclChannel* channel1 = (nChannels > MAXCHANNELS / 2) ? 0 : channel0 + nChannels;
        for (int n = 0; n < nNodes; n++) {
            // Node n receives from node n-1's send endpoint (wrap-around).
            int recvRank = recv[n];
            int prevSendRank = send[(n - 1 + nNodes) % nNodes];
            prev[recvRank] = prevSendRank;
            if (comm->rank == recvRank) {
                channel0->ring.prev = prevSendRank;
                if (channel1) channel1->ring.prev = prevSendRank;
            }
            // Node n sends to node n+1's receive endpoint (wrap-around).
            int sendRank = send[n];
            int nextRecvRank = recv[(n + 1) % nNodes];
            next[sendRank] = nextRecvRank;
            if (comm->rank == sendRank) {
                channel0->ring.next = nextRecvRank;
                if (channel1) channel1->ring.next = nextRecvRank;
            }
        }
    }
    return scclSuccess;
}
// Copy the first nNodes rank ids from `ranks` into `indexes`.
static scclResult_t getIndexes(int* ranks, int* indexes, int nNodes)
{
    for (int idx = 0; idx < nNodes; ++idx) {
        indexes[idx] = ranks[idx];
    }
    return scclSuccess;
}
// Set the tree's parent link from the indexes table; u == -1 means
// "no parent at this level" and leaves tree->up untouched.
static scclResult_t setTreeUp(struct scclTree* tree, int* indexes, int u)
{
    if (u != -1) {
        tree->up = indexes[u];
    }
    return scclSuccess;
}
// Append a child link into the first free down[] slot (entries are -1 when
// unused). d == -1 means "no child to add". Fails with scclInternalError if
// the tree already holds SCCL_MAX_TREE_ARITY children.
static scclResult_t setTreeDown(struct scclTree* tree, int* indexes, int d)
{
    if (d == -1) return scclSuccess;
    int slot = 0;
    while (slot < SCCL_MAX_TREE_ARITY && tree->down[slot] >= 0) slot++;
    if (slot == SCCL_MAX_TREE_ARITY) {
        WARN("Internal error : tree already has %d children (%d %d %d)", slot, tree->down[0], tree->down[1], tree->down[2]);
        return scclInternalError;
    }
    tree->down[slot] = indexes[d];
    return scclSuccess;
}
// Connect the double binary tree across nodes. When comm->nChannels fits in
// half of MAXCHANNELS, tree 0 lives on channel c and tree 1 on its twin
// c + nChannels; otherwise the two trees are laid out over two consecutive
// channel ranges. `treePatterns` is part of the call interface but unused here.
static scclResult_t connectTrees(struct scclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns)
{
    const int nChannels = (comm->nChannels > MAXCHANNELS / 2) ? comm->nChannels / 2 : comm->nChannels,
              nNodes = comm->nNodes, node = comm->node;

    // Compute tree depth. Not an exact value but a good approximation in most
    // cases
    int depth = comm->nRanks / nNodes - 1 + log2i(nNodes);

    // Parent/children of this node in both trees of the double binary tree.
    int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
    int *ttp, *ttc0, *ttc1;
    SCCLCHECK(scclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));

    if (comm->nChannels <= MAXCHANNELS / 2) {
        // Twin layout: channel c carries tree 0, channel c + nChannels tree 1.
        for (int c = 0; c < nChannels; c++) {
            struct scclChannel* channel0 = comm->channels + c;
            struct scclChannel* channel1 = channel0 + nChannels;
            ttp = treeToParent + c * comm->nNodes;
            ttc0 = treeToChild0 + c * comm->nNodes;
            ttc1 = treeToChild1 + c * comm->nNodes;
            if (comm->rank == ttp[node]) {
                // This rank talks to the parent node through whichever child
                // slot scclGetDtree assigned us.
                SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
                SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
            }
            if (comm->rank == ttc0[node]) {
                SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
                SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
            }
            if (comm->rank == ttc1[node]) {
                SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
                SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
            }
            if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
                INFO(SCCL_LOG_TOPO, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank,
                     channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
                INFO(SCCL_LOG_TOPO, "Tree %d : %d -> %d -> %d/%d/%d", c + nChannels, channel1->tree.up, comm->rank,
                     channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
            }
            channel0->tree.depth = channel1->tree.depth = depth;
        }
    } else {
        // Split layout: first range carries tree 0 only.
        for (int c = 0; c < nChannels; c++) {
            struct scclChannel* channel0 = comm->channels + c;
            ttp = treeToParent + c * comm->nNodes;
            ttc0 = treeToChild0 + c * comm->nNodes;
            ttc1 = treeToChild1 + c * comm->nNodes;
            if (comm->rank == ttp[node]) {
                SCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
            }
            if (comm->rank == ttc0[node]) {
                SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
            }
            if (comm->rank == ttc1[node]) {
                SCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
            }
            if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
                INFO(SCCL_LOG_TOPO, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank,
                     channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
            }
            channel0->tree.depth = depth;
        }
        // Second range carries tree 1 only.
        for (int c = nChannels; c < nChannels * 2; c++) {
            struct scclChannel* channel1 = comm->channels + c;
            ttp = treeToParent + c * comm->nNodes;
            ttc0 = treeToChild0 + c * comm->nNodes;
            ttc1 = treeToChild1 + c * comm->nNodes;
            if (comm->rank == ttp[node]) {
                SCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
            }
            if (comm->rank == ttc0[node]) {
                SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
            }
            if (comm->rank == ttc1[node]) {
                SCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
            }
            if (comm->rank == ttp[node] || comm->rank == ttc0[node] || comm->rank == ttc1[node]) {
                INFO(SCCL_LOG_TOPO, "Tree %d : %d -> %d -> %d/%d/%d", c + nChannels, channel1->tree.up, comm->rank,
                     channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
            }
            channel1->tree.depth = depth;
        }
    }
    return scclSuccess;
}
// Wire up the CollNet "direct" topology: collect the distinct per-channel
// head ranks, then for every channel connect this rank down to its local
// peers (if it is a head) and up to all heads otherwise, logging the result.
static scclResult_t connectCollNet(struct scclComm* comm, struct scclTopoGraph* collNetGraph)
{
    int rank = comm->rank;
    int localRanks = comm->localRanks;
    int nHeads = 0;
    int* heads;
    SCCLCHECK(scclCalloc(&heads, localRanks));
    // Find all head ranks
    // Head index is always 0
    for (int c = 0; c < collNetGraph->nChannels; c++) {
        int* collNetIntra = collNetGraph->intra + c * localRanks;
        int head = collNetIntra[0];
        // De-duplicate: forget this head if it was already recorded.
        for (int h = 0; h < nHeads; h++)
            if (heads[h] == head) head = -1;
        if (head != -1) heads[nHeads++] = collNetIntra[0];
    }
    // For all channels
    for (int c = 0; c < comm->nChannels; c++) {
        struct scclChannel* channel = comm->channels + c;
        char line[1024];
        sprintf(line, "CollNet channel %d rank %d ", c, rank);
        int nDown = 0;
        for (int i = 0; i < nHeads; i++) {
            if (rank == heads[i]) {
                // is head
                channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
                channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
                int* collNetIntra = collNetGraph->intra + i * localRanks;
                sprintf(line + strlen(line), "down ");
                for (int r = 0; r < localRanks; r++) {
                    if (collNetIntra[r] == rank) continue;
                    channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
                    sprintf(line + strlen(line), " %d ", collNetIntra[r]);
                }
                sprintf(line + strlen(line), "nDown %d ", nDown);
                break;
            }
        }
        // Connect to all heads
        int nUp = 0;
        sprintf(line + strlen(line), "up ");
        for (int h = 0; h < nHeads; h++) {
            if (rank == heads[h]) continue;
            channel->collnetDirect.up[nUp++] = heads[h];
            sprintf(line + strlen(line), " %d ", heads[h]);
        }
        channel->collnetDirect.nHeads = nHeads;
        // Shift by intraRank so that leaves don't send to same head simultaneously
        channel->collnetDirect.shift = (rank % localRanks) % nHeads;
        channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
        sprintf(line + strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
        sprintf(line + strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank,
                channel->collnetDirect.out, channel->collnetDirect.shift);
        INFO(SCCL_LOG_TOPO, "%s", line);
        channel->collnetChain.depth = comm->nRanks / comm->nNodes;
    }
    // Point NVLS channels that have a head at the virtual out peer (nRanks).
    for (int c = 0; c < comm->nvlsChannels; c++) {
        struct scclChannel* channel = comm->channels + c;
        if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
    }
    free(heads);
    return scclSuccess;
}
// Configure NVLS channels: assign virtual up/down peers (ids above nRanks),
// then, for multi-node jobs, build the inter-node NVLS tree links from the
// gathered per-head rank tables using the double-binary-tree layout.
static scclResult_t connectNvls(struct scclComm* comm, int* nvlsHeads, struct scclTopoGraph* nvlsGraph)
{
    int nHeads = nvlsGraph->nChannels;
    // Which head (if any) this rank is: first rank of a head's intra list.
    int headRank = -1;
    for (int h = 0; h < nHeads; h++) {
        if (nvlsGraph->intra[h * comm->localRanks] == comm->rank) headRank = h;
    }
    if (nHeads == 0) {
        comm->nvlsChannels = 0;
        return scclSuccess;
    }
    for (int c = 0; c < comm->nvlsChannels; c++) {
        struct scclChannel* channel = comm->channels + c;
        channel->nvls.nHeads = nHeads;
        // Virtual peer ids: nRanks + 1 + h for head h.
        for (int h = 0; h < nHeads; h++) channel->nvls.up[h] = comm->nRanks + 1 + h;
        for (int h = nHeads; h < SCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1;
        channel->nvls.down = comm->nRanks + 1 + headRank;
        channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
        channel->nvls.headRank = headRank;
        channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
        channel->nvls.node = comm->node;
        channel->nvls.nNodes = comm->nNodes;
    }
    if (comm->nNodes == 1) return scclSuccess;

    // Connect Trees
    int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
    int pc0, pc1; // ignored
    SCCLCHECK(scclGetDtree(comm->nNodes, comm->node, &tree0Parent, &tree0Child0, &tree0Child1, &pc0,
                           &tree1Parent, &tree1Child0, &tree1Child1, &pc1));
    int* heads = NULL;
    int treeUp[2] = {-1, -1};
    int treeDown0[2] = {-1, -1};
    int treeDown1[2] = {-1, -1};

    // Node 0 logs the gathered head tables (truncated to 20 nodes per line).
    if (comm->node == 0) {
        for (int h = 0; h < nHeads; h++) {
            char line[1024];
            sprintf(line, "NVLS Head %2d:", h);
            heads = nvlsHeads + h * comm->nNodes;
            for (int n = 0; n < comm->nNodes && n < 20; n++) {
                sprintf(line + strlen(line), " %2d", heads[n]);
            }
            INFO(SCCL_INIT, "%s", line);
        }
    }

    // Find the heads where I'm the head rank and retain tree up/down
    for (int h = 0; h < nHeads; h++) {
        heads = nvlsHeads + h * comm->nNodes;
        if (heads[comm->node] == comm->rank) {
            treeUp[0] = tree0Parent == -1 ? -1 : heads[tree0Parent];
            treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
            treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
            treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
            treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
            treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
            break;
        }
    }

    // Set prev/next in all channels (NVLS compute channels work
    // orthogonally to NVLS search channels). Even channels use tree 0,
    // odd channels tree 1.
    for (int c = 0; c < comm->nvlsChannels; c++) {
        struct scclChannel* channel = comm->channels + c;
        channel->nvls.treeUp = treeUp[c % 2];
        channel->nvls.treeDown[0] = channel->nvls.down;
        int ix = 1;
        if (treeDown0[c % 2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c % 2];
        if (treeDown1[c % 2] != -1) channel->nvls.treeDown[ix] = treeDown1[c % 2];
    }

    struct scclNvls* nvls0 = &comm->channels[0].nvls;
    struct scclNvls* nvls1 = &comm->channels[1].nvls;
    INFO(SCCL_LOG_TOPO, "NVLS Trees : %d/%d->%d->%d %d/%d->%d->%d",
         nvls0->treeDown[0], nvls0->treeDown[1], comm->rank, nvls0->treeUp,
         nvls1->treeDown[0], nvls1->treeDown[1], comm->rank, nvls1->treeUp);
    return scclSuccess;
}
// Legacy naming
SCCL_PARAM(MinNrings, "MIN_NRINGS", -2);        // -2 = "unset" sentinel
SCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);        // -2 = "unset" sentinel
// New naming
SCCL_PARAM(MinNchannels, "MIN_NCHANNELS", 4);
SCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);  // -2 = "unset" sentinel
// Resolve the user-requested minimum channel count. The legacy MIN_NRINGS
// knob is honored first, then the newer MIN_NCHANNELS overrides it
// (-2 is the "unset" sentinel). The result is clamped to [0, MAXCHANNELS].
int scclMinNchannels()
{
    int lo = 2;
    if (scclParamMinNrings() != -2) lo = scclParamMinNrings();
    if (scclParamMinNchannels() != -2) lo = scclParamMinNchannels();
    if (lo > MAXCHANNELS) {
        WARN("User asked for a minimum of %d channels, limiting to %d", lo, MAXCHANNELS);
        lo = MAXCHANNELS;
    }
    if (lo < 0) lo = 0;
    return lo;
}
// Resolve the user-requested maximum channel count. Legacy MAX_NRINGS first,
// then MAX_NCHANNELS overrides (-2 = "unset"). Clamped to [1, MAXCHANNELS].
int scclMaxNchannels()
{
    int hi = MAXCHANNELS;
    if (scclParamMaxNrings() != -2) hi = scclParamMaxNrings();
    if (scclParamMaxNchannels() != -2) hi = scclParamMaxNchannels();
    if (hi > MAXCHANNELS) hi = MAXCHANNELS;
    if (hi < 1) {
        WARN("User asked for a maximum of %d channels, setting it to 1", hi);
        hi = 1;
    }
    return hi;
}
// Duplicate channels into [start, end): each destination channel c becomes a
// byte copy of channel (c - start), including its per-rank ring prev/next
// tables. Returns the new channel count (end, or start when end <= start).
static int copyChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext)
{
    int nranks = comm->nRanks;
    int c;
    for (c = start; c < end; c++) {
        int src = c - start;
        memcpy(ringPrev + c * nranks, ringPrev + src * nranks, nranks * sizeof(int));
        memcpy(ringNext + c * nranks, ringNext + src * nranks, nranks * sizeof(int));
        memcpy(comm->channels + c, comm->channels + src, sizeof(struct scclChannel));
    }
    return c;
}
// Same duplication as copyChannels, but every newly created channel is also
// switched to the communicator's mixed transport type (Hylink + PCIe path).
// Returns the new channel count.
static int copyMixedChannels(struct scclComm* comm, int start, int end, int* ringPrev, int* ringNext)
{
    int nranks = comm->nRanks;
    int c;
    for (c = start; c < end; c++) {
        int src = c - start;
        memcpy(ringPrev + c * nranks, ringPrev + src * nranks, nranks * sizeof(int));
        memcpy(ringNext + c * nranks, ringNext + src * nranks, nranks * sizeof(int));
        memcpy(comm->channels + c, comm->channels + src, sizeof(struct scclChannel));
        comm->channels[c].transportType = comm->mixedTransportType;
    }
    return c;
}
// Tunables for the mixed Hylink + PCIe double-channel path.
RCCL_PARAM(MaxMixedHylinkNChannels, "MAX_MIXED_HYLINK_NCHANNELS", 0);   // 0 disables mixed channels
RCCL_PARAM(MixedTransportType, "MIXED_TRANSPORT_TYPE", TRANSPORT_SHM);
// Post-search topology setup: gather every rank's per-channel ring/tree/nvls
// endpoints, connect rings, trees, NVLS and (optionally) CollNet across
// nodes, then duplicate channels until the channel count honors the
// min/max channel knobs and CTA config. Finally builds the rings array.
scclResult_t scclTopoPostset(struct scclComm* comm, int* firstRanks, int* treePatterns,
                             struct scclTopoRanks** allTopoRanks, int* rings,
                             struct scclTopoGraph** graphs, int nc)
{
    // Gather data from all ranks
    int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
    int nranks = comm->nRanks;
    int nNodes = comm->nNodes;
    int nChannels = comm->nChannels;
    int MinNChannels = scclMinNchannels();
    int MaxNChannels = scclMaxNchannels();
    SCCLCHECK(scclCalloc(&ringRecv, nNodes * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&ringSend, nNodes * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&ringPrev, nranks * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&ringNext, nranks * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&treeToParent, nNodes * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&treeToChild0, nNodes * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&treeToChild1, nNodes * MAXCHANNELS));
    SCCLCHECK(scclCalloc(&nvlsHeads, nNodes * MAXCHANNELS));

    // Flatten the per-rank topoRanks into channel-major tables; node-level
    // tables are indexed through each node's first rank.
    for (int c = 0; c < nChannels; c++) {
        for (int n = 0; n < nNodes; n++) {
            int r = firstRanks[n];
            ringRecv[c * nNodes + n] = allTopoRanks[r]->ringRecv[c];
            ringSend[c * nNodes + n] = allTopoRanks[r]->ringSend[c];
            treeToParent[c * nNodes + n] = allTopoRanks[r]->treeToParent[c];
            treeToChild0[c * nNodes + n] = allTopoRanks[r]->treeToChild0[c];
            treeToChild1[c * nNodes + n] = allTopoRanks[r]->treeToChild1[c];
            nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
        }
        for (int r = 0; r < nranks; r++) {
            ringPrev[c * nranks + r] = allTopoRanks[r]->ringPrev[c];
            ringNext[c * nranks + r] = allTopoRanks[r]->ringNext[c];
        }
    }

    // Connect rings and trees. This should also duplicate the channels.
    SCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
    SCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
    SCCLCHECK(connectNvls(comm, nvlsHeads, graphs[SCCL_ALGO_NVLS]));

    // Duplicate ringPrev/ringNext for scclBuildRing
    if (nChannels <= MAXCHANNELS / 2) memcpy(ringPrev + nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
    if (nChannels <= MAXCHANNELS / 2) memcpy(ringNext + nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));

    // All-NVLink systems default to 32 channels unless the user set a limit.
    if (scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MIN_NCHANNELS") == NULL) MinNChannels = 32;
    if (scclTopoPathAllNVLink(comm->topo) == 1 && getenv("SCCL_MAX_NCHANNELS") == NULL) MaxNChannels = 32;

#ifdef HCU_SDMA_FEATURE
    // NOTE(review): ncSdma is only declared under HCU_SDMA_FEATURE but is
    // referenced in the checkSdmaCopyEnable() branch below — confirm that
    // branch is only compiled with the feature enabled.
    int ncSdma = nc;
    ncSdma = std::min((int)scclMaxNchannels() / comm->nChannels, nc);
    ncSdma *= comm->nChannels;
#endif

    // Get number of channels after duplication
    nc = std::min((int)MaxNChannels / comm->nChannels, nc);
    nc *= comm->nChannels;

    // Duplication should be complete now
    nChannels = comm->nChannels = std::min(MAXCHANNELS, (nChannels <= MAXCHANNELS / 2) ? nChannels * 2 : nChannels);

    // Setup CollNet
    if (comm->collNetSupport == 1) {
        struct scclTopoGraph* collNetGraph = graphs[SCCL_ALGO_COLLNET_DIRECT];
        // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
        if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
            int collNetNchannels = std::min(MAXCHANNELS, nChannels + nChannels / 2);
            nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
        }
        SCCLCHECK(connectCollNet(comm, collNetGraph));
    }

    // Use 4 compute channels per search channel to reach peak BW on <8 PPN
    if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[SCCL_ALGO_RING]->bwIntra > 45.0 && 2 * nChannels <= MAXCHANNELS) {
        nChannels = comm->nChannels = copyChannels(comm, nChannels, 2 * nChannels, ringPrev, ringNext);
    }

    // Add Hylink + PCIE double channel path
    if (graphs[SCCL_ALGO_RING]->typeIntra == PATH_NVL) {
        comm->nMixedHylinkChannels = std::min(MAXCHANNELS - comm->nChannels, (int)rcclParamMaxMixedHylinkNChannels());
        if (comm->nMixedHylinkChannels > 0) {
            INFO(SCCL_LOG_TOPO, "<%s:%d> -----> comm->nMixedHylinkShmChannels: %d, comm->nChannels: %d\n",
                 __func__, __LINE__, comm->nMixedHylinkChannels, comm->nChannels);
            comm->mixedTransportType = std::max((int)rcclParamMixedTransportType(), TRANSPORT_SHM);
            nChannels = comm->nChannels =
                copyMixedChannels(comm, nChannels, nChannels + comm->nMixedHylinkChannels, ringPrev, ringNext);
        }
    }

    // Honor SCCL_MIN_NRINGS/SCCL_MAX_NRINGS.
    // We permit combining max, then min, to only use the first channels, then duplicate them.
    if (checkSdmaCopyEnable(comm)) {
        uint32_t sdmaChannelNum;
        uint32_t maxChannels;
        sdmaChannelNum = getSdmaChannelNum(comm);
        if (comm->sharedRes->owner != comm) {
            /* child comm #channels cannot exceed top parent #channels. */
            nChannels = comm->nChannels =
                std::min(std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs),
                         comm->sharedRes->tpNChannels);
            maxChannels = sdmaChannelNum
                              ? sdmaChannelNum
                              : std::min(std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs)),
                                         comm->sharedRes->tpNChannels);
            nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
        } else {
            nChannels = comm->nChannels = std::min(std::min(scclMaxNchannels(), nChannels), comm->config.maxCTAs);
            maxChannels = sdmaChannelNum
                              ? sdmaChannelNum
                              : std::max(scclMinNchannels(), std::max(ncSdma, comm->config.minCTAs));
            nChannels = comm->nChannels = copyChannels(comm, nChannels, maxChannels, ringPrev, ringNext);
        }
        INFO(SCCL_INIT, "-hcugon- scclTopoPostset rank %d sdmaChannelNum %d nChannels %d",
             comm->rank, sdmaChannelNum, comm->nChannels);
    } else {
        if (comm->sharedRes->owner != comm) {
            /* child comm #channels cannot exceed top parent #channels. */
            nChannels = comm->nChannels =
                std::min(std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs),
                         comm->sharedRes->tpNChannels);
            nChannels = comm->nChannels =
                copyChannels(comm, nChannels,
                             std::min(std::max(MinNChannels, std::max(nc, comm->config.minCTAs)),
                                      comm->sharedRes->tpNChannels),
                             ringPrev, ringNext);
        } else {
            nChannels = comm->nChannels = std::min(std::min(MaxNChannels, nChannels), comm->config.maxCTAs);
            nChannels = comm->nChannels =
                copyChannels(comm, nChannels,
                             std::max(MinNChannels, std::max(nc, comm->config.minCTAs)),
                             ringPrev, ringNext);
        }
    }

    // Create rings array and check all is fine
    SCCLCHECK(scclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));

    free(ringRecv);
    free(ringSend);
    free(ringPrev);
    free(ringNext);
    free(treeToParent);
    free(treeToChild0);
    free(treeToChild1);
    free(nvlsHeads);
    return scclSuccess;
}
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/devcomm.h
deleted
100644 → 0
View file @
d9d23f34
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef SCCL_DEVICE_H_
#define SCCL_DEVICE_H_
#include "check.h"
#include "sccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined(ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HCU_SDMA_FEATURE
#include "hsa/hsa_ext_amd.h"
#include "hsa_extra.h"
// #define HCU_PRINT_DEBUG
#endif
namespace
sccl
{
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#if defined(ENABLE_NPKIT) && defined(HCU_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) NpKit::CollectGpuEvent(event, size, cost, tm, scclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HCU_SDMA_FEATURE
#define INIT_PRIMS_SDMA(prims, args) \
{ \
prims.rank = scclShmem.comm.rank; \
prims.useSdmaConfig = args->useSdma; \
prims.useSdmaCopy = args->useSdma && prims.sdmaQueueCtx; \
prims.preFnOps = args->preFnOps; \
prims.sdmaMinCopySize = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->minCopySize : 0; \
prims.sdmaCountEnable = args->useSdma && prims.sdmaQueueCtx ? prims.sdmaQueueCtx->copyCountEnable : 0; \
prims.sdmaCopyCount = 0; \
prims.allCopyCount = 0; \
}
#endif
#define SCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
// Collective-function identifiers. The first SCCL_NUM_FUNCTIONS entries are
// the "core" collectives; SendRecv and AllToAllPivot follow them.
typedef enum
{
    scclFuncBroadcast,
    scclFuncReduce,
    scclFuncAllGather,
    scclFuncReduceScatter,
    scclFuncAllReduce,
    scclFuncSendRecv,
    scclFuncSend,
    scclFuncRecv,
    scclFuncAllToAllPivot,
    scclNumFuncs
} scclFunc_t;
extern
const
char
*
scclFuncStr
[
SCCL_NUM_FUNCTIONS
+
2
];
#define SCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
// FIX: the former object-like macros `#define SCCL_ALGO_TREE 0` ... shared
// their names with the enumerators below; the preprocessor therefore expanded
// the enumerator list into `0 = 0, 1 = 1, ...`, which does not compile. The
// macros are removed; the enum provides the exact same names and values, so
// every caller (including #define-free value uses) is unaffected.
enum scclAlgo
{
    SCCL_ALGO_TREE = 0,           // tree algorithm
    SCCL_ALGO_RING = 1,           // ring algorithm
    SCCL_ALGO_COLLNET_DIRECT = 2, // direct collnet algorithm
    SCCL_ALGO_COLLNET_CHAIN = 3,  // chained collnet algorithm
    SCCL_ALGO_NVLS = 4,           // NVLink SHARP algorithm
    SCCL_ALGO_NVLS_TREE = 5,      // NVLink SHARP tree algorithm
};
// Human-readable algorithm names, indexed by scclAlgo.
extern const char* scclAlgoStr[SCCL_NUM_ALGORITHMS];
#define SCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define SCCL_PROTO_LL 0
#define SCCL_PROTO_LL128 1
#define SCCL_PROTO_SIMPLE 2
// Human-readable protocol names, indexed by the SCCL_PROTO_* constants.
extern const char* scclProtoStr[SCCL_NUM_PROTOCOLS];

#define SCCL_MAX_OPS 2048
#define SCCL_STEPS 8

// One 16-byte line of the LL (low-latency) protocol FIFO: two 4-byte data
// words, each followed by its validity flag.
union scclLLFifoLine
{
    /* Flags have to be *after* data, because otherwise, an incomplete receive
       from the network may receive the flag but not the data.
       Note this is assuming that either we receive contiguous chunks of data
       (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
    struct
    {
        uint32_t data1;
        uint32_t flag1;
        uint32_t data2;
        uint32_t flag2;
    };
    uint64_t v[2]; // 8-byte view (atomic write granularity)
    int4 i4;       // 16-byte vector view (GPU toolchain vector type)
};

#define WARP_SIZE warpSize // device built-in warp size
#define MAXCHANNELS 32
#define SCCL_MAX_NTHREADS 256
#define SCCL_SIMPLE_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_MAX_NTHREADS SCCL_MAX_NTHREADS
#define SCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define SCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define SCCL_LL_FLAG_MAX 0x100
#define SCCL_LL_FLAG(a) ((uint32_t)((a) % SCCL_LL_FLAG_MAX))
#else
#define SCCL_LL_CLEAN_MASK 0x7ffffff8
#define SCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least SCCL_NSTEPS
static_assert(SCCL_LL_CLEAN_MASK % SCCL_STEPS == 0, "Invalid SCCL_LL_CLEAN_MASK value");
// Per-peer, per-direction connection state shared between the host proxy and
// the device kernels.
struct scclConnInfo
{
    // Regular comm mechanism
    char* buffs[SCCL_NUM_PROTOCOLS]; // one staging buffer per protocol
    uint64_t* tail;                  // Local for recv, remote for send
    uint64_t* head;                  // Local for send, remote for recv

    int flags;                   // Direct communication / other flags
    int shared;                  // Buffers are shared
    void** ptrExchange;          // Pointer exchange for direct communication
    uint64_t* redOpArgExchange;  // PreOp scaler exchange for direct pull case

    int* sizesFifo; // Sizes fifo from GPU to proxy
    int* offsFifo;  // Buffer fifo from proxy to GPU

    uint64_t step;           // Keep where we are
    uint64_t llLastCleaning; // step of the last LL flag cleanup pass

    // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
    // allows software to explicitly initiate a flush read to HDP memory. See more
    // descriptions in primitives.h.
    uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
    uint32_t* curr_hdp_reg; // Current GPU's HDP register
#ifdef HCU_SDMA_FEATURE
    struct sdmaQueueContext* sdmaQueueCtx; // per-connection SDMA copy queue
    uint32_t sdmaCtxValidMagic;            // SDMA_CTX_VALID_MAGIC when ctx is usable
#endif
};

// Identifies the proxy endpoint servicing one connection.
struct scclProxyConnector
{
    int tpRank;
    int tpLocalRank;
    int sameProcess; // nonzero when proxy runs in the same process
    struct scclProxyConnection* connection;
};

// Host-side view of one connection: transport plumbing plus the device-visible
// scclConnInfo.
struct scclConnector
{
    int connected;
    struct scclProxyConnector proxyConn;
    struct scclTransportComm* transportComm; // transport vtable
    void* transportResources;                // transport-private state
    struct scclConnInfo conn;                // shared with the device
};
// Ring topology state for one channel.
struct scclRing
{
    // Shortcuts for userRanks[1] and userRanks[n-1]
    int prev;
    int next;
    // Maps an internal sccl index to user-specified rank order. This is necessary
    // since we need to know how the user expects data to be ordered across
    // devices. Ordered from current device.
    int* userRanks;
    int index; // This rank's index in the ring
};

// The root of each tree only has one node down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define SCCL_MAX_TREE_ARITY 3
// Tree topology state: one parent, up to SCCL_MAX_TREE_ARITY children.
struct scclTree
{
    int depth;
    int up;                        // parent rank
    int down[SCCL_MAX_TREE_ARITY]; // child ranks
};

#define SCCL_MAX_DIRECT_ARITY 7
// CollNet "direct" topology state.
struct scclDirect
{
    int depth;
    int out;
    int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
    int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
    int shift;    // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
    int up[SCCL_MAX_DIRECT_ARITY];
    int down[SCCL_MAX_DIRECT_ARITY];
};

#define SCCL_CONN_IDX_P2P_NET 2
#define SCCL_MAX_NVLS_ARITY 8
#define SCCL_MAX_NVLS_TREE_ARITY 3
// NVLS topology state (flat heads plus an inter-node tree overlay).
struct scclNvls
{
    int out;
    int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
    int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
    int up[SCCL_MAX_NVLS_ARITY];
    int down;
    int treeUp;
    int treeDown[SCCL_MAX_NVLS_TREE_ARITY];
    int node;
    int nNodes;
};

#define SCCL_MAX_CONNS 3
// Host-side send/recv connectors toward one peer on one channel.
struct scclChannelPeer
{
    struct scclConnector send[SCCL_MAX_CONNS];
    struct scclConnector recv[SCCL_MAX_CONNS];
    int refCount;
};

struct scclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8)    /* set alignment to 8 bytes boundary */

/* scclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of scclWorkElem. */
#define SCCL_WORK_SIZE 256

// Discriminates the payload stored in a scclWork slot.
enum scclWorkType : uint8_t
{
    scclWorkTypeUnused = 0,
    scclWorkTypeColl = 1,
    scclWorkTypeP2p = 2,
    scclWorkTypeRegColl = 3
};
// Direction of a P2P work element.
enum scclWorkP2PType : uint8_t
{
    scclWorkP2pTypeUnused = 0,
    scclWorkP2pTypeSend,
    scclWorkP2pTypeRecv
};

// Common header at the start of every scclWork slot.
struct scclWorkHeader
{
    union
    {
        int32_t workNext;  // when isLast=0: Offset from kernel argument workHead
        uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
    };
    uint16_t funcIndex;
    uint8_t isLast : 1; // last work for this kernel
    uint8_t inFifo : 1; // is this work in the fifo
    enum scclWorkType type;
};

// One collective work element.
struct scclWorkElem
{
    union
    {
        uint8_t flagBits;
        struct
        {
            uint8_t isUsed : 1, redOpArgIsPtr : 1, regUsed : 1, nWarps : 5;
        };
    };
    uint8_t direct;
    uint8_t bid;
    uint8_t nChannels;
    struct
    {
        uint32_t root : 28;
        uint32_t preFnOps : 1;
        uint32_t useSdma : 1;
        uint32_t connIndex : 2;
    };
    const void* sendbuff;
    void* recvbuff;
    size_t count;
    union
    {
        size_t lastChunkSize;
        // Pivot A2A kernel computes chunk size itself.
        // Instead, it needs the number of bidirectional rings.
        size_t pivotA2ANumBiRings;
    };
    uint64_t redOpArg;
    uint64_t opCount;
};
static_assert((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElem))) / sizeof(scclWorkElem) == 4,
              "Sanity check: SCCL_MAX_WORK_ELEMENTS == 4");
// NOTE(review): the assert above proves 4 elements fit in a slot, yet the cap
// is set to 1 here — presumably deliberate batching limit; confirm before
// relying on multi-element packing.
#define SCCL_MAX_WORK_ELEMENTS 1
// One point-to-point (send or recv) work element.
struct scclWorkElemP2p
{
    struct
    {
        int32_t peer : 26; // peer rank (signed to allow -1)
        uint32_t preFnOps : 1;
        uint32_t useSdma : 1;
        uint32_t connIndex : 2;
        int32_t proto : 2;
    };
    union
    {
        uint16_t flagBits;
        struct
        {
            enum scclWorkP2PType p2pType : 4;
            uint16_t nWarps : 4;
            uint16_t warpStart : 4;
            uint16_t ngroups : 4;
        };
    };
    uint16_t opCount;
    // Important not to use any fields with greater than 4-byte alignment since
    // we need sizeof(scclWorkElemP2p)==28, but that would be padded up to 32 if
    // there were 8-byte fields.
    // void* buff;
    uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
    // size_t count;
    uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
    int chunkSize;
};
static_assert(((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemP2p))) / sizeof(scclWorkElemP2p)) == 8,
              "Sanity check: SCCL_MAX_WORK_ELEMENTS_P2P == 8");
// NOTE(review): the assert shows 8 elements fit, but the cap is set to 2 here
// — presumably deliberate; confirm before relying on wider packing.
#define SCCL_MAX_WORK_ELEMENTS_P2P 2

// Collective work element carrying user-registered buffer pointers for the
// direct (registered) path.
struct scclWorkElemReg
{
    struct scclWorkElem elem;
    void* dnInputs[SCCL_MAX_DIRECT_ARITY + 1];
    void* dnOutputs[SCCL_MAX_DIRECT_ARITY + 1];
    void* upOutputs[SCCL_MAX_DIRECT_ARITY + 1];
};
#define SCCL_MAX_WORK_ELEMENTS_REG ((SCCL_WORK_SIZE - alignUp(sizeof(scclWorkHeader), alignof(scclWorkElemReg))) / sizeof(scclWorkElemReg))
static_assert(SCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: SCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define SCCL_MAX_GROUPS (SCCL_MAX_NTHREADS / WARP_SIZE)

// One fixed-size (SCCL_WORK_SIZE) work slot: header plus a payload that is
// either collective, p2p, or registered-collective elements.
struct scclWork
{
    struct scclWorkHeader header;
    union
    {
        char pad[SCCL_WORK_SIZE - sizeof(struct scclWorkHeader)]; // forces slot size
        struct scclWorkElem elems[SCCL_MAX_WORK_ELEMENTS];
        struct scclWorkElemP2p p2pElems[SCCL_MAX_WORK_ELEMENTS_P2P];
        struct scclWorkElemReg regElems[SCCL_MAX_WORK_ELEMENTS_REG];
    };
};
static_assert(sizeof(struct scclWork) == SCCL_WORK_SIZE, "Sanity check: sizeof(struct scclWork) == SCCL_WORK_SIZE");
static_assert(sizeof(struct scclWork) % 16 == 0, "Sanity check: sizeof(struct scclWork)%16 == 0");

struct scclDevChannelPeer
{
    // Stripped version of scclChannelPeer where we only keep the scclConnInfo
    // instead of the full scclConnector.
    struct scclConnInfo send[SCCL_MAX_CONNS];
    struct scclConnInfo recv[SCCL_MAX_CONNS];
};

#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct
scclProf
{
uint32_t
count
;
uint32_t
seq
;
// only entry from first launch is used
struct
{
uint64_t
line
:
16
;
uint64_t
timeStamp
:
48
;
}
elem
[
PROFILE_NUM_ITEMS
];
};
static_assert
(
sizeof
(
struct
scclProf
)
==
256
,
"scclProf must have size of 256"
);
#endif
#ifdef ENABLE_COLLTRACE
typedef
enum
{
scclCollTraceNotReady
=
0
,
scclCollTraceKernelLaunchType
=
1
,
scclCollTraceKernelEndType
=
2
,
scclCollTraceCollLaunchType
=
3
,
scclCollTraceAbortType
=
4
,
scclCollTraceDataType
=
5
,
scclCollTraceCollElemType
=
(
1
<<
4
),
scclCollTraceP2pElemType
=
(
1
<<
5
),
}
scclCollTraceDataType_t
;
struct
scclCollTrace
{
uint8_t
type
;
uint8_t
bid
;
int16_t
funcIndex
;
uint32_t
data_0
;
uint64_t
timeStamp
;
union
{
uint64_t
opCount
;
uint32_t
p2pOpCount
[
2
];
};
union
{
uint64_t
data_1
;
struct
{
uint8_t
nWarps
;
uint8_t
bid
;
uint8_t
nChannels
;
}
coll
;
struct
{
int16_t
peer
;
uint8_t
ngroups
:
4
;
uint8_t
connIndex
:
4
;
uint8_t
warpStart
:
4
;
uint8_t
nWarps
:
4
;
}
p2p
[
2
];
};
};
static_assert
(
sizeof
(
struct
scclCollTrace
)
==
8
*
sizeof
(
int
),
"scclCollTrace must have a pow2 size"
);
union
scclCollTraceTail
{
uint32_t
tail
;
char
padding
[
4096
];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HCU_SDMA_FEATURE
struct
sdmaQueueContext
{
hsa_sdma_info_t
*
sdmaInfo
;
uint64_t
pkgIndex
;
uint32_t
queueId
;
uint32_t
sumSdmaCopyCount
;
uint32_t
sumAllCopyCount
;
uint32_t
queueLock
;
uint32_t
minCopySize
;
uint32_t
copyCountEnable
;
uint32_t
sdmaQueueDepth
;
uint32_t
sdmaPkgLen
;
uint32_t
sdmaQueueLen
;
};
#endif
// Device-visible state for one channel.
struct alignas(16) scclDevChannel
{
    struct scclDevChannelPeer** peers;
    struct scclRing ring;
    struct scclTree tree;
    struct scclTree collnetChain;
    struct scclDirect collnetDirect;
    struct scclTree binTree;
    struct scclNvls nvls;
    uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};

// Device-visible communicator state shared by all channels.
struct scclDevComm
{
    int rank;
    int nRanks;
    int buffSizes[SCCL_NUM_PROTOCOLS];

    // Operation list for aggregation
    int workFifoDepth;
    struct scclWork* workFifoHeap; // may be cudaHost or GDR memory

    // Flag to ask SCCL kernels to abort
    volatile uint32_t* abortFlag;

    // Channels, device side
    struct scclDevChannel* channels /*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
    NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
    struct scclCollTrace* collTrace;
    union scclCollTraceTail* collTraceTail;
    pthread_t collTraceThread;
#endif
#ifdef ENABLE_PROFILING
    struct scclProf* devProf;
#endif
#if defined(ENABLE_TIMELINE)
    TimelineGpuEventContext* gpuEventContext;
#endif
#if defined(ENABLE_NPKIT) || defined(ENABLE_TIMELINE)
    uint64_t* cpuTimestamp;
#endif
};

// Communicator plus its channel array in one 16-byte-aligned allocation.
struct alignas(16) scclDevCommAndChannels
{
    struct scclDevComm comm;
    struct scclDevChannel channels[MAXCHANNELS];
};

// Host-compilable stand-in for __CUDA_ARCH__ (0 on the host).
#ifdef __CUDA_ARCH__
#define SCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define SCCL_CUDA_ARCH 0
#endif
// Variadic compile-time minimum, callable from both host and device code.
// Base case: a single value is its own minimum.
template <typename T>
__host__ __device__ constexpr T min_constexpr(T a)
{
    return a;
}
// Recursive case: fold the smaller of the first two values into the rest.
template <typename T, typename... Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts... c)
{
    return min_constexpr<T>((a < b ? a : b), c...);
}

// Variadic compile-time maximum, callable from both host and device code.
template <typename T>
__host__ __device__ constexpr T max_constexpr(T a)
{
    return a;
}
template <typename T, typename... Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts... c)
{
    return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int scclCalcUnroll(int bytePerPack, int insns, int bytes)
{
    // ceil(bytes / bytePerPack), capped at insns.
    return min_constexpr(insns, (bytes + bytePerPack - 1) / bytePerPack);
}

// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to SCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int scclCollUnroll(int cudaArch = SCCL_CUDA_ARCH)
{
    // Our collective unroll should move to the same bytes&insns model as NVLS.
    return cudaArch >= 800 ? 8 : 4;
}

// In-flight bytes per iteration for the NVLS path (arch-independent today).
__host__ __device__ constexpr int scclNvlsUnrollBytes(int cudaArch = SCCL_CUDA_ARCH)
{
    return 4 * 16;
}
// Maximum unroll (instructions) for the NVLS path.
__host__ __device__ constexpr int scclNvlsUnrollInsns(int cudaArch = SCCL_CUDA_ARCH)
{
    return 16;
}

// NVLS unroll factor for a given per-instruction access width.
__host__ __device__ constexpr int scclNvlsUnroll(int bytePerPack, int cudaArch = SCCL_CUDA_ARCH)
{
    return scclCalcUnroll(bytePerPack, scclNvlsUnrollInsns(cudaArch), scclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp: the worst case over all protocols,
// rounded up to a 16-byte multiple.
__host__ __device__ constexpr int scclShmemScratchWarpSize(int cudaArch = SCCL_CUDA_ARCH)
{
    return (max_constexpr<int>(
                /*LL    */ 0,
                /*LL128 */ (SCCL_LL128_SHMEM_ELEMS_PER_THREAD * WARP_SIZE) * sizeof(uint64_t),
                /*SIMPLE*/ (scclCollUnroll(cudaArch) * WARP_SIZE + 1) * 16,
                // NVLS needs an extra 16B to read unaligned data.
                /*NVLS  */ WARP_SIZE * (cudaArch >= 900 ? scclNvlsUnrollBytes(cudaArch) : 0) + 16)
            + 15)
           & -16; // pad to 16 bytes
}

// The amount of dynamic shmem per block (zero below compute capability 7.0).
__host__ __device__ constexpr int scclShmemDynamicSize(int cudaArch = SCCL_CUDA_ARCH)
{
    return cudaArch < 700 ? 0 : scclShmemScratchWarpSize(cudaArch) * (SCCL_MAX_NTHREADS / WARP_SIZE);
}
}
// namespace sccl
#endif
src/hardware/graph/graph.h
deleted
100644 → 0
View file @
d9d23f34
#ifndef SCCL_GRAPH_H_
#define SCCL_GRAPH_H_
// #include "topo_utils.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
#define MAX_XGMI_INTER_GPUS 4

// Result of a topology search for one algorithm; both search input
// (constraints) and output (channels found).
struct scclTopoGraph
{
    // Input / output
    int id; // ring : 0, tree : 1, collnet : 2
    int pattern;
    int crossNic;
    int collNet;
    int minChannels;
    int maxChannels;
    // Output
    int nChannels;
    float bwIntra;
    float bwInter;
    float latencyInter;
    int typeIntra;
    int typeInter;
    int sameChannels;
    int nHops;
    int intra[MAXCHANNELS * SCCL_TOPO_MAX_NODES]; // per-channel intra-node rank orders
    int inter[MAXCHANNELS * 2];                   // per-channel (in, out) net devices
    int nIntraChannels;
    int intraNets[MAXCHANNELS * SCCL_TOPO_MAX_NODES * 2];
    char treeBase[SCCL_TOPO_MAX_NODES][SCCL_TOPO_MAX_NODES * 4];
};

// Per-rank, per-channel connectivity exchanged between ranks while the
// communicator graph is being assembled.
struct scclTopoRanks
{
    int ringRecv[MAXCHANNELS];
    int ringSend[MAXCHANNELS];
    int ringPrev[MAXCHANNELS];
    int ringNext[MAXCHANNELS];
    int treeToParent[MAXCHANNELS];
    int treeToChild0[MAXCHANNELS];
    int treeToChild1[MAXCHANNELS];
    int nvlsHeads[MAXCHANNELS];
};
// struct sccl::hardware::topology::topo::scclTopoSystem;
// Sort the system topology structure.
scclResult_t scclTopoSortSystem(struct scclTopoSystem* system);
// Print the system topology structure.
scclResult_t scclTopoPrint(struct scclTopoSystem* system);
// Compute paths within the system.
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm);
// // Free the system topology structure.
// void scclTopoFree(struct scclTopoSystem* system);
// // Trim the system topology structure.
// scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm);
// // Compute point-to-point channels.
// scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm);
// // Get GPU info for the given rank.
// scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks);
// // Check whether all paths in the system go over NVLink.
// int scclTopoPathAllNVLink(struct scclTopoSystem* system);
// // Get network device info.
// scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
// // Check whether a point-to-point connection exists between two devices.
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank);
// // 检查是否使用GDR
// scclResult_t scclTopoCheckGdr(struct scclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// // 获取内部网络设备信息
// scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev);
// // 获取两个CUDA设备之间的连接类型
// scclResult_t scclTopoGetLinkType(
// struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter = MAX_XGMI_INTER_GPUS, int nInter = 0, int* inter = nullptr);
// // 检查是否需要刷新
// scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush);
// // 检查两个设备是否在同一网络中
// scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net);
// // 禁用PXE网络
// int scclPxnDisable(struct scclComm* comm);
// // 获取PXE网络中的中间节点
// scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks);
// // 获取本地节点的rank
// scclResult_t scclTopoGetLocalRank(struct scclTopoSystem* system, int rank, int* localRank);
// // 获取CPU亲和性
// scclResult_t scclTopoGetCpuAffinity(struct scclTopoSystem* system, int rank, cpu_set_t* affinity);
// // 获取CPU类型信息
// scclResult_t scclTopoCpuType(struct scclTopoSystem* system, int* arch, int* vendor, int* model);
// // 获取GPU数量
// scclResult_t scclTopoGetGpuCount(struct scclTopoSystem* system, int* count);
// // 获取NVS数量
// scclResult_t scclTopoGetNvsCount(struct scclTopoSystem* system, int* count);
// // 获取本地网络设备信息
// scclResult_t scclTopoGetLocalNet(struct scclTopoSystem* system, int rank, int channelId, int* id);
// // 获取本地GPU索引
// scclResult_t scclTopoGetLocalGpu(struct scclTopoSystem* system, int net, int* gpuIndex);
// // 初始化搜索,调用scclTopoCompute之前需要执行
// scclResult_t scclTopoSearchInit(struct scclTopoSystem* system);
// // 计算拓扑图
// scclResult_t scclTopoCompute(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // 打印拓扑图
// scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph);
// // 导出拓扑图
// scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs);
// // 设置预定义拓扑图
// scclResult_t scclTopoPreset(struct scclComm* comm, struct scclTopoGraph** graphs, struct scclTopoRanks* topoRanks);
// // 设置后处理拓扑图
// scclResult_t scclTopoPostset(
// struct scclComm* comm, int* firstRanks, int* treePatterns, struct scclTopoRanks** allTopoRanks, int* rings, struct scclTopoGraph** graphs, int nc);
// // 设置基于树的后处理拓扑图
// scclResult_t scclTreeBasePostset(struct scclComm* comm, struct scclTopoGraph* treeGraph);
// // 调整模型以适应计算能力
// scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs);
// scclResult_t scclTopoCudaPath(int cudaDev, char** path);
// #include "info.h"
// scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
#endif
src/hardware/graph/paths.cc
deleted
100644 → 0
View file @
d9d23f34
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "net.h"
#include "channel.h"
#include "xml.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
graph
{
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
// Fixed-capacity work list of topology nodes; used as the BFS frontier in
// scclTopoSetPaths below.
struct scclTopoNodeList
{
    struct scclTopoNode* list[SCCL_TOPO_MAX_NODES];
    int count;
};
// Look up the path-list entry of `node` toward the node of type `t` whose id
// is `id`. On success `*path` points at that entry; otherwise warns and
// returns scclInternalError.
static scclResult_t getPath(struct scclTopoSystem* system, struct scclTopoNode* node, int t, int64_t id, struct scclTopoLinkList** path)
{
    const int total = system->nodes[t].count;
    for (int idx = 0; idx < total; idx++)
    {
        if (system->nodes[t].nodes[idx].id != id)
            continue;
        *path = node->paths[t] + idx;
        return scclSuccess;
    }
    WARN("Could not find node of type %d id %lx", t, id);
    return scclInternalError;
}
// Breadth-first search from `baseNode`: fills in, for every node in the
// system, the best path (fewest hops, then highest bandwidth) toward
// baseNode. Paths may route through at most one GPU hop (NVLink only).
static scclResult_t scclTopoSetPaths(struct scclTopoNode* baseNode, struct scclTopoSystem* system)
{
    // Lazily allocate baseNode's own path table toward its node type.
    if (baseNode->paths[baseNode->type] == NULL)
    {
        SCCLCHECK(scclCalloc(baseNode->paths + baseNode->type, system->nodes[baseNode->type].count));
    }
    // breadth-first search to set all paths to that node in the system
    struct scclTopoNodeList nodeList;
    struct scclTopoNodeList nextNodeList;
    nodeList.count = 1;
    nodeList.list[0] = baseNode;
    nextNodeList.count = 0;
    struct scclTopoLinkList* basePath;
    SCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
    // Path from baseNode to itself: zero hops, local bandwidth.
    basePath->count = 0;
    basePath->bw = LOC_BW;
    basePath->type = PATH_LOC;
    while (nodeList.count)
    {
        nextNodeList.count = 0;
        for (int n = 0; n < nodeList.count; n++)
        {
            struct scclTopoNode* node = nodeList.list[n];
            struct scclTopoLinkList* path;
            SCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
            // Try to extend node's path to baseNode by one hop over each link.
            for (int l = 0; l < node->nlinks; l++)
            {
                struct scclTopoLink* link = node->links + l;
                struct scclTopoNode* remNode = link->remNode;
                if (remNode->paths[baseNode->type] == NULL)
                {
                    SCCLCHECK(scclCalloc(remNode->paths + baseNode->type, system->nodes[baseNode->type].count));
                }
                struct scclTopoLinkList* remPath;
                SCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
                // Bottleneck bandwidth of the extended path.
                float bw = std::min(path->bw, link->bw);
                // allow routing through a GPU only as 1 hop
                if (node != baseNode && node->type == GPU && (link->type != LINK_NVL || remNode->type != GPU || path->count > 1))
                    continue;
                // Replace remNode's path if it is unset, or longer, and the new
                // candidate has strictly more bandwidth.
                if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw)
                {
                    // Find reverse link
                    for (int l = 0; l < remNode->nlinks; l++)
                    {
                        if (remNode->links[l].remNode == node)
                        {
                            remPath->list[0] = remNode->links + l;
                            break;
                        }
                    }
                    if (remPath->list[0] == NULL)
                    {
                        WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
                             remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
                        return scclInternalError;
                    }
                    // Copy the rest of the path
                    for (int i = 0; i < path->count; i++)
                        remPath->list[i + 1] = path->list[i];
                    remPath->count = path->count + 1;
                    remPath->bw = bw;
                    // Start with path type = link type. PATH and LINK types are supposed to match.
                    // Don't consider LINK_NET as we only care about the NIC->GPU path.
                    int type = link->type == LINK_NET ? LINK_LOC : link->type;
                    // Differentiate between one and multiple PCI switches
                    if (node->type == PCI && remNode->type == PCI)
                        type = PATH_PXB;
                    // Consider a path going through the CPU as PATH_PHB
                    if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU))
                        type = PATH_PHB;
                    // Set 1 hop NVLink as NVB
                    // if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
                    remPath->type = std::max(path->type, type);
                    // Add to the list for the next iteration if not already in the list
                    // Disallow GPUs as intermediate steps for now
                    if (remNode->type != GPU)
                    {
                        int i;
                        for (i = 0; i < nextNodeList.count; i++)
                            if (nextNodeList.list[i] == remNode)
                                break;
                        if (i == nextNodeList.count)
                            nextNodeList.list[nextNodeList.count++] = remNode;
                    }
                }
            }
        }
        // Next BFS frontier.
        memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
    }
    return scclSuccess;
}
/**
 * Format one node's path table into a single line.
 *
 * @param system topology system the node belongs to
 * @param node   node whose paths are formatted
 *
 * For every reachable node the line gets "<type>/<id> (<hops>/<bw>/<pathType>) ".
 * NOTE(review): the formatted line is built but never emitted in this build —
 * presumably the logging call is compiled out; confirm against the tracing
 * configuration.
 */
static void printNodePaths(struct scclTopoSystem* system, struct scclTopoNode* node)
{
    char line[1024];
    // FIX: the original used sprintf(), which overruns `line` once the path
    // dump exceeds 1024 bytes on large systems. snprintf() bounds every write;
    // output is simply truncated instead of corrupting the stack.
    snprintf(line, sizeof(line), "%s/%lX :", topoNodeTypeStr[node->type], node->id);
    size_t offset = strlen(line);
    for (int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
    {
        if (node->paths[t] == NULL)
            continue;
        for (int n = 0; n < system->nodes[t].count; n++)
        {
            if (offset >= sizeof(line) - 1)
                return; // buffer full; nothing more can be appended
            snprintf(line + offset, sizeof(line) - offset, "%s/%lX (%d/%f/%s) ",
                     topoNodeTypeStr[t], system->nodes[t].nodes[n].id,
                     node->paths[t][n].count, node->paths[t][n].bw,
                     topoPathTypeStr[node->paths[t][n].type]);
            offset = strlen(line);
        }
    }
}
// Return (via *retCpu) the index of the CPU node reachable from GPU index
// `gpu` in the fewest hops; scclInternalError when no CPU path exists.
static scclResult_t getLocalCpu(struct scclTopoSystem* system, int gpu, int* retCpu)
{
    struct scclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
    int bestCpu = -1;
    int bestHops = 0; // 0 acts as "nothing selected yet"
    for (int c = 0; c < system->nodes[CPU].count; c++)
    {
        const int hops = paths[c].count;
        if (bestHops == 0 || hops < bestHops)
        {
            bestCpu = c;
            bestHops = hops;
        }
    }
    if (bestCpu == -1)
    {
        WARN("Error : could not find CPU close to GPU %d", gpu);
        return scclInternalError;
    }
    *retCpu = bestCpu;
    return scclSuccess;
}
// Rewrite the path from node (t1,i1) to node (t2,i2) so it goes through the
// intermediate node (tx,ix): concatenates path(src -> intermediate) with
// path(intermediate -> dst), then updates hop count, type and bandwidth.
// When the intermediate is a GPU, the combined path is tagged PATH_PXN.
static scclResult_t addInterStep(struct scclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2)
{
    // NOTE: named cpuNode but indexed by (tx,ix), which may also be a GPU —
    // see the PATH_PXN tagging below.
    struct scclTopoNode* cpuNode = system->nodes[tx].nodes + ix;
    struct scclTopoNode* srcNode = system->nodes[t1].nodes + i1;
    int l = 0;
    // Node 1 -> CPU
    for (int i = 0; i < srcNode->paths[tx][ix].count; i++)
        srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
    // CPU -> Node 2
    for (int i = 0; i < cpuNode->paths[t2][i2].count; i++)
        srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
    // Update path characteristics
    srcNode->paths[t2][i2].count = l;
    srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
    if (tx == GPU)
        srcNode->paths[t2][i2].type = PATH_PXN;
    // Bottleneck bandwidth of the two segments.
    srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw);
    return scclSuccess;
}
// Remove/free paths for a given type
static void scclTopoRemovePathType(struct scclTopoSystem* system, int nodeType)
{
    for (int t = 0; t < SCCL_TOPO_NODE_TYPES; t++)
    {
        // Drop every node's path table *toward* nodeType...
        for (int n = 0; n < system->nodes[t].count; n++)
        {
            struct scclTopoNode* cur = &system->nodes[t].nodes[n];
            free(cur->paths[nodeType]);
            cur->paths[nodeType] = NULL;
        }
        // ...and every nodeType node's path table toward type t.
        for (int n = 0; n < system->nodes[nodeType].count; n++)
        {
            struct scclTopoNode* cur = &system->nodes[nodeType].nodes[n];
            free(cur->paths[t]);
            cur->paths[t] = NULL;
        }
    }
}
// Mapping from the legacy numeric "level" environment values to the PATH_*
// constants (old levels 4 and 5 both map to PATH_SYS).
static const int levelsOldToNew[] = {PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS};

// Resolve a path-level setting from the environment, caching the result.
// *level == -1 means "not yet resolved"; on return it holds the resolved
// PATH_* value, or -2 when neither env var is set. `disableEnv` (optional)
// maps the value 1 to level 0; `levelEnv` accepts either a PATH_* name or a
// legacy numeric level.
scclResult_t scclGetLevel(int* level, const char* disableEnv, const char* levelEnv)
{
    if (*level == -1)
    {
        int l = -1;
        if (disableEnv)
        {
            char* str = getenv(disableEnv);
            if (str)
            {
                int disable = strtol(str, NULL, 0);
                if (disable == 1)
                    l = 0;
            }
        }
        if (l == -1)
        {
            char* str = getenv(levelEnv);
            if (str)
            {
                // First try to match a PATH_* name.
                for (int i = 0; i <= PATH_SYS; i++)
                {
                    if (strcmp(str, topoPathTypeStr[i]) == 0)
                    {
                        l = i;
                        break;
                    }
                }
                // Old style numbering
                // levelsOldToNew is an array with each index corresponding to the
                // "old level" int, and each value mapping to the correct value defined in topo.h
                // maxOldLevel is a quick check to handle out of bounds (based on the length of levelsOldToNew)
                if (l == -1 && str[0] >= '0' && str[0] <= '9')
                {
                    int oldLevel = strtol(str, NULL, 0);
                    const int maxOldLevel = sizeof(levelsOldToNew) / sizeof(int) - 1;
                    if (oldLevel > maxOldLevel)
                        oldLevel = maxOldLevel;
                    l = levelsOldToNew[oldLevel];
                }
            }
        }
        if (l >= 0)
            INFO(SCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
        *level = l >= 0 ? l : -2;
    }
    return scclSuccess;
}
// Tunable: SCCL_NET_GDR_READ; -2 means "auto" (see read handling below).
SCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
// Cached SCCL_NET_GDR_LEVEL setting (-1 = not yet read).
int scclTopoUserGdrLevel = -1;

// Decide whether GPU Direct RDMA can be used between GPU `busId` and network
// device `netDev`. Sets *useGdr to 1 only when both endpoints support GDR and
// the GPU<->NIC topological distance is within the configured GDR level.
scclResult_t scclTopoCheckGdr(struct scclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr)
{
    *useGdr = 0;
    // Get GPU and NET
    int n, g;
    SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
    struct scclTopoNode* net = system->nodes[NET].nodes + n;
    SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &g));
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    // Check that both the NIC and GPUs support it
    if (net->net.gdrSupport == 0)
        return scclSuccess;
    if (gpu->gpu.gdrSupport == 0)
        return scclSuccess;
    if (read)
    {
        // For reads (sends) only enable under certain conditions
        int gdrReadParam = scclParamNetGdrRead();
        if (gdrReadParam == 0)
            return scclSuccess;
        if (gdrReadParam < 0)
        {
            // Auto mode: require an NVLink-connected peer GPU (or single GPU).
            int nvlink = 0;
            // Since we don't know whether there are other communicators,
            // it's better to keep things local if we have a single GPU.
            if (system->nodes[GPU].count == 1)
                nvlink = 1;
            for (int i = 0; i < system->nodes[GPU].count; i++)
            {
                if (i == g)
                    continue;
                if (gpu->paths[GPU][i].type == PATH_NVL)
                {
                    nvlink = 1;
                    break;
                }
            }
            if (!nvlink)
                return scclSuccess;
        }
    }
    // Check if we are close enough that it makes sense to enable GDR
    int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel;
    SCCLCHECK(scclGetLevel(&scclTopoUserGdrLevel, NULL, "SCCL_NET_GDR_LEVEL"));
    if (scclTopoUserGdrLevel != -2)
        netGdrLevel = scclTopoUserGdrLevel;
    else
    {
        int arch, vendor, model;
        SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
        if (arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME)
        {
            // On Rome, relax the level to PATH_PHB when the GPU and the NIC
            // hang off the same CPU (2-hop CPU paths with matching CPU id)
            // and share the same PCI segment bits.
            int i, d1 = -1, d2 = -1;
            for (i = 0; i < system->nodes[CPU].count; i++)
                if (system->nodes[GPU].nodes[g].paths[CPU][i].count == 2)
                    break;
            if (i < system->nodes[CPU].count)
                d1 = system->nodes[CPU].nodes[i].id;
            for (i = 0; i < system->nodes[CPU].count; i++)
                if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2)
                    break;
            if (i < system->nodes[CPU].count)
                d2 = system->nodes[CPU].nodes[i].id;
            if (d1 != -1 && d2 != -1 && d1 == d2 &&
                (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000))
            {
                netGdrLevel = PATH_PHB;
            }
        }
    }
    int distance = gpu->paths[NET][n].type;
    if (distance == PATH_PXN)
    {
        // In case of PXN, use the intermediate GPU distance instead
        int proxyRank, g;
        SCCLCHECK(scclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
        SCCLCHECK(scclTopoRankToIndex(system, proxyRank, &g));
        struct scclTopoNode* proxyGpu = system->nodes[GPU].nodes + g;
        distance = proxyGpu->paths[NET][n].type;
    }
    if (distance > netGdrLevel)
    {
        INFO(SCCL_NET, "GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
        return scclSuccess;
    }
    *useGdr = 1;
    INFO(SCCL_NET, "GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
    return scclSuccess;
}
// Set to 0 to disable the flush on Hopper when using GDR
// Declares the SCCL_NET_FORCE_FLUSH tunable (default 1) read by scclTopoNeedFlush below.
SCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
// Determine whether we need to flush the GDR recv buffers
scclResult_t scclTopoNeedFlush(struct scclTopoSystem* system, int64_t busId, int* flush) {
  // Locate the GPU node that owns this PCI busId.
  int gpuIndex;
  SCCLCHECK(scclTopoIdToIndex(system, GPU, busId, &gpuIndex));
  struct scclTopoNode* gpuNode = system->nodes[GPU].nodes + gpuIndex;
  // Flush is required on Ampere and earlier (compute capability < 90);
  // on newer parts it is governed by the SCCL_NET_FORCE_FLUSH parameter.
  if (gpuNode->gpu.cudaCompCap < 90) {
    *flush = 1;
  } else {
    *flush = scclParamNetForceFlush();
  }
  return scclSuccess;
}
// SCCL_NET_DISABLE_INTRA (default 1): when 1, never pick the network transport
// for GPU pairs that live inside the same node (see scclTopoCheckNet).
SCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
// Check whether going through the network would be faster than going through P2P/SHM.
// On return *net == 1 means "prefer NET" for the pair (id1, id2); 0 means use P2P/SHM.
scclResult_t scclTopoCheckNet(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
  // Intra-node NET is disabled by default via SCCL_NET_DISABLE_INTRA.
  if (scclParamNetDisableIntra() == 1) {
    *net = 0;
    return scclSuccess;
  }
  *net = 1;
  // First check the current GPU-to-GPU speed.
  int g1, g2;
  if (scclTopoIdToIndex(system, GPU, id1, &g1) != scclSuccess ||
      scclTopoIdToIndex(system, GPU, id2, &g2) != scclSuccess) {
    // At least one endpoint is not a local GPU; keep *net = 1 (remote peer).
    return scclSuccess;
  }
  struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
  struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
  float speed = gpu1->paths[GPU][g2].bw;
  // Now check the speed each GPU can access the network through PXB or better
  float netSpeed1 = 0, netSpeed2 = 0;
  for (int n = 0; n < system->nodes[NET].count; n++) {
    struct scclTopoLinkList* path = gpu1->paths[NET] + n;
    if (path->type <= PATH_PXB && path->bw > netSpeed1) netSpeed1 = path->bw;
    path = gpu2->paths[NET] + n;
    if (path->type <= PATH_PXB && path->bw > netSpeed2) netSpeed2 = path->bw;
  }
  // Use NET only if BOTH GPUs can reach a NIC faster than they reach each other.
  if (netSpeed1 > speed && netSpeed2 > speed) return scclSuccess;
  *net = 0;
  return scclSuccess;
}
// For a GPU that reaches NIC netDev over a PXN path, return the rank of the
// NVLink-connected GPU that acts as the PCI relay. If the path is not PXN the
// GPU talks to the NIC itself, so its own rank is returned.
scclResult_t scclTopoGetIntermediateRank(struct scclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
  // Get GPU and NET
  int n, g;
  SCCLCHECK(scclTopoIdToIndex(system, NET, netDev, &n));
  SCCLCHECK(scclTopoRankToIndex(system, rank, &g));
  struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
  struct scclTopoLinkList* path = gpu->paths[NET] + n;
  if (path->type == PATH_PXN) {
    // Walk the path, skipping NVSwitch hops, until the first non-NVS node:
    // that node must be the intermediate GPU.
    struct scclTopoNode* node;
    int type = NVS;
    for (int i = 0; i < path->count && type == NVS; i++) {
      node = path->list[i]->remNode;
      type = node->type;
    }
    if (type != GPU) {
      WARN("Could not find intermediate GPU between GPU rank %d and NIC %d", rank, netDev);
      return scclInternalError;
    }
    *intermediateRank = node->gpu.rank;
  } else {
    *intermediateRank = rank;
  }
  return scclSuccess;
}
// SCCL_PXN_DISABLE (default 1): user switch to turn PXN relaying off; consulted
// (and cached) by scclPxnDisable below.
SCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
// remote proxies without risking deadlocks
// Returns 1 when PXN must not be used. The decision is computed once and cached
// in a function-local static, so the first caller's `comm` fixes the answer.
// NOTE(review): the cache is not synchronized — assumed to be first hit from a
// single-threaded init path; confirm before calling concurrently.
int scclPxnDisable(struct scclComm* comm) {
  static int pxnDisable = -1;
  if (pxnDisable == -1) {
    if (comm && scclNetVersion(comm) == 4) {
      // Forced off for v4 plugins regardless of the user parameter.
      INFO(SCCL_INIT, "PXN Disabled as plugin is v4");
      pxnDisable = 1;
    } else {
      pxnDisable = scclParamPxnDisable();
    }
  }
  return pxnDisable;
}
// Collect the distinct local ranks that act as PXN intermediates (proxies) for
// this rank's communication with every peer. On return the caller owns *intermediateRanks.
scclResult_t scclTopoGetPxnRanks(struct scclComm* comm, int** intermediateRanks, int* nranks) {
  struct scclTopoSystem* system = comm->topo;
  *nranks = 0;
  *intermediateRanks = NULL;
  if (system->nodes[NET].count == 0) return scclSuccess;
  int nr = 0;
  int* ranks = NULL;
  for (int rank = 0; rank < comm->nRanks; rank++) {
    int netDev, proxyRank;
    SCCLCHECK(scclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
    // Our own rank proxying for itself is not an intermediate.
    if (proxyRank == comm->rank) continue;
    // Only count proxies we would actually use, i.e. when GDR is possible.
    int useGdr;
    SCCLCHECK(scclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
    if (useGdr == 0) continue;
    // De-duplicate: linear scan is fine, nr stays tiny (<= local GPU count).
    int found = 0;
    for (int r = 0; r < nr; r++) {
      if (ranks[r] == proxyRank) found = 1;
    }
    if (!found) {
      SCCLCHECK(scclRealloc(&ranks, nr, nr + 1));
      ranks[nr++] = proxyRank;
    }
  }
  *nranks = nr;
  *intermediateRanks = ranks;
  return scclSuccess;
}
// If EVERY GPU has a NIC whose busId differs from the GPU id by exactly
// `distance`, force those GPU<->NIC paths to PATH_PXB and return true.
// Returns false (no changes) as soon as one GPU lacks such a NIC.
static bool rcclPathOverride(struct scclTopoSystem* system, uint64_t distance) {
  int i, j;
  // Pass 1: verify the pairing exists for all GPUs (inner break = match found;
  // outer break fires when a GPU found no matching NIC).
  for (i = 0; i < system->nodes[GPU].count; i++) {
    for (j = 0; j < system->nodes[NET].count; j++) {
      if ((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
          (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
        break;
    }
    if (j >= system->nodes[NET].count) break;
  }
  if (i >= system->nodes[GPU].count) {
    // Pass 2: all GPUs matched — apply the override to every matching pair.
    for (i = 0; i < system->nodes[GPU].count; i++) {
      for (j = 0; j < system->nodes[NET].count; j++) {
        if ((system->nodes[NET].nodes[j].net.busId - system->nodes[GPU].nodes[i].id == distance) ||
            (system->nodes[GPU].nodes[i].id - system->nodes[NET].nodes[j].net.busId == distance))
          system->nodes[GPU].nodes[i].paths[NET][j].type = PATH_PXB;
      }
    }
    return true;
  } else {
    return false;
  }
}
// RCCL_ENABLE_INTRANET: 1 forces intra-node NET usage, -2 (default) means
// "auto" — enabled only for the gfx94x 8-GPU/8-NIC platform (see scclTopoTrimSystem).
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
// Trim the detected topology down to what this communicator actually uses:
// 1) keep only GPUs in our own P2P-reachable "domain",
// 2) drop lower-speed duplicate ports of the same NIC ASIC,
// 3) drop all NICs when the job is single-node and NET is not needed.
// NOTE(review): SCCLCHECK early-returns inside this function would leak
// `domains`/`ids` (only the WARN path frees them explicitly) — matches the
// surrounding codebase's error-handling style; confirm if that is acceptable.
scclResult_t scclTopoTrimSystem(struct scclTopoSystem* system, struct scclComm* comm) {
  int* domains;
  int64_t* ids;
  SCCLCHECK(scclCalloc(&domains, system->nodes[GPU].count));
  SCCLCHECK(scclCalloc(&ids, system->nodes[GPU].count));
  int myDomain = 0;
  // Union-find-style pass: GPUs connected by better-than-NET paths share a domain
  // (the minimum index of any connected earlier GPU).
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    domains[g] = g;
    ids[g] = gpu->id;
    for (int p = 0; p < g; p++) {
      if (gpu->paths[GPU][p].type < PATH_NET) {
        domains[g] = std::min(domains[g], domains[p]);
      }
    }
    if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
  }
  // Remove every GPU that is not in our domain. Lookup is by saved id because
  // indices shift as nodes are removed.
  int ngpus = system->nodes[GPU].count;
  for (int i = 0; i < ngpus; i++) {
    if (domains[i] == myDomain) continue;
    struct scclTopoNode* gpu = NULL;
    int g;
    for (g = 0; g < system->nodes[GPU].count /* This one varies over the loops */; g++) {
      gpu = system->nodes[GPU].nodes + g;
      if (gpu->id == ids[i]) break;
      else gpu = NULL;
    }
    if (gpu == NULL) {
      WARN("Could not find id %lx", ids[i]);
      free(domains);
      free(ids);
      return scclInternalError;
    }
    SCCLCHECK(scclTopoRemoveNode(system, GPU, g));
  }
  // trim low speed port on same NIC
  for (int i = 0; i < system->nodes[NET].count; i++) {
    for (int j = 0; j < system->nodes[NET].count; j++) {
      if (i == j) continue;
      if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
        // Zero the slower port's bandwidth; it is deleted in the loop below.
        if (system->nodes[NET].nodes[i].net.bw > system->nodes[NET].nodes[j].net.bw)
          system->nodes[NET].nodes[j].net.bw = 0;
      }
    }
  }
  // Remove all NICs whose bandwidth was zeroed above (indices shift, so rescan
  // from the start after each removal).
  do {
    int n;
    for (n = 0; n < system->nodes[NET].count; n++) {
      if (system->nodes[NET].nodes[n].net.bw == 0) break;
    }
    if (n < system->nodes[NET].count) {
      SCCLCHECK(scclTopoRemoveNode(system, NET, n));
    } else
      break;
  } while (system->nodes[NET].count);
  int remove = 1;
  int gdr = 1;
  bool allXgmi = true;
  // detect if all GPUs are connected by XGMI
  for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
    int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
    for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
      if (i == j) continue;
      int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
      bool isXGMI;
      SCCLCHECK(scclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
      allXgmi &= isXGMI;
    }
  }
  if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
  // Check whether GDR works from every GPU to its closest NIC.
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    int net;
    SCCLCHECK(scclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
    SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
    if (!gdr) break;
  }
  if (gdr && !allXgmi) {
    // GDR everywhere but not full XGMI: keep the NICs around.
    remove = 0;
    system->type |= RCCL_TOPO_GDR_ALL;
    INFO(SCCL_LOG_TOPO, "GDR is available on all GPUs");
  }
  // Special handling of gfx94x
  if (rcclParamEnableIntranet() == 1 ||
      (rcclParamEnableIntranet() == -2 && IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
       system->nodes[GPU].count == 8 && system->nodes[NET].count == 8)) {
    remove = 0;
    system->type |= RCCL_TOPO_FORCE_INTRA;
  }
  comm->localRanks = system->nodes[GPU].count;
  // Single-node communicator and nothing forced NET to stay: drop all NICs.
  if (system->nodes[GPU].count == comm->nRanks && remove) {
    for (int n = system->nodes[NET].count - 1; n >= 0; n--)
      SCCLCHECK(scclTopoRemoveNode(system, NET, n));
  }
  free(domains);
  free(ids);
  return scclSuccess;
}
// Destroy a topology: drop every per-type path table, then the system itself.
void scclTopoFree(struct scclTopoSystem* system) {
  for (int nodeType = 0; nodeType < SCCL_TOPO_NODE_TYPES; ++nodeType) {
    scclTopoRemovePathType(system, nodeType);
  }
  free(system);
}
// Tunables for P2P channel counts (read in scclTopoGetNchannels /
// scclTopoComputeP2pChannels): channels per remote (network) peer, and per local peer.
SCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
SCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", 4);
// Compute how many P2P channels to use between local GPU index g and peerRank.
// *nChannels is set to -1 for "same GPU", a link-scaled count for NVLink/XGMI
// peers, 2 for other local peers, and the NET-peer parameter for remote ranks.
static scclResult_t scclTopoGetNchannels(struct scclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
  int peer;
  struct scclTopoLinkList* path = NULL;
  if (scclTopoRankToIndex(system, peerRank, &peer) == scclSuccess) {
    // Same rank
    if (g == peer) {
      *nChannels = -1;
      return scclSuccess;
    }
    // Local rank
    path = system->nodes[GPU].nodes[peer].paths[GPU] + g;
    if (path->type == PATH_NVL) {
      // Scale by how many single-link bandwidths the path aggregates;
      // gfx94x gets a 4x base factor, others 2x.
      float nvlBw = scclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
      *nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2) *
                   std::max(1, (int)(path->bw / nvlBw));
    } else {
      *nChannels = 2;
    }
  } else {
    // Remote rank, use network
    *nChannels = scclParamNChannelsPerNetPeer();
  }
  return scclSuccess;
}
// Lower/upper bounds for the total number of P2P channels a communicator may use.
SCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 4);
SCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
// Smallest power of two that is >= v (returns 1 for v <= 1).
static int nextPow2(int v) {
  int result = 1;
  for (; result < v; result <<= 1) {
  }
  return result;
}
// Decide the communicator's P2P channel counts (comm->p2pnChannels,
// comm->p2pnChannelsPerPeer) from the topology and the MIN/MAX/PER_PEER
// parameters, initialize any newly needed channels, and fill the
// bit-mirrored comm->p2pChannels[] spreading table.
scclResult_t scclTopoComputeP2pChannels(struct scclComm* comm) {
  /* here we already honor comm->max/minCTAs for p2pnChannels. */
  int MinP2pNchannels = (int)scclParamMinP2pNChannels();
  int MaxP2pNchannels = (int)scclParamMaxP2pNChannels();
  int NchannelsPerPeer = (int)scclParamNChannelsPerPeer();
  // scclTopoPathAllNVLink only reads the (fixed) topology; compute it once
  // instead of re-walking every GPU pair four times below.
  const int allNvLink = scclTopoPathAllNVLink(comm->topo);
  // On all-NVLink systems, widen defaults to 32 unless the user overrode them.
  if (allNvLink == 1 && getenv("SCCL_MIN_P2P_NCHANNELS") == NULL) MinP2pNchannels = 32;
  if (allNvLink == 1 && getenv("SCCL_MAX_P2P_NCHANNELS") == NULL) MaxP2pNchannels = 32;
  if (allNvLink == 1 && getenv("SCCL_NCHANNELS_PER_PEER") == NULL) NchannelsPerPeer = 32;
  int scclMinP2pNchannels = MinP2pNchannels;
  if (comm->sharedRes->owner != comm) {
    // Split communicator: additionally capped by the shared resources' channel count.
    comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
    comm->p2pnChannels = std::min(std::max(comm->p2pnChannels, scclMinP2pNchannels), comm->sharedRes->tpP2pNChannels);
  } else {
    comm->p2pnChannels = std::min(comm->nChannels, MaxP2pNchannels);
    comm->p2pnChannels = std::max(comm->p2pnChannels, scclMinP2pNchannels);
  }
  // We need to loop through all local GPUs to have a global picture
  int minChannels = comm->p2pnChannels;
  for (int g = 0; g < comm->topo->nodes[GPU].count; g++) {
    for (int r = 0; r < comm->nRanks; r++) {
      int nChannels;
      SCCLCHECK(scclTopoGetNchannels(comm->topo, g, r, &nChannels));
      if (nChannels >= 0) minChannels = std::min(minChannels, nChannels);
    }
  }
  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));
  // Round to next pow2 nChannelsPerPeer and nChannels
  if (getNumaMaxGpus() == 1 && !allNvLink) {
    comm->p2pnChannelsPerPeer = nextPow2(comm->p2pnChannels);
  } else {
    comm->p2pnChannelsPerPeer = (NchannelsPerPeer == -2 ? nextPow2(minChannels) : NchannelsPerPeer);
  }
  comm->p2pnChannels = nextPow2(comm->p2pnChannels);
  // Init channels that weren't used so far
  for (int c = comm->nChannels; c < std::max(comm->nChannels, comm->p2pnChannels); c++)
    SCCLCHECK(initChannel(comm, c));
  // We want to spread channels used when there aren't many and progressively
  // fill the whole space of nChannels. To do so we mirror the bits in the
  // nChannels space.
  for (int c = 0; c < comm->p2pnChannels; c++) {
    int mirror = 0;
    for (int b = 1, mb = (comm->p2pnChannels >> 1); b < comm->p2pnChannels; b <<= 1, mb >>= 1)
      if (c & b) mirror |= mb;
    comm->p2pChannels[c] = mirror;
  }
  return scclSuccess;
}
// Return the ranks of all GPUs that `rank` reaches over a PATH_NVB
// (NVLink-through-intermediate-GPU) path. Caller owns the *ranks array.
scclResult_t scclTopoGetNvbGpus(struct scclTopoSystem* system, int rank, int* nranks, int** ranks) {
  int ngpus = system->nodes[GPU].count;
  // Allocate for the worst case (every GPU is an NVB peer).
  SCCLCHECK(scclCalloc(ranks, ngpus));
  int nvbGpus = 0;
  for (int g = 0; g < ngpus; g++) {
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    if (gpu->gpu.rank != rank) continue;
    for (int p = 0; p < ngpus; p++) {
      if (gpu->paths[GPU][p].type == PATH_NVB) {
        (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank;
      }
    }
  }
  *nranks = nvbGpus;
  return scclSuccess;
}
// Returns 1 when every GPU pair is linked by a path better than PATH_PIX
// (i.e. all-NVLink connectivity), 0 otherwise.
int scclTopoPathAllNVLink(struct scclTopoSystem* system) {
  int worstBest = PATH_DIS;
  const int gpuCount = system->nodes[GPU].count;
  for (int src = 0; src < gpuCount; ++src) {
    struct scclTopoLinkList* gpuPaths = system->nodes[GPU].nodes[src].paths[GPU];
    for (int dst = 0; dst < gpuCount; ++dst) {
      if (src == dst) continue;
      if (gpuPaths[dst].type < worstBest) worstBest = gpuPaths[dst].type;
    }
  }
  return worstBest >= PATH_PIX ? 0 : 1;
}
}
// namespace graph
// Debug helper: dump the computed paths of every GPU node, then every NET node.
scclResult_t scclTopoPrintPaths(struct scclTopoSystem* system) {
  for (int g = 0; g < system->nodes[GPU].count; ++g)
    graph::printNodePaths(system, system->nodes[GPU].nodes + g);
  for (int n = 0; n < system->nodes[NET].count; ++n)
    graph::printNodePaths(system, system->nodes[NET].nodes + n);
  return scclSuccess;
}
int
scclTopoUserP2pLevel
=
-
1
;
// Decide whether GPUs id1 and id2 can use P2P. Outputs: *p2p (required),
// *read (optional; P2P-read eligibility) and *intermediateRank (optional;
// relay GPU rank when the path routes through another GPU, else -1).
scclResult_t scclTopoCheckP2p(struct scclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int* read, int* intermediateRank) {
  *p2p = 0;
  if (read) *read = 0;
  if (intermediateRank) *intermediateRank = -1;
  // Get GPUs from topology
  int g1, g2;
  SCCLCHECK(scclTopoIdToIndex(system, GPU, id1, &g1));
  struct scclTopoNode* gpu1 = system->nodes[GPU].nodes + g1;
  if (scclTopoIdToIndex(system, GPU, id2, &g2) == scclInternalError) {
    // GPU not found, we can't use p2p.
    return scclSuccess;
  }
  // NOTE(review): intermediateIndex is computed but never read afterwards.
  int intermediateIndex = -1;
  // Set intermediate GPU rank, if routing through an intermediate GPU.
  struct scclTopoLinkList* path = gpu1->paths[GPU] + g2;
  if (path->count == 2) {
    struct scclTopoNode* intermediateNode = path->list[0]->remNode;
    if (intermediateNode->type == GPU) {
      intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
      if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
    }
  }
  // In general, use P2P whenever we can.
  int p2pLevel = PATH_SYS;
  // User override
  if (scclTopoUserP2pLevel == -1)
    SCCLCHECK(scclGetLevel(&scclTopoUserP2pLevel, "SCCL_P2P_DISABLE", "SCCL_P2P_LEVEL"));
  if (scclTopoUserP2pLevel != -2) {
    p2pLevel = scclTopoUserP2pLevel;
    goto compare;
  }
  // Don't use P2P through ARM CPUs
  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
  if (arch == SCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
  if (arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
    p2pLevel = PATH_PXB;
  }
  if (arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    p2pLevel = PATH_PXB;
  }
compare:
  // Compute the PCI distance and compare with the p2pLevel.
  if (path->type <= p2pLevel) *p2p = 1;
  if (path->type == PATH_NVL) {
    struct scclTopoNode* gpu2 = system->nodes[GPU].nodes + g2;
    // Enable P2P Read for Ampere/NVLink only
    if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
  }
  return scclSuccess;
}
// (Re)compute all inter-node paths in the topology, then patch them for the
// cases where the direct path cannot be used: P2P disabled (route via CPU),
// unreachable peers (mark PATH_NET), platform-specific overrides, PXN relays,
// and non-GDR NICs (route via the GPU's local CPU).
scclResult_t scclTopoComputePaths(struct scclTopoSystem* system, struct scclComm* comm) {
  // Precompute paths between GPUs/NICs.
  // Remove everything in case we're re-computing
  for (int t = 0; t < SCCL_TOPO_NODE_TYPES; t++) graph::scclTopoRemovePathType(system, t);
  // Set direct paths to CPUs. We need them in many cases.
  for (int c = 0; c < system->nodes[CPU].count; c++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[CPU].nodes + c, system));
  }
  // Set direct paths to GPUs.
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[GPU].nodes + g, system));
  }
  // Set direct paths to NICs.
  for (int n = 0; n < system->nodes[NET].count; n++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NET].nodes + n, system));
  }
  // Set direct paths to NVSwitches.
  for (int n = 0; n < system->nodes[NVS].count; n++) {
    SCCLCHECK(graph::scclTopoSetPaths(system->nodes[NVS].nodes + n, system));
  }
  // Update path for GPUs when we don't want to / can't use GPU Direct P2P
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    for (int p = 0; p < system->nodes[GPU].count; p++) {
      int p2p;
      SCCLCHECK(scclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
      if (p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
        SCCLCHECK(getLocalCpu(system, g, &cpu));
        SCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
      }
    }
    if (comm == NULL) continue;
    // Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
    struct scclPeerInfo* dstInfo = comm->peerInfo + system->nodes[GPU].nodes[g].gpu.rank;
    for (int p = 0; p < system->nodes[GPU].count; p++) {
      if (p == g) continue;
      struct scclPeerInfo* srcInfo = comm->peerInfo + system->nodes[GPU].nodes[p].gpu.rank;
      int p2p;
      SCCLCHECK(scclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
      if (p2p == 0) {
        int shm;
        SCCLCHECK(scclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
        if (shm == 0) {
          // Mark this peer as inaccessible. We'll trim it later.
          system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
        }
      }
    }
  }
  // Special handling of gfx94x
  // Outside TOPO_EXPL builds the override only applies on Hyper-V guests
  // (Azure-style virtualized topologies), detected via the BIOS version string.
#if !defined(TOPO_EXPL)
  char strValue[1024];
  SCCLCHECK(scclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
  if (strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
#endif
    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
    if (arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_INTEL &&
        IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
        ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
         (system->nodes[GPU].count != system->nRanks))) {
      // Try the wider GPU<->NIC busId distance first, then the narrow one.
      if (!rcclPathOverride(system, 0x100000)) rcclPathOverride(system, 0x1000);
    }
#if !defined(TOPO_EXPL)
  }
#endif
  // Update paths for NICs (no GPU Direct, PXN, ...)
  for (int n = 0; n < system->nodes[NET].count; n++) {
    struct scclTopoNode* netNode = system->nodes[NET].nodes + n;
    for (int g = 0; g < system->nodes[GPU].count; g++) {
      // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
      struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
      if (scclPxnDisable(comm) != 1) {
        int localGpuIndex;
        SCCLCHECK(scclTopoGetLocalGpu(system, system->nodes[NET].nodes[n].id, &localGpuIndex));
        if (localGpuIndex != g && localGpuIndex != -1) {
          // PXN = PCI + NVLink.
          struct scclTopoNode* peerNode = system->nodes[GPU].nodes + localGpuIndex;
          // Only use PXN for NIC n if remote GPU p ...
          if (peerNode->paths[NET][n].type <= PATH_PXB &&             // Is connected to the NIC through PCI
              peerNode->paths[GPU][g].type <= PATH_NVL &&             // Is connected to us through NVLink
              (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw ||  // Has either higher BW to that NIC
               gpu->paths[NET][n].type > PATH_PXB))                   // or avoids going through a CPU
            // We can use that GPU as relay to communicate with that NIC.
            // Only enabling it in the GPU->NIC direction for now to favor
            // receiving locally and sending remotely (consistent with net.cc)
            SCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
        }
      }
      // Update path when we dont want to / can't use GPU Direct RDMA.
      int gdr;
      SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
      if (gdr == 0) {
        // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
        int localCpu;
        SCCLCHECK(getLocalCpu(system, g, &localCpu));
        SCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
        SCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
      }
    }
  }
  return scclSuccess;
}
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/rings.cc
deleted
100644 → 0
View file @
d9d23f34
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN + 5 * MAXWIDTH)
// Log one line: `prefix` followed by up to MAXWIDTH right-aligned values.
// Fix vs. original: strncpy() copied at most PREFIXLEN (15) characters of the
// prefix while the values were written at the full strlen(prefix) offset, so a
// prefix longer than 15 chars (as passed by scclBuildRings) left a stray gap,
// and a very long prefix could write past the end of `line`. We now copy and
// offset by the same clamped length, which also bounds all writes.
void dumpLine(int* values, int nranks, const char* prefix) {
  char line[STRLENGTH + 1];
  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  size_t prefixlen = strlen(prefix);
  // Leave room for MAXWIDTH value fields of 4 chars each (plus the NUL).
  const size_t maxPrefix = STRLENGTH - 4 * MAXWIDTH;
  if (prefixlen > maxPrefix) prefixlen = maxPrefix;
  memcpy(line, prefix, prefixlen);
  for (int i = 0; i < nranks && i < MAXWIDTH; i++)
    sprintf(line + prefixlen + 4 * i, " %3d", values[i]);
  INFO(SCCL_INIT, "%s", line);
}
// Expand the per-channel `next` table into explicit ring orderings in `rings`
// (nrings x nranks, each row starting at `rank`), then validate that every
// ring closes back on `rank` and contains every rank exactly once.
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  for (int r = 0; r < nrings; r++) {
    char prefix[40];
    /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
    dumpLine(prev+r*nranks, nranks, prefix);
    sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
    dumpLine(next+r*nranks, nranks, prefix);*/
    // Walk the successor chain starting at our own rank.
    int current = rank;
    for (int i = 0; i < nranks; i++) {
      rings[r * nranks + i] = current;
      current = next[r * nranks + current];
    }
    sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
    // Only rank 0 logs, to avoid nranks identical lines.
    if (rank == 0) dumpLine(rings + r * nranks, nranks, prefix);
    if (current != rank) {
      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
      return scclInternalError;
    }
    // Check that all ranks are there
    for (int i = 0; i < nranks; i++) {
      int found = 0;
      for (int j = 0; j < nranks; j++) {
        if (rings[r * nranks + j] == i) {
          found = 1;
          break;
        }
      }
      if (found == 0) {
        WARN("Error : ring %d does not contain rank %d", r, i);
        return scclInternalError;
      }
    }
  }
  return scclSuccess;
}
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/rings.h
deleted
100644 → 0
View file @
d9d23f34
/*************************************************************************
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
// Expand per-channel prev/next tables into explicit ring orderings in `rings`
// and validate each ring (closure and completeness); see rings.cc.
scclResult_t scclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/rome_models.cc
deleted
100644 → 0
View file @
d9d23f34
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include <algorithm>
#include <string.h>
#include "rome_models.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
// Descriptor for one known "Rome"-class platform layout. Instances below are
// hard-coded tables matched against the detected topology.
struct scclRomeModel {
  int nGpus;                                                   // number of GPUs in the model
  int nCpus;                                                   // number of CPU (NUMA) nodes
  int nNics;                                                   // number of NICs
  int nLinks;                                                  // xGMI links per GPU (ones per connMatrix row)
  int64_t gpuIds[SCCL_TOPO_MAX_NODES];                         // GPU PCI bus ids, model order
  int64_t nicIds[SCCL_TOPO_MAX_NODES];                         // NIC PCI bus ids
  int64_t gpuNuma[SCCL_TOPO_MAX_NODES];                        // NUMA node of each GPU
  int64_t nicNuma[SCCL_TOPO_MAX_NODES];                        // NUMA node of each NIC
  uint8_t connMatrix[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES]; // nGpus x nGpus direct-link adjacency
  uint8_t gdrLevel[SCCL_TOPO_MAX_NODES * SCCL_TOPO_MAX_NODES]; // per GPU/NIC pair path type for GDR eligibility
  const char* pattern;   // two digits per CPU node: <#GPUs><#NICs> on that node
  const char* ringBase;  // '|'-separated GPU-index sequences describing base rings
  const char* options;   // extra model options (unused/empty in the models below)
  const char* treeBase;  // optional base tree description (unset in the models below)
};
// Model 22: 8 GPUs on 4 NUMA nodes, 1 NIC (on node 2), 2 links per GPU.
static struct scclRomeModel rome_model_22 = {
    .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
    .gpuIds = {0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000,},
    .nicIds = {0xe1000,},
    .gpuNuma = {1, 0, 1, 2, 3, 1, 2, 3,},
    .nicNuma = {2,},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link
    // (symmetric, nLinks ones per row).
    .connMatrix = {0, 1, 0, 0, 0, 0, 1, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 0, 1, 0,
                   0, 0, 0, 0, 0, 1, 0, 1,
                   0, 0, 0, 0, 0, 1, 0, 1,
                   0, 0, 0, 1, 1, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 0, 0, 1, 1, 0, 0, 0,},
    // One entry per GPU for the single NIC: best PCI path type used for GDR checks.
    .gdrLevel = {PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS,},
    .pattern = "10302120",
    .ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
    .options = "",
};
// Model 25: 8 GPUs on 4 NUMA nodes, 2 NICs (nodes 0 and 3), 2 links per GPU.
static struct scclRomeModel rome_model_25 = {
    .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {0x61000, 0xa1000,},
    .gpuNuma = {0, 1, 1, 1, 2, 2, 2, 3,},
    .nicNuma = {0, 3,},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 0, 0, 0, 0, 1, 1, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   0, 0, 0, 0, 0, 1, 1, 0,},
    // Per (GPU, NIC) pair path types (8 GPUs x 2 NICs): only GPU0/NIC0 and
    // GPU7/NIC1 are close enough for PATH_PHB.
    .gdrLevel = {PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,},
    .pattern = "11303011",
    .ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
    .options = "",
};
// Model 27: same GPU/NIC placement as model 25, different link wiring.
static struct scclRomeModel rome_model_27 = {
    .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {0x61000, 0xa1000,},
    .gpuNuma = {0, 1, 1, 1, 2, 2, 2, 3,},
    .nicNuma = {0, 3,},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 0, 0, 1, 0, 0, 1, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   0, 0, 0, 1, 0, 0, 1, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 1, 0, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 1, 0, 0,},
    // Per (GPU, NIC) pair path types (8 GPUs x 2 NICs).
    .gdrLevel = {PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB,},
    .pattern = "11303011",
    .ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
    .options = "",
};
// Model 29: 8 GPUs on 4 NUMA nodes, 1 NIC (node 2), 3 links per GPU — two
// fully-connected 4-GPU cliques ({0,1,2,3} and {4,5,6,7}).
static struct scclRomeModel rome_model_29 = {
    .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000,},
    .nicIds = {0xe1000,},
    .gpuNuma = {0, 1, 1, 1, 2, 2, 3, 3,},
    .nicNuma = {2,},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 1, 1, 1, 0, 0, 0, 0,
                   1, 0, 1, 1, 0, 0, 0, 0,
                   1, 1, 0, 1, 0, 0, 0, 0,
                   1, 1, 1, 0, 0, 0, 0, 0,
                   0, 0, 0, 0, 0, 1, 1, 1,
                   0, 0, 0, 0, 1, 0, 1, 1,
                   0, 0, 0, 0, 1, 1, 0, 1,
                   0, 0, 0, 0, 1, 1, 1, 0,},
    // One entry per GPU for the single NIC.
    .gdrLevel = {PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,},
    .pattern = "10302120",
    .ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
    .options = "",
};
// Model 31: 8 GPUs spread over 8 NUMA nodes, 2 NICs (nodes 0 and 6), 2 links per GPU.
static struct scclRomeModel rome_model_31 = {
    .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {0x61000, 0xa1000,},
    .gpuNuma = {1, 2, 2, 3, 4, 5, 5, 7,},
    .nicNuma = {0, 6,},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 0, 0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 1, 1, 0, 0,
                   0, 0, 0, 0, 1, 1, 0, 0,},
    // Per (GPU, NIC) pair path types: all system-level on this layout.
    .gdrLevel = {PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,},
    .pattern = "0110201010200110",
    .ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
    .options = "",
};
// Model 33: same placement as model 31, alternative link wiring.
static struct scclRomeModel rome_model_33 = {
    .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {0x61000, 0xa1000,},
    .gpuNuma = {1, 2, 2, 3, 4, 5, 5, 7,},
    .nicNuma = {0, 6,},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 0, 0, 1, 0, 0, 1, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   0, 0, 0, 1, 0, 0, 1, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 1, 0, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 1, 0, 0,},
    // Per (GPU, NIC) pair path types: all system-level on this layout.
    .gdrLevel = {PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                 PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,},
    .pattern = "0110201010200110",
    .ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
    .options = "",
};
// Model 30: NIC-less variant of model 31 (8 GPUs over 8 NUMA nodes, no NICs).
static struct scclRomeModel rome_model_30 = {
    .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {},
    .gpuNuma = {1, 2, 2, 3, 4, 5, 5, 7,},
    .nicNuma = {},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 0, 0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 1, 1, 0, 0,
                   0, 0, 0, 0, 1, 1, 0, 0,},
    .gdrLevel = {},
    .pattern = "0010201010200010",
    .ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
    .options = "",
};
// Model 32: NIC-less variant of model 33.
static struct scclRomeModel rome_model_32 = {
    .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {},
    .gpuNuma = {1, 2, 2, 3, 4, 5, 5, 7,},
    .nicNuma = {},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 0, 0, 1, 0, 0, 1, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   0, 0, 0, 1, 0, 0, 1, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 1, 0, 0,
                   0, 0, 0, 0, 1, 0, 0, 1,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 0, 0, 1, 0, 0,},
    .gdrLevel = {},
    .pattern = "0010201010200010",
    .ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
    .options = "",
};
// Model 24: 8 GPUs on 4 NUMA nodes, no NICs, square-wired link topology.
static struct scclRomeModel rome_model_24 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
    .gpuIds = {0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000,},
    .nicIds = {},
    .gpuNuma = {0, 1, 1, 1, 2, 2, 2, 3,},
    .nicNuma = {},
    // 8x8 adjacency: entry [i][j] = 1 when GPUs i and j share a direct link.
    .connMatrix = {0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 1, 0, 1, 0, 0, 0, 0,
                   1, 0, 1, 0, 0, 0, 0, 0,
                   0, 0, 0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 1, 1, 0, 0,
                   0, 0, 0, 0, 1, 1, 0, 0,},
    .gdrLevel = {},
    .pattern = "10303010",
    .ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
    .options = "",
};
// Topology table: 4-NUMA variant with the rome_model_32 wiring.
static struct scclRomeModel rome_model_26 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
    .nicIds = {},
    .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0,
    },
    .gdrLevel = {},
    .pattern = "10303010",
    .ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
    .options = "",
};
// Topology table: 8 GPUs across 4 NUMA nodes, 2 links/GPU, no NICs.
static struct scclRomeModel rome_model_23 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = {},
    .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "10302020",
    .ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
    .options = "",
};
// Topology table: 7-NUMA variant with the same adjacency as rome_model_23.
static struct scclRomeModel rome_model_38 = {
    .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = {},
    .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "10201000201010",
    .ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
    .options = "",
};
// Topology table: 3 links/GPU; adjacency forms two fully-connected quads
// {0,1,2,3} and {4,5,6,7} with no direct links between the quads.
static struct scclRomeModel rome_model_28 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = {},
    .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "10302020",
    .ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
    .options = "",
};
// Topology table: single-NIC variant. gdrLevel has one PATH_* entry per GPU
// (8 entries, 1 NIC) — presumably the PCIe path class between the NIC and
// each GPU for GPUDirect decisions; confirm against scclRomeModel.
static struct scclRomeModel rome_model_40 = {
    .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = { 0xe1000, },
    .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
    .nicNuma = { 2, },
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1,
        1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 1,
        1, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 0, 1, 0,
    },
    // NIC0 -> GPU path class per GPU; only GPUs 4 and 5 are PHB-reachable.
    .gdrLevel = {
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
    },
    .pattern = "10302120",
    .ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
    .options = "",
};
// Topology table: 7-NUMA, single-NIC variant with the rome_model_40 wiring;
// the NIC reaches every GPU only via PATH_SYS here.
static struct scclRomeModel rome_model_42 = {
    .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = { 0xe1000, },
    .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
    .nicNuma = { 4, },
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1,
        1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 1,
        1, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 0, 1, 0,
    },
    // NIC0 -> GPU path class per GPU.
    .gdrLevel = {
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
    },
    .pattern = "10201001201010",
    .ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
    .options = "",
};
// Topology table: single-NIC, two fully-connected quads {0..3} / {4..7}.
static struct scclRomeModel rome_model_44 = {
    .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
    .gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, },
    .nicIds = { 0xc4000, },
    .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    .nicNuma = { 2, },
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    // NIC0 -> GPU path class per GPU; GPUs 4 and 5 are PHB-reachable.
    .gdrLevel = {
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
    },
    .pattern = "20202120",
    .ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
    .options = "",
};
// Topology table: 7-NUMA, NIC-less, two fully-connected quads.
static struct scclRomeModel rome_model_45 = {
    .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = {},
    .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "10201000201010",
    .ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
    .options = "",
};
// Topology table: rome_model_45 plus one NIC on NUMA node 4 (SYS path to all GPUs).
static struct scclRomeModel rome_model_46 = {
    .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
    .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
    .nicIds = { 0xe1000, },
    .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
    .nicNuma = { 4, },
    // 8x8 symmetric adjacency, row-major: two fully-connected quads.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    // NIC0 -> GPU path class per GPU.
    .gdrLevel = {
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
    },
    .pattern = "10201001201010",
    .ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
    .options = "",
};
// Topology table: different PCI ID set; two fully-connected quads, no NICs.
static struct scclRomeModel rome_model_48 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
    .nicIds = {},
    .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "20202020",
    // Same two orderings listed twice in each direction.
    .ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
    .options = "",
};
// Topology table: rome_model_48 hardware plus 4 NICs (one per NUMA node).
// gdrLevel holds nNics*nGpus (32) entries — presumably row-major per NIC;
// each NIC is PXB-close to the two GPUs on its own NUMA node.
static struct scclRomeModel rome_model_49 = {
    .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
    .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
    .nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
    .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    .nicNuma = { 0, 1, 2, 3, },
    // 8x8 symmetric adjacency, row-major: two fully-connected quads.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    // One 8-entry row per NIC (NIC0..NIC3).
    .gdrLevel = {
        PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
    },
    .pattern = "21212121",
    // "Nk" tokens presumably name NIC k as the ring's entry/exit point.
    .ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
    .options = "",
};
// Topology table: single-NUMA box (all GPUs on node 0); every GPU has
// exactly three peers.
static struct scclRomeModel rome_model_52 = {
    .nGpus = 8, .nCpus = 1, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0xc1000, 0xc5000, 0xc9000, 0xcd000, 0xd1000, 0xd5000, 0xd9000, 0xdd000, },
    .nicIds = {},
    .gpuNuma = { 0, 0, 0, 0, 0, 0, 0, 0, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "80",
    .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
    .options = "",
};
// Topology table: rome_model_49 layout with GPUs/NICs on odd NUMA nodes.
static struct scclRomeModel rome_model_53 = {
    .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
    .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
    .nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
    .gpuNuma = { 1, 1, 3, 3, 5, 5, 7, 7, },
    .nicNuma = { 1, 3, 5, 7, },
    // 8x8 symmetric adjacency, row-major: two fully-connected quads.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    // One 8-entry row per NIC (NIC0..NIC3): each NIC is PXB-close to its
    // NUMA-local GPU pair, SYS otherwise.
    .gdrLevel = {
        PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB,
    },
    .pattern = "21212121",
    .ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
    .options = "",
};
// Topology table: the only model in this chunk carrying an explicit treeBase
// (enabled via options "treeDefined=1"). Adjacent string literals below are
// standard C++ literal concatenation — kept split exactly as generated.
static struct scclRomeModel rome_model_43 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, },
    .nicIds = {},
    .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major: two fully-connected quads.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "20202020",
    // Four base orderings repeated three times.
    .ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1|0 1 2 3 4 5 6 "
                "7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
    .options = "treeDefined=1",
    // '|'-separated parenthesized trees over GPU indices.
    .treeBase = "(2(5(6(7(4))))(3(0(1))))|(2(5(7(6(4))))(0(1(3))))|(2(5(7(4(6))))(1(3(0))))|(6(1(0(2(3))))(7(4(5))))|(6(1(2(0(3))))(4(5(7))))|(6(1(0(3(2))))(5(7(4))))|"
                "(1(6(7(5(4))))(2(3(0))))|(1(6(4(7(5))))(3(2(0))))|(1(6(5(4(7))))(3(0(2))))|(5(2(3(1(0))))(4(6(7))))|(5(2(0(3(1))))(6(4(7))))|(5(2(1(0(3))))(4(7(6))))",
};
// Topology table: synthetic-looking bus IDs (0x100000..0x800000); two
// fully-connected quads, no NICs.
static struct scclRomeModel rome_model_55 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0x100000, 0x200000, 0x300000, 0x400000, 0x500000, 0x600000, 0x700000, 0x800000, },
    .nicIds = {},
    .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "20202020",
    .ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|2 3 0 1 6 7 4 5|5 4 7 6 1 0 3 2",
    .options = "",
};
// Topology table: 16-GPU system, 4 links/GPU, no NICs. connMatrix is
// 16x16 row-major and symmetric; link weights are 1, 2 or 4 (multi-link
// connections). Enables MSCCL / pivot all-to-all tuning via options and
// carries an explicit treeBase ("treeDefined=1").
static struct scclRomeModel rome_model_56 = {
    .nGpus = 16, .nCpus = 4, .nNics = 0, .nLinks = 4,
    .gpuIds = {
        0x4e000, 0x51000, 0x56000, 0x59000, 0xe000, 0x11000, 0x16000, 0x19000,
        0xcf000, 0xd2000, 0xd7000, 0xda000, 0x8f000, 0x92000, 0x97000, 0x9a000,
    },
    .nicIds = {},
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, },
    .nicNuma = {},
    // 16x16 symmetric adjacency, row-major; entry value = link count.
    .connMatrix = {
        0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2,
        0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
    },
    .gdrLevel = {},
    .pattern = "40404040",
    // Adjacent literals concatenate into one '|'-separated ring list.
    .ringBase = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 "
                "6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0",
    .options = "pivotA2AEnabled=1,pivotA2ANumBiRings=3,tuning=1,mscclEnabled=1,treeDefined=1",
    // '|'-separated parenthesized trees over GPU indices 0..15.
    .treeBase = "(0(1(3(2(6(7(15(14(10))))))))(4(5(13(12(8(9(11))))))))|(2(3(7(6(13(12(8(9(10))))))))(1(0(4(5(14(15(11))))))))|(14(15(11(10(8(9(13(12(4))))))))"
                "(6(7(3(2(0(1(5))))))))|(10(11(9(8(12(13(5(4(0))))))))(14(15(7(6(2(3(1))))))))|(10(11(15(14(5(4(0(1(2))))))))(9(8(12(13(6(7(3))))))))|(4(5(1(0("
                "2(3(7(6(14))))))))(12(13(9(8(10(11(15))))))))|(6(7(15(14(10(11(9(8(12))))))))(2(3(1(0(4(5(13))))))))|(13(12(8(9(10(11(15(14(5))))))))(6(7(3(2("
                "1(0(4))))))))|(8(9(13(12(4(5(1(0(2))))))))(10(11(15(14(6(7(3))))))))|(12(13(5(4(0(1(3(2(6))))))))(8(9(11(10(14(15(7))))))))|(5(4(0(1(2(3(7(6("
                "13))))))))(14(15(11(10(9(8(12))))))))|(2(3(7(6(14(15(11(10(8))))))))(0(1(5(4(12(13(9))))))))",
};
// Topology table: 3-NUMA variant with the same adjacency and rings as
// rome_model_52.
static struct scclRomeModel rome_model_58 = {
    .nGpus = 8, .nCpus = 3, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
    .nicIds = {},
    .gpuNuma = { 3, 3, 1, 1, 0, 0, 0, 0, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "402020",
    .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
    .options = "",
};
// Topology table: the rome_model_56 16-GPU fabric plus 8 NICs (two per NUMA
// node). gdrLevel holds nNics*nGpus (128) entries — one 16-entry row per
// NIC; each NIC is PXB-close to the four GPUs of its NUMA node. ringBase
// threads each ring between two NICs ("Nk" endpoints).
static struct scclRomeModel rome_model_59 = {
    .nGpus = 16, .nCpus = 4, .nNics = 8, .nLinks = 4,
    .gpuIds = {
        0x4e000, 0x51000, 0x56000, 0x59000, 0xe000, 0x11000, 0x16000, 0x19000,
        0xcf000, 0xd2000, 0xd7000, 0xda000, 0x8f000, 0x92000, 0x97000, 0x9a000,
    },
    .nicIds = { 0x4b000, 0x5a000, 0xb000, 0x1a000, 0xcc000, 0xdb000, 0x8c000, 0x9b000, },
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, },
    .nicNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    // 16x16 symmetric adjacency, row-major; entry value = link count.
    .connMatrix = {
        0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2,
        0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
    },
    // One 16-entry row per NIC (NIC0..NIC7).
    .gdrLevel = {
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
    },
    .pattern = "42424242",
    // Adjacent literals concatenate into one '|'-separated ring list.
    .ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
                "10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
                "14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
                "0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
                "4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
                "10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
                "2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
                "15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
                "3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
                "N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
    .options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
// Topology table: rome_model_58 hardware with GPUs 6/7 moved to NUMA node 2.
static struct scclRomeModel rome_model_62 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
    .nicIds = {},
    .gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major (same wiring as rome_model_52/58).
    .connMatrix = {
        0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0,
    },
    .gdrLevel = {},
    .pattern = "20202020",
    .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
    .options = "",
};
// Topology table: rome_model_62 hardware plus 4 NICs, one per GPU pair;
// each NIC is PHB-close to its local pair (see gdrLevel rows).
static struct scclRomeModel rome_model_63 = {
    .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
    .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
    .nicIds = { 0xc5000, 0xcd000, 0xd5000, 0xdd000, },
    .gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, },
    .nicNuma = { 3, 1, 0, 2, },
    // 8x8 symmetric adjacency, row-major.
    .connMatrix = {
        0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 1, 0,
    },
    // One 8-entry row per NIC (NIC0..NIC3).
    .gdrLevel = {
        PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB,
    },
    .pattern = "21212121",
    // Adjacent literals concatenate; four NIC-anchored rings listed twice.
    .ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
                "1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
    .options = "tuning=3",
};
// Topology table: identical hardware and rings to rome_model_59; the only
// visible difference is gdrLevel reporting PATH_PHB (instead of PATH_PXB)
// between each NIC and its NUMA-local GPU quad.
static struct scclRomeModel rome_model_65 = {
    .nGpus = 16, .nCpus = 4, .nNics = 8, .nLinks = 4,
    .gpuIds = {
        0x4e000, 0x51000, 0x56000, 0x59000, 0xe000, 0x11000, 0x16000, 0x19000,
        0xcf000, 0xd2000, 0xd7000, 0xda000, 0x8f000, 0x92000, 0x97000, 0x9a000,
    },
    .nicIds = { 0x4b000, 0x5a000, 0xb000, 0x1a000, 0xcc000, 0xdb000, 0x8c000, 0x9b000, },
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, },
    .nicNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
    // 16x16 symmetric adjacency, row-major; entry value = link count.
    .connMatrix = {
        0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        4, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 4, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 2,
        0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 4, 0,
    },
    // One 16-entry row per NIC (NIC0..NIC7).
    .gdrLevel = {
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
    },
    .pattern = "42424242",
    // Adjacent literals concatenate into one '|'-separated ring list.
    .ringBase = "N4 9 8 12 13 5 4 0 1 3 2 6 7 15 14 10 11 N5|N1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 1 N0|N3 7 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 N7|N7 15 14 "
                "10 11 9 8 12 13 5 4 0 1 3 2 6 7 N3|N5 11 10 14 15 7 6 2 3 1 0 4 5 13 12 8 9 N4|N0 1 0 4 5 13 12 8 9 11 10 14 15 7 6 2 3 N1|N3 6 7 3 2 1 0 4 5 "
                "14 15 11 10 9 8 12 13 N6|N7 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 5 N2|N2 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 14 N7|N6 13 12 8 9 10 11 15 14 5 4 "
                "0 1 2 3 7 6 N3|N4 8 9 13 12 4 5 1 0 2 3 7 6 14 15 11 10 N5|N5 10 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 N4|N6 12 13 9 8 10 11 15 14 6 7 3 2 0 1 5 "
                "4 N2|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N1 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 N0|N0 0 1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 N1|N5 "
                "10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N3 6 7 15 14 10 11 9 8 12 13 5 4 0 1 3 2 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N7 14 15 7 6 "
                "2 3 1 0 4 5 13 12 8 9 11 10 N5|N0 0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 N2|N4 8 9 10 11 15 14 5 4 0 1 2 3 7 6 13 12 N6|N3 7 6 13 12 8 9 10 11 "
                "15 14 5 4 0 1 2 3 N1|N1 3 2 1 0 4 5 14 15 11 10 9 8 12 13 6 7 N3|N6 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 8 N4|N2 4 5 14 15 11 10 9 8 12 13 6 7 "
                "3 2 1 0 N0|N0 1 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 N2|N6 13 12 4 5 1 0 2 3 7 6 14 15 11 10 8 9 N4|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 "
                "N7|N2 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0 1 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6",
    .options = "tuning=4,ll128Enabled=1,baseBw=161.4",
};
// Topology table: two-socket box (GPUs on NUMA nodes 1 and 3) with mixed
// link weights (1/2/4); NUMA matching explicitly disabled via options.
static struct scclRomeModel rome_model_66 = {
    .nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 3,
    .gpuIds = { 0x29000, 0x2c000, 0x2f000, 0x32000, 0xad000, 0xb0000, 0xb3000, 0xb6000, },
    .nicIds = {},
    .gpuNuma = { 1, 1, 1, 1, 3, 3, 3, 3, },
    .nicNuma = {},
    // 8x8 symmetric adjacency, row-major; entry value = link count.
    .connMatrix = {
        0, 4, 0, 0, 2, 0, 1, 0,
        4, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 4, 1, 0, 2, 0,
        0, 1, 4, 0, 0, 1, 0, 0,
        2, 0, 1, 0, 0, 4, 0, 0,
        0, 0, 0, 1, 4, 0, 0, 1,
        1, 0, 2, 0, 0, 0, 0, 4,
        0, 1, 0, 0, 0, 1, 4, 0,
    },
    .gdrLevel = {},
    .pattern = "4040",
    .ringBase = "0 6 7 5 4 2 3 1|1 3 2 4 5 7 6 0|0 1 7 6 2 3 5 4|4 5 3 2 6 7 1 0",
    .options = "disableNumaMatching=1,tuning=2",
};
// Topology table: rome_model_66 hardware plus 4 NICs (two per socket);
// every NIC is PXB-close to its socket-local GPU quad.
static struct scclRomeModel rome_model_67 = {
    .nGpus = 8, .nCpus = 2, .nNics = 4, .nLinks = 3,
    .gpuIds = { 0x29000, 0x2c000, 0x2f000, 0x32000, 0xad000, 0xb0000, 0xb3000, 0xb6000, },
    .nicIds = { 0x1d000, 0x1e000, 0xa1000, 0xa2000, },
    .gpuNuma = { 1, 1, 1, 1, 3, 3, 3, 3, },
    .nicNuma = { 1, 1, 3, 3, },
    // 8x8 symmetric adjacency, row-major; entry value = link count.
    .connMatrix = {
        0, 4, 0, 0, 2, 0, 1, 0,
        4, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 4, 1, 0, 2, 0,
        0, 1, 4, 0, 0, 1, 0, 0,
        2, 0, 1, 0, 0, 4, 0, 0,
        0, 0, 0, 1, 4, 0, 0, 1,
        1, 0, 2, 0, 0, 0, 0, 4,
        0, 1, 0, 0, 0, 1, 4, 0,
    },
    // One 8-entry row per NIC (NIC0..NIC3).
    .gdrLevel = {
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
    },
    .pattern = "4242",
    // Adjacent literals concatenate into one '|'-separated ring list.
    .ringBase = "N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N1 2 3 5 4 0 1 7 6 N3|N2 4 5 3 2 6 7 1 0 N0|N1 3 2 4 5 7 6 0 1 N0|N0 1 0 6 7 5 4 2 3 N1|N0 0 1 7 "
                "6 2 3 5 4 N2|N3 6 7 1 0 4 5 3 2 N1",
    .options = "disableNumaMatching=1,tuning=2",
};
// Topology table: 16 GPUs and 16 NICs, everything on NUMA node 1. The GPUs
// form four fully-connected quads ({0..3}, {4..7}, {8..11}, {12..15}).
// gdrLevel is 16x16 (one row per NIC): PATH_PIX on the NIC's own GPU,
// PATH_PXB to the other GPUs of the same 8-GPU half, PATH_PHB across halves.
// Each ring in ringBase passes through several NICs ("Nk" tokens).
static struct scclRomeModel rome_model_68 = {
    .nGpus = 16, .nCpus = 1, .nNics = 16, .nLinks = 3,
    .gpuIds = {
        0xcf000, 0xd4000, 0xd5000, 0xd6000, 0xd0000, 0xd1000, 0xd2000, 0xd3000,
        0xf0000, 0xf1000, 0xf2000, 0xf3000, 0xf4000, 0xf5000, 0xf6000, 0xf7000,
    },
    .nicIds = {
        0xcd000, 0xc8000, 0xc9000, 0xcb000, 0xcc000, 0xce000, 0xc7000, 0xca000,
        0xe8000, 0xe9000, 0xea000, 0xeb000, 0xec000, 0xed000, 0xee000, 0xef000,
    },
    .gpuNuma = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
    .nicNuma = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
    // 16x16 symmetric adjacency, row-major: four fully-connected quads.
    .connMatrix = {
        0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    },
    // One 16-entry row per NIC (NIC0..NIC15).
    .gdrLevel = {
        PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX, PATH_PXB,
        PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
        PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PXB, PATH_PIX,
    },
    .pattern = "@@",
    // Adjacent literals concatenate; rings traverse NICs and GPUs alternately.
    .ringBase = "N0 0 1 2 3 N3 N4 4 5 6 7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 "
                "N2 N5 5 7 4 6 N6 N9 9 11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1|N0 0 1 2 3 N3 N4 4 5 6 "
                "7 N7 N8 8 9 10 11 N11 N12 12 13 14 15 N15|N15 15 14 13 12 N12 N11 11 10 9 8 N8 N7 7 6 5 4 N4 N3 3 2 1 0 N0|N1 1 3 0 2 N2 N5 5 7 4 6 N6 N9 9 "
                "11 8 10 N10 N13 13 15 12 14 N14|N14 14 12 15 13 N13 N10 10 8 11 9 N9 N6 6 4 7 5 N5 N2 2 0 3 1 N1",
    .options = "",
};
// Reference "Rome" topology template #71: 8 GPUs on 2 NUMA nodes, no NICs,
// up to 3 XGMI links per GPU. Live systems are reduced to this shape by
// parseRomeSystem and matched against the table by parseRome4P2H.
static struct scclRomeModel rome_model_71 = {
    .nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 3,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0x32000, 0x35000, 0x11000, 0x14000, 0xae000, 0xb3000, 0x8e000, 0x93000, },
    .nicIds = {},
    // NUMA node of each GPU, same order as gpuIds.
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
    .nicNuma = {},
    // connMatrix[i*nGpus+j]: XGMI link weight between GPUs i and j
    // (link bw / per-link speed, see parseRomeSystem); 0 = no direct link.
    .connMatrix = { 0, 4, 1, 0, 0, 0, 2, 0,
                    4, 0, 0, 1, 0, 1, 0, 0,
                    1, 0, 0, 4, 2, 0, 0, 0,
                    0, 1, 4, 0, 0, 0, 0, 1,
                    0, 0, 2, 0, 0, 4, 1, 0,
                    0, 1, 0, 0, 4, 0, 0, 1,
                    2, 0, 0, 0, 1, 0, 0, 4,
                    0, 0, 0, 1, 0, 1, 4, 0, },
    .gdrLevel = {},
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "4040",
    // '|'-separated rings in the format consumed by parseGraph.
    .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 2 3 7 6|6 7 3 2 4 5 1 0",
    .options = "disableNumaMatching=1,tuning=2",
};
// Reference "Rome" topology template #72: same GPU fabric as rome_model_71
// plus 4 NICs (2 per NUMA node). Matched by parseRome4P2H.
static struct scclRomeModel rome_model_72 = {
    .nGpus = 8, .nCpus = 2, .nNics = 4, .nLinks = 3,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0x32000, 0x35000, 0x11000, 0x14000, 0xae000, 0xb3000, 0x8e000, 0x93000, },
    // PCI bus ids of the reference NICs.
    .nicIds = { 0x1d000, 0x1e000, 0xa0000, 0xa1000, },
    // NUMA node of each GPU / NIC, same order as the id arrays.
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
    .nicNuma = { 0, 0, 1, 1, },
    // connMatrix[i*nGpus+j]: XGMI link weight between GPUs i and j; 0 = none.
    .connMatrix = { 0, 4, 1, 0, 0, 0, 2, 0,
                    4, 0, 0, 1, 0, 1, 0, 0,
                    1, 0, 0, 4, 2, 0, 0, 0,
                    0, 1, 4, 0, 0, 0, 0, 1,
                    0, 0, 2, 0, 0, 4, 1, 0,
                    0, 1, 0, 0, 4, 0, 0, 1,
                    2, 0, 0, 0, 1, 0, 0, 4,
                    0, 0, 0, 1, 0, 1, 4, 0, },
    // gdrLevel[i*nGpus+j]: PATH_* type from NIC i to GPU j (see parseRomeSystem).
    .gdrLevel = { PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, },
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "4242",
    // '|'-separated rings (N-prefixed entries are NIC endpoints), see parseGraph.
    .ringBase = "N0 0 1 3 2 4 5 7 6 N3|N1 2 3 1 0 6 7 5 4 N2|N3 7 6 0 1 5 4 2 3 N1|N0 1 0 6 7 3 2 4 5 N2|N2 4 5 7 6 0 1 3 2 N1|N3 6 7 5 4 2 3 1 0 N0|N2 5 4 2 "
                "3 7 6 0 1 N0|N1 3 2 4 5 1 0 6 7 N3",
    .options = "disableNumaMatching=1,tuning=2",
};
// Reference "Rome" topology template #73: 8 GPUs spread 2-per-NUMA across
// 4 NUMA nodes, no NICs, up to 3 XGMI links per GPU. Matched by parseRome4P2H.
static struct scclRomeModel rome_model_73 = {
    .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
    .nicIds = {},
    // NUMA node of each GPU, same order as gpuIds.
    .gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, },
    .nicNuma = {},
    // connMatrix[i*nGpus+j]: XGMI link weight between GPUs i and j; 0 = none.
    .connMatrix = { 0, 4, 1, 0, 0, 0, 2, 0,
                    4, 0, 0, 1, 0, 1, 0, 0,
                    1, 0, 0, 4, 2, 0, 0, 0,
                    0, 1, 4, 0, 0, 0, 0, 1,
                    0, 0, 2, 0, 0, 4, 1, 0,
                    0, 1, 0, 0, 4, 0, 0, 1,
                    2, 0, 0, 0, 1, 0, 0, 4,
                    0, 0, 0, 1, 0, 1, 4, 0, },
    .gdrLevel = {},
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "20202020",
    // '|'-separated rings in the format consumed by parseGraph.
    .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
    .options = "",
};
// Reference "Rome" topology template #74: same GPU fabric as rome_model_73
// plus one NIC per NUMA node. Matched by parseRome4P2H.
static struct scclRomeModel rome_model_74 = {
    .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
    // PCI bus ids of the reference NICs.
    .nicIds = { 0xc5000, 0xcd000, 0xd5000, 0xdd000, },
    // NUMA node of each GPU / NIC, same order as the id arrays.
    .gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, },
    .nicNuma = { 3, 1, 0, 2, },
    // connMatrix[i*nGpus+j]: XGMI link weight between GPUs i and j; 0 = none.
    .connMatrix = { 0, 4, 1, 0, 0, 0, 2, 0,
                    4, 0, 0, 1, 0, 1, 0, 0,
                    1, 0, 0, 4, 2, 0, 0, 0,
                    0, 1, 4, 0, 0, 0, 0, 1,
                    0, 0, 2, 0, 0, 4, 1, 0,
                    0, 1, 0, 0, 4, 0, 0, 1,
                    2, 0, 0, 0, 1, 0, 0, 4,
                    0, 0, 0, 1, 0, 1, 4, 0, },
    // gdrLevel[i*nGpus+j]: PATH_* type from NIC i to GPU j (see parseRomeSystem).
    .gdrLevel = { PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, },
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "21212121",
    // '|'-separated rings (N-prefixed entries are NIC endpoints), see parseGraph.
    .ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 "
                "1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
    .options = "tuning=3",
};
// Reference "Rome" topology template #76: 8 GPUs and 8 NICs, 4 of each per
// NUMA node (nodes 1 and 3), up to 3 XGMI links per GPU. Matched by parseRome4P2H.
static struct scclRomeModel rome_model_76 = {
    .nGpus = 8, .nCpus = 2, .nNics = 8, .nLinks = 3,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0x32000, 0x35000, 0x11000, 0x14000, 0xae000, 0xb3000, 0x8e000, 0x93000, },
    // PCI bus ids of the reference NICs.
    .nicIds = { 0x26000, 0x2d000, 0x5000, 0xc000, 0xab000, 0xb4000, 0x8b000, 0x94000, },
    // NUMA node of each GPU / NIC, same order as the id arrays.
    .gpuNuma = { 1, 1, 1, 1, 3, 3, 3, 3, },
    .nicNuma = { 1, 1, 1, 1, 3, 3, 3, 3, },
    // connMatrix[i*nGpus+j]: XGMI link weight between GPUs i and j; 0 = none.
    .connMatrix = { 0, 4, 1, 0, 0, 0, 2, 0,
                    4, 0, 0, 1, 0, 1, 0, 0,
                    1, 0, 0, 4, 2, 0, 0, 0,
                    0, 1, 4, 0, 0, 0, 0, 1,
                    0, 0, 2, 0, 0, 4, 1, 0,
                    0, 1, 0, 0, 4, 0, 0, 1,
                    2, 0, 0, 0, 1, 0, 0, 4,
                    0, 0, 0, 1, 0, 1, 4, 0, },
    // gdrLevel[i*nGpus+j]: PATH_* type from NIC i to GPU j (see parseRomeSystem).
    .gdrLevel = { PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PXB, PATH_PHB, PATH_PHB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PXB, },
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "4444",
    // '|'-separated rings (N-prefixed entries are NIC endpoints), see parseGraph.
    .ringBase = "N0 0 1 3 2 4 5 7 6 N6|N2 2 3 1 0 6 7 5 4 N4|N5 5 4 2 3 7 6 0 1 N1|N1 1 0 6 7 3 2 4 5 N5|N4 4 5 7 6 0 1 3 2 N2|N2 2 3 1 0 6 7 5 4 N4|N0 0 1 5 "
                "4 2 3 7 6 N6|N3 3 2 4 5 1 0 6 7 N7|N4 4 5 7 6 0 1 3 2 N2|N6 6 7 5 4 2 3 1 0 N0|N7 7 6 0 1 5 4 2 3 N3|N6 6 7 3 2 4 5 1 0 N0|N3 3 2 0 1 5 4 6 7 "
                "N7|N1 1 0 2 3 7 6 4 5 N5|N5 5 4 6 7 3 2 0 1 N1|N7 7 6 4 5 1 0 2 3 N3",
    .options = "disableNumaMatching=1,tuning=3",
};
// Reference topology template #79: 8 GPUs on 2 NUMA nodes, fully connected
// (every GPU pair has one direct link, 7 links per GPU), no NICs.
// Matched by parseRome4P2H; "noCpuCheck=1" skips the Rome-CPU requirement.
static struct scclRomeModel rome_model_79 = {
    .nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 7,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0x1d000, 0x2e000, 0x3f000, 0x61000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
    .nicIds = {},
    // NUMA node of each GPU, same order as gpuIds.
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
    .nicNuma = {},
    // connMatrix[i*nGpus+j]: fully connected — one link between every pair.
    .connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1,
                    1, 0, 1, 1, 1, 1, 1, 1,
                    1, 1, 0, 1, 1, 1, 1, 1,
                    1, 1, 1, 0, 1, 1, 1, 1,
                    1, 1, 1, 1, 0, 1, 1, 1,
                    1, 1, 1, 1, 1, 0, 1, 1,
                    1, 1, 1, 1, 1, 1, 0, 1,
                    1, 1, 1, 1, 1, 1, 1, 0, },
    .gdrLevel = {},
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "4040",
    // '|'-separated rings in the format consumed by parseGraph.
    .ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 "
                "0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
    .options = "noCpuCheck=1,mscclEnabled=1",
};
// Reference topology template #80: 4 GPUs and 4 NICs, one of each per NUMA
// node, fully connected with 2 links per GPU pair. Matched by parseRome4P2H.
static struct scclRomeModel rome_model_80 = {
    .nGpus = 4, .nCpus = 4, .nNics = 4, .nLinks = 3,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0x82000, 0xc2000, 0x2000, 0x42000, },
    // PCI bus ids of the reference NICs.
    .nicIds = { 0x81000, 0xc1000, 0x1000, 0x41000, },
    // NUMA node of each GPU / NIC, same order as the id arrays.
    .gpuNuma = { 2, 3, 0, 1, },
    .nicNuma = { 2, 3, 0, 1, },
    // connMatrix[i*nGpus+j]: two links between every GPU pair.
    .connMatrix = { 0, 2, 2, 2,
                    2, 0, 2, 2,
                    2, 2, 0, 2,
                    2, 2, 2, 0, },
    // gdrLevel[i*nGpus+j]: PATH_* type from NIC i to GPU j (see parseRomeSystem).
    .gdrLevel = { PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, },
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "11111111",
    // '|'-separated rings (N-prefixed entries are NIC endpoints), see parseGraph.
    .ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 "
                "N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
    .options = "",
};
// Reference topology template #81: 8 fully-connected GPUs (7 links each) and
// 8 NICs, 4 of each per NUMA node; each NIC sits closest (PATH_PXB) to exactly
// one GPU. Matched by parseRome4P2H; "noCpuCheck=1" skips the Rome-CPU test.
static struct scclRomeModel rome_model_81 = {
    .nGpus = 8, .nCpus = 2, .nNics = 8, .nLinks = 7,
    // PCI bus ids of the reference GPUs (order follows sorted HIP device id).
    .gpuIds = { 0xc000, 0x22000, 0x38000, 0x5c000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
    // PCI bus ids of the reference NICs.
    .nicIds = { 0x7000, 0x1d000, 0x33000, 0x57000, 0x9a000, 0xaa000, 0xba000, 0xda000, },
    // NUMA node of each GPU / NIC, same order as the id arrays.
    .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
    .nicNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
    // connMatrix[i*nGpus+j]: fully connected — one link between every pair.
    .connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1,
                    1, 0, 1, 1, 1, 1, 1, 1,
                    1, 1, 0, 1, 1, 1, 1, 1,
                    1, 1, 1, 0, 1, 1, 1, 1,
                    1, 1, 1, 1, 0, 1, 1, 1,
                    1, 1, 1, 1, 1, 0, 1, 1,
                    1, 1, 1, 1, 1, 1, 0, 1,
                    1, 1, 1, 1, 1, 1, 1, 0, },
    // gdrLevel[i*nGpus+j]: PATH_* type from NIC i to GPU j — PXB on the
    // diagonal (paired NIC/GPU), PHB within the NUMA node, SYS across nodes.
    .gdrLevel = { PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB,
                  PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, },
    // Screening key: two digits per NUMA node — GPU count then NIC count.
    .pattern = "4444",
    // '|'-separated rings (N-prefixed entries are NIC endpoints), see parseGraph.
    .ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 "
                "4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5|N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 7 1 6 4 N4|N3 3 6 1 5 2 7 4 0 N0|N4 4 7 0 6 5 1 3 2 "
                "N2|N5 5 4 6 3 0 7 2 1 N1|N6 6 2 0 4 1 7 5 3 N3|N7 7 3 1 4 2 6 0 5 N5",
    .options = "noCpuCheck=1,mscclEnabled=1",
};
// Registry of every reference topology above; parseRome4P2H scans this array
// in order and uses the first model that matches the live system.
static struct scclRomeModel romeTopoModels[] = {
    rome_model_22, rome_model_25, rome_model_27, rome_model_29, rome_model_31, rome_model_33,
    rome_model_30, rome_model_32, rome_model_24, rome_model_26, rome_model_23, rome_model_38,
    rome_model_28, rome_model_40, rome_model_42, rome_model_44, rome_model_45, rome_model_46,
    rome_model_48, rome_model_49, rome_model_52, rome_model_53, rome_model_43, rome_model_55,
    rome_model_56, rome_model_58, rome_model_59, rome_model_62, rome_model_63, rome_model_65,
    rome_model_66, rome_model_67, rome_model_68, rome_model_71, rome_model_72, rome_model_73,
    rome_model_74, rome_model_76, rome_model_79, rome_model_80, rome_model_81,
};
/* Parse user defined rings. Format is like :
 * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
 * Network interfaces can be optionally specified by N prefix.
 * Rings with a non-matching number of gpus are ignored so we can provide
 * rings for multiple cases.
 */
// Fills graph->intra (GPU ranks per channel), graph->inter (the two NIC
// endpoints per channel) and, when NICs are listed explicitly, the per-GPU
// NIC table graph->intraNets. gpu_map / net_map optionally remap the device
// numbers found in `str` before translation to ranks / NIC indices.
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map)
{
    int gpus[SCCL_TOPO_MAX_NODES];  // GPU numbers collected for the current channel
    int nChannels = 0;
    int gpu = 0;                    // GPUs parsed so far in the current channel
    int offset = 0;
    int status = 0;                 // 0 : between numbers, 1 : inside number, 2: start NET, 3: inside NET
    // NIC numbers are stored biased by 'N' so that "not set" can be told apart
    // from NIC 0 later (nets[i] - 'N' recovers the NIC number).
    int nets[SCCL_TOPO_MAX_NODES * 2];
    int net_offset = 0, net_count = 0;
    int ngpus = system->nodes[GPU].count;
    int nnets = system->nodes[NET].count;
    do {
        if (str[offset] == 'N') {
            // 'N' begins a NIC token, but only when we are between numbers.
            if (status == 0) {
                status = 2;
            }
        } else {
            int digit = str[offset] - '0';
            if (digit >= 0 && digit <= 9) {
                switch (status) {
                case 0:
                    gpus[gpu] = digit;
                    status = 1;
                    break;
                case 1:
                    // Accumulate multi-digit GPU numbers.
                    gpus[gpu] = gpus[gpu] * 10 + digit;
                    break;
                case 2:
                    nets[net_offset] = digit + 'N';
                    status = 3;
                    break;
                case 3:
                    // Accumulate multi-digit NIC numbers (still 'N'-biased).
                    nets[net_offset] = (nets[net_offset] - 'N') * 10 + digit + 'N';
                    break;
                }
            } else {
                // Any other character terminates the current token.
                if (status == 1) {
                    gpu++;
                    // NICs are paired around each GPU: slot 2*g-1 precedes GPU g,
                    // slot 2*g follows it.
                    net_offset = 2 * gpu - 1;
                    if (gpu > SCCL_TOPO_MAX_NODES) goto end;
                } else if (status == 2 || status == 3) {
                    net_offset++;
                    net_count++;
                    if (net_offset > ngpus * 2) goto end;
                }
                status = 0;
                if (str[offset] == '|' || str[offset] == '\0') {
                    // End of one channel description.
                    // Ignore if ngpus doesn't match
                    if (gpu != ngpus) goto newchannel;
                    // Ignore if net_count is not 0 or odd number
                    if (net_count && net_count % 2) goto newchannel;
                    for (int r = 0; r < ngpus; r++) {
                        int g = gpus[r];
                        // Ignore if gpus are out of bounds
                        if (g < 0 || g >= ngpus) goto newchannel;
                        // Ignore if gpus are duplicate
                        for (int i = 0; i < r; i++)
                            if (gpus[i] == g) goto newchannel;
                        // remap if needed
                        if (gpu_map) g = gpu_map[g];
                        // Translate gpu numbers into ranks
                        int j = 0;
                        for (j = 0; j < ngpus; j++)
                            if (g == system->nodes[GPU].nodes[j].gpu.dev) break;
                        if (j < ngpus)
                            graph->intra[nChannels * ngpus + r] = system->nodes[GPU].nodes[j].gpu.rank;
                        else
                            return scclInternalError;
                    }
                    if (net_count) {
                        // Remap explicit NIC numbers, then record the per-GPU NIC
                        // table and the channel's two NIC endpoints.
                        for (int i = 0; net_map && i < ngpus * 2; i++) {
                            if (nets[i] - 'N' < 0 || nets[i] - 'N' >= nnets) continue;
                            nets[i] = net_map[nets[i] - 'N'] + 'N';
                        }
                        memcpy(&graph->intraNets[ngpus * nChannels * 2], nets, ngpus * 2 * sizeof(int));
                        graph->nIntraChannels++;
                        if (nets[0] - 'N' >= nnets || nets[ngpus * 2 - 1] - 'N' >= nnets) goto newchannel;
                        graph->inter[nChannels * 2] = nets[0] - 'N';
                        graph->inter[nChannels * 2 + 1] = nets[ngpus * 2 - 1] - 'N';
                    } else if (nnets) {
                        // No explicit NICs: assign endpoints round-robin.
                        graph->inter[nChannels * 2] = system->nodes[NET].nodes[nChannels % nnets].id;
                        graph->inter[nChannels * 2 + 1] = system->nodes[NET].nodes[(nChannels + 1) % nnets].id;
                    }
                    nChannels++;
newchannel:
                    gpu = 0;
                    net_offset = 0;
                    net_count = 0;
                }
            }
        }
    } while (str[offset++] != 0);
end:
    graph->nChannels = nChannels;
    // NOTE(review): if no channel matched, nChannels is 0 and this divides by
    // zero — confirm callers always provide at least one matching ring.
    graph->bwIntra = graph->bwInter = system->totalBw / nChannels;
    if (graph->id == 1) {
        // For this graph id, re-derive the second NIC endpoint from the
        // channel's second GPU via scclTopoGetLocalNet.
        for (int i = 0; i < graph->nChannels; i++) {
            int net;
            scclTopoGetLocalNet(system, graph->intra[i * ngpus + 1], i, &net);
            graph->inter[i * 2 + 1] = net;
        }
    }
#if 0
    for (int i=0; i<graph->nChannels; i++) {
        printf("%d: ", i);
        printf ("NET/%d ", graph->inter[i*2]);
        for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
        printf ("NET/%d ", graph->inter[i*2+1]);
        printf("\n");
    }
#endif
    return scclSuccess;
}
/* Parse user defined treeBase for complicated trees. Format is like :
 * "(4(2(3)(1))(6(5)))"
 *
 * Rings with a non-matching number of gpus are ignored so we can provide
 * rings for multiple cases.
 */
// Copies each '|'-separated tree expression from `str` into
// graph->treeBase[x], preserving the parentheses but rewriting every GPU
// number through gpu_map (when given). Returns scclInternalError when a GPU
// number is not found among the system's GPU devices.
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map)
{
    int gpus[SCCL_TOPO_MAX_NODES];
    // transcribe/change according to gpu_map
    int nChannels = 0;  // NOTE(review): never used below — kept for parity with parseGraph.
    int gpu = 0;        // GPU numbers collected for the current expression
    int offset = 0;
    int start_offset = offset;  // second cursor used to re-walk the expression when emitting
    if (str[0] == 0) {
        graph->treeBase[0][0] = 0;
        return scclSuccess;
    }
    int status = 0;  // 0 : between numbers, 1 : inside number
    int ngpus = system->nodes[GPU].count;
    int x = 0, y = 0;  // NOTE(review): this outer y is shadowed by the inner y below.
    do {
        int digit = str[offset] - '0';
        if (digit >= 0 && digit <= 9) {
            switch (status) {
            case 0:
                gpus[gpu] = digit;
                status = 1;
                break;
            case 1:
                // Accumulate multi-digit GPU numbers.
                gpus[gpu] = gpus[gpu] * 10 + digit;
                break;
            }
        } else {
            if (status == 1) {
                gpu++;
            }
            status = 0;
            if (str[offset] == '|' || str[offset] == 0) {
                // End of one tree expression: re-walk it from start_offset and
                // emit it into graph->treeBase[x] with remapped GPU numbers.
                int r = 0, y = 0;
                while (start_offset < offset) {
                    // for (int r=0; r<gpu; r++) {
                    if (str[start_offset] == '(' || str[start_offset] == ')') {
                        // Structural characters are copied through unchanged.
                        graph->treeBase[x][y] = str[start_offset];
                        y++;
                        start_offset++;
                    } else {
                        int g = gpus[r];
                        // remap if needed
                        if (gpu_map) g = gpu_map[g];
                        r++;
                        int j = 0;
                        // Translate gpu numbers into ranks
                        for (j = 0; j < ngpus; j++)
                            if (g == system->nodes[GPU].nodes[j].gpu.dev) break;
                        if (j < ngpus) {
                            // Skip past the original number, then emit the
                            // (remapped) number's decimal digits.
                            while (str[start_offset] != '(' && str[start_offset] != ')') start_offset++;
                            char number_str[10];
                            sprintf(number_str, "%d", g);
                            int k = 0;
                            while (number_str[k] != 0) {
                                graph->treeBase[x][y] = number_str[k];
                                y++;
                                k++;
                            }
                        } else
                            return scclInternalError;
                    }
                }
                graph->treeBase[x][y] = 0;  // terminate this expression
                x++;
                gpu = 0;
                start_offset = offset + 1;
            }
        }
    } while (str[offset++] != 0);
    graph->treeBase[x][0] = 0;  // empty string terminates the list
    return scclSuccess;
}
#define MAX_OPT_TOKENS 10
extern
const
char
*
topoPathTypeStr
[];
static
void
parseOptions
(
struct
scclTopoSystem
*
system
,
const
char
*
options
)
{
if
(
strcmp
(
options
,
""
))
{
char
*
str_temp
=
(
char
*
)
malloc
(
strlen
(
options
)
+
1
);
strcpy
(
str_temp
,
options
);
char
*
tokens
[
MAX_OPT_TOKENS
];
int
numTokens
=
0
;
char
*
state
;
tokens
[
numTokens
]
=
strtok_r
(
str_temp
,
"=, "
,
&
state
);
numTokens
++
;
while
(
tokens
[
numTokens
-
1
]
!=
NULL
&&
numTokens
<
MAX_OPT_TOKENS
)
tokens
[
numTokens
++
]
=
strtok_r
(
NULL
,
"=, "
,
&
state
);
for
(
int
i
=
0
;
i
<
numTokens
/
2
;
i
++
)
{
if
(
strcmp
(
tokens
[
i
*
2
],
"netGdrLevel"
)
==
0
)
{
int
j
;
for
(
j
=
0
;
j
<=
PATH_SYS
;
j
++
)
{
if
(
strcmp
(
tokens
[
i
*
2
+
1
],
topoPathTypeStr
[
j
])
==
0
)
break
;
}
if
(
j
<=
PATH_SYS
)
system
->
netGdrLevel
=
j
;
else
{
system
->
netGdrLevel
=
-
2
;
WARN
(
"invalid netGdrLevel: %s"
,
tokens
[
i
*
2
+
1
]);
}
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"pivotA2AEnabled"
)
==
0
)
{
system
->
pivotA2AEnabled
=
(
bool
)
atol
(
tokens
[
i
*
2
+
1
]);
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"pivotA2ANumBiRings"
)
==
0
)
{
system
->
pivotA2ANumBiRings
=
atol
(
tokens
[
i
*
2
+
1
]);
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"tuning"
)
==
0
)
{
system
->
tuning
=
atol
(
tokens
[
i
*
2
+
1
]);
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"ll128Enabled"
)
==
0
)
{
system
->
ll128Enabled
=
(
bool
)
atol
(
tokens
[
i
*
2
+
1
]);
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"baseBw"
)
==
0
)
{
system
->
baseBw
=
std
::
stof
(
tokens
[
i
*
2
+
1
]);
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"mscclEnabled"
)
==
0
)
{
system
->
mscclEnabled
=
(
bool
)
atol
(
tokens
[
i
*
2
+
1
]);
}
else
if
(
strcmp
(
tokens
[
i
*
2
],
"treeDefined"
)
==
0
)
{
system
->
treeDefined
=
(
bool
)
atol
(
tokens
[
i
*
2
+
1
]);
}
}
free
(
str_temp
);
}
}
// Returns the boolean value of option `name` inside a comma/space/equals
// separated options string (e.g. "noCpuCheck=1,mscclEnabled=1").
// Returns false when `options` is empty, the option is absent, the option has
// no value, or its value parses to 0 via atol.
static bool checkOption(const char* options, const char* name)
{
    bool result = false;
    if (strcmp(options, "")) {
        char* str_temp = (char*)malloc(strlen(options) + 1);
        if (str_temp == NULL) return false;  // allocation failure: treat as "not set"
        strcpy(str_temp, options);
        // Keep in sync with the MAX_OPT_TOKENS macro used by parseOptions.
        constexpr int kMaxOptTokens = 10;
        char* tokens[kMaxOptTokens];
        int numTokens = 0;
        char* state;
        tokens[numTokens] = strtok_r(str_temp, "=, ", &state);
        numTokens++;
        while (tokens[numTokens - 1] != NULL && numTokens < kMaxOptTokens)
            tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
        for (int i = 0; i < numTokens / 2; i++) {
            // Guard the value token: a trailing key with no value leaves NULL here.
            if (tokens[i * 2] != NULL && tokens[i * 2 + 1] != NULL &&
                strcmp(tokens[i * 2], name) == 0) {
                result = (bool)atol(tokens[i * 2 + 1]);
                break;  // fix: the old code returned here and leaked str_temp
            }
        }
        free(str_temp);
    }
    return result;
}
// Detects the 8-GPU chordal-ring XGMI topology (each GPU linked to all but
// one peer), remaps the device ids onto the canonical reference ring below,
// and installs the resulting channels via parseGraph. Silently returns
// scclSuccess without changes when the system does not match.
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph)
{
    // Canonical rings for the reference chordal-ring 8-GPU layout.
    static const char* ringBase =
        "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
    int id[8], dist[8];
    int i;
    int ngpus = system->nodes[GPU].count;
    if (ngpus != 8) return scclSuccess;
    // validate chordal ring and calculate distance
    for (i = 0; i < ngpus; i++) {
        struct scclTopoNode* node = system->nodes[GPU].nodes + i;
        if (node->paths[GPU] == NULL) continue;
        // Start from the sum of all device ids minus our own; subtracting every
        // XGMI-connected peer leaves exactly the one unconnected peer's id.
        int sum = ngpus * (ngpus - 1) / 2 - node->gpu.dev;
        int count = 0;
        for (int n = 0; n < ngpus; n++) {
            struct scclTopoLink* link;
            for (link = node->links; link->remNode; link++) {
                if (link->remNode->gpu.dev == n) break;
            }
            if (!link->remNode) continue;
            if (link->type != LINK_NVL) continue;
            sum -= system->nodes[GPU].nodes[n].gpu.dev;
            count++;
        }
        // Not a chordal ring: each GPU must connect to exactly ngpus-2 peers
        // and the leftover id must be a valid device id.
        if (count != ngpus - 2 || sum < 0 || sum > ngpus - 1) {
            return scclSuccess;
        }
        dist[i] = sum;  // the one GPU this node is NOT connected to
    }
    // remap GPU ids
    for (i = 0; i < ngpus; i++) id[i] = i;
    for (i = 0; i < ngpus; i++) {
        // In the reference layout GPU i is unconnected to GPU ngpus-1-i;
        // swap entries until the live system matches that convention.
        if (dist[i] == ngpus - 1 - i) continue;
        int j, m, n, temp;
        for (j = i + 1; j < ngpus; j++)
            if (dist[j] == ngpus - 1 - i) break;
        m = dist[i];
        n = dist[j];
        dist[i] = n;
        dist[j] = m;
        temp = id[m];
        id[m] = id[n];
        id[n] = temp;
        temp = dist[m];
        dist[m] = dist[n];
        dist[n] = temp;
    }
    // create chordal ring based on reference and remapped ids
    system->type |= RCCL_TOPO_CR8G;
    SCCLCHECK(parseGraph(ringBase, system, graph, id, NULL));
    if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
        // Multi-node case with NICs: one channel per NIC, each rotated so that
        // the GPU closest to that NIC comes first.
        int *intra, *used;
        graph->nChannels = system->nodes[NET].count;
        SCCLCHECK(scclCalloc(&intra, ngpus));
        SCCLCHECK(scclCalloc(&used, system->nodes[NET].count));
        for (int n = 0; n < system->nodes[NET].count; n++) {
            graph->inter[n * 2] = graph->inter[n * 2 + 1] = n;
            struct scclTopoNode* net = system->nodes[NET].nodes + n;
            struct scclTopoLinkList* paths = net->paths[GPU];
            // find the first unsed GPU that is closest to NIC
            int f, m;
            for (f = 0; f < ngpus; f++) {
                int j = 0;
                for (j = 0; j < n; j++)
                    if (used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
                if (j >= n) break;  // f is the first GPU not used by an earlier channel
            }
            for (int i = 0; i < ngpus; i++) {
                int j = 0;
                for (j = 0; j < n; j++)
                    if (used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
                if (j < n) continue;  // already used by an earlier channel
                if (paths[i].count < paths[f].count) f = i;  // prefer the shortest path to the NIC
            }
            // Locate GPU f inside channel n's ring...
            for (m = 0; m < ngpus; m++)
                if (graph->intra[n * ngpus + m] == system->nodes[GPU].nodes[f].gpu.rank) break;
            used[n] = graph->intra[n * ngpus + m];
            // ...and rotate the ring so it starts at that GPU.
            for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n * ngpus + ((i + m) % ngpus)];
            for (int i = 0; i < ngpus; i++) graph->intra[n * ngpus + i] = intra[i];
        }
        free(used);
        free(intra);
    }
    return scclSuccess;
}
// Reduces the live topology in `system` to the scclRomeModel summary used for
// matching against romeTopoModels: device ids sorted canonically, per-device
// NUMA nodes, the GPU XGMI connection matrix, the NIC-to-GPU path-type matrix,
// and the per-NUMA "pattern" screening string. Optionally dumps the result as
// C source when RCCL_DUMP_ROME_MODEL_FILE is set.
static scclResult_t parseRomeSystem(struct scclTopoSystem* system, struct scclRomeModel* romeTopo, char* pattern)
{
    pattern[0] = 0;
    // pattern will be NULL for invalid topology
    romeTopo->nGpus = system->nodes[GPU].count;
    romeTopo->nCpus = system->nodes[CPU].count;
    romeTopo->nNics = system->nodes[NET].count;
    romeTopo->nLinks = 0;
    // Helper records and qsort comparators for sorting each device class.
    struct scclGpuIdHIP {
        int g;    // index into system->nodes[GPU]
        int dev;  // HIP device id (sort key)
    };
    auto cmpIds = [](const void* g1, const void* g2) {
        struct scclGpuIdHIP* s1 = (struct scclGpuIdHIP*)g1;
        struct scclGpuIdHIP* s2 = (struct scclGpuIdHIP*)g2;
        return s1->dev - s2->dev;
    };
    struct scclCpuNuma {
        int c;          // index into system->nodes[CPU]
        uint64_t numa;  // NUMA node id (sort key)
    };
    auto cmpNuma = [](const void* g1, const void* g2) {
        struct scclCpuNuma* s1 = (struct scclCpuNuma*)g1;
        struct scclCpuNuma* s2 = (struct scclCpuNuma*)g2;
        return (int)(s1->numa - s2->numa);
    };
    struct scclNetId {
        int n;        // index into system->nodes[NET]
        uint64_t id;  // NET node id (sort key)
    };
    auto cmpNets = [](const void* g1, const void* g2) {
        struct scclNetId* s1 = (struct scclNetId*)g1;
        struct scclNetId* s2 = (struct scclNetId*)g2;
        return (int)(s1->id - s2->id);
    };
    // sort GPU devices by HIP device ID
    struct scclGpuIdHIP gpu_scores[SCCL_TOPO_MAX_NODES];
    for (int i = 0; i < romeTopo->nGpus; i++) {
        gpu_scores[i].g = i;
        gpu_scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
    }
    qsort(gpu_scores, romeTopo->nGpus, sizeof(struct scclGpuIdHIP), cmpIds);
    // sort CPU devices by NUMA id
    struct scclCpuNuma cpu_scores[SCCL_TOPO_MAX_NODES];
    for (int i = 0; i < romeTopo->nCpus; i++) {
        cpu_scores[i].c = i;
        cpu_scores[i].numa = system->nodes[CPU].nodes[i].id;
    }
    qsort(cpu_scores, romeTopo->nCpus, sizeof(struct scclCpuNuma), cmpNuma);
    // sort NET devices by id
    struct scclNetId net_scores[SCCL_TOPO_MAX_NODES];
    for (int i = 0; i < romeTopo->nNics; i++) {
        net_scores[i].n = i;
        net_scores[i].id = system->nodes[NET].nodes[i].id;
    }
    qsort(net_scores, romeTopo->nNics, sizeof(struct scclNetId), cmpNets);
    // Per GPU: record its id, its nearest NUMA node (fewest path hops to CPU),
    // and its row of the XGMI connection matrix.
    for (int i = 0; i < romeTopo->nGpus; i++) {
        int gpu, n, m, distance;
        gpu = gpu_scores[i].g;
        romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
        m = 0;
        distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
        for (n = 1; n < romeTopo->nCpus; n++) {
            if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
                distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
                m = n;
            }
        }
        if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;
        struct scclTopoNode* node = system->nodes[GPU].nodes + gpu;
        if (node->paths[GPU] == NULL) continue;
        int count = 0;
        for (n = 0; n < romeTopo->nGpus; n++) {
            romeTopo->connMatrix[i * romeTopo->nGpus + n] = 0;
            struct scclTopoLink* link;
            for (link = node->links; link->remNode; link++) {
                if (link->remNode->gpu.dev == n) break;
            }
            if (!link->remNode) continue;
            if (link->type != LINK_NVL) continue;
            // Normalize the link bandwidth to a per-link count.
            romeTopo->connMatrix[i * romeTopo->nGpus + n] = link->bw / scclTopoXGMISpeed(node->gpu.gcn);
            count++;
        }
        if (romeTopo->nLinks < count) romeTopo->nLinks = count;
    }
    // Per NIC: record its bus id and nearest NUMA node; bail out (leaving the
    // pattern empty = invalid) when none is reachable.
    for (int i = 0; i < romeTopo->nNics; i++) {
        int n, m, distance;
        m = 0;
        int net = net_scores[i].n;
        romeTopo->nicIds[i] = system->nodes[NET].nodes[net].net.busId;
        distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
        for (n = 0; n < romeTopo->nCpus; n++)
            if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
                distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
                m = n;
            }
        if (m < romeTopo->nCpus)
            romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
        else
            return scclSuccess;
    }
    // number of GPUs and NICs on each numa node is used as first screening pattern
    for (int i = 0; i < romeTopo->nCpus; i++) {
        uint64_t id = system->nodes[CPU].nodes[cpu_scores[i].c].id;
        int g = 0, n = 0;
        for (int j = 0; j < romeTopo->nGpus; j++)
            if (romeTopo->gpuNuma[j] == id) g++;
        for (int j = 0; j < romeTopo->nNics; j++)
            if (romeTopo->nicNuma[j] == id) n++;
        pattern[i * 2] = '0' + g;
        pattern[i * 2 + 1] = '0' + n;
    }
    pattern[romeTopo->nCpus * 2] = 0;
    // compute gdr level matrix
    for (int i = 0; i < romeTopo->nNics; i++) {
        int n = net_scores[i].n;
        for (int j = 0; j < romeTopo->nGpus; j++) {
            int g = gpu_scores[j].g;
            romeTopo->gdrLevel[i * romeTopo->nGpus + j] = system->nodes[GPU].nodes[g].paths[NET][n].type;
        }
    }
    // Optional debug aid: emit the summarized model as C source that can be
    // pasted into the romeTopoModels table above.
    const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
    if (romeModelFile) {
        INFO(SCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
        FILE* file = fopen(romeModelFile, "w");
        if (file == NULL) {
            WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
            return scclSuccess;
        }
        fprintf(file, "static struct scclRomeModel rome_model_ = {\n");
        fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n",
                romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
        fprintf(file, " .gpuIds = { ");
        for (int i = 0; i < romeTopo->nGpus; i++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
        fprintf(file, "},\n");
        fprintf(file, " .nicIds = { ");
        for (int i = 0; i < romeTopo->nNics; i++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
        fprintf(file, "},\n");
        fprintf(file, " .gpuNuma = { ");
        for (int i = 0; i < romeTopo->nGpus; i++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
        fprintf(file, "},\n");
        fprintf(file, " .nicNuma = { ");
        for (int i = 0; i < romeTopo->nNics; i++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
        fprintf(file, "},\n");
        fprintf(file, " .connMatrix = { ");
        for (int i = 0; i < romeTopo->nGpus; i++)
            for (int n = 0; n < romeTopo->nGpus; n++)
                fprintf(file, "%d, ", romeTopo->connMatrix[i * romeTopo->nGpus + n]);
        fprintf(file, "},\n");
        fprintf(file, " .gdrLevel = { ");
        for (int i = 0; i < romeTopo->nNics; i++)
            for (int n = 0; n < romeTopo->nGpus; n++)
                fprintf(file, "PATH_%s, ", topoPathTypeStr[romeTopo->gdrLevel[i * romeTopo->nGpus + n]]);
        fprintf(file, "},\n");
        fprintf(file, " .pattern = \"%s\",\n", pattern);
        fprintf(file, " .ringBase = \"\",\n");
        fprintf(file, " .options = \"\",\n");
        fprintf(file, "};\n");
        fclose(file);
    }
    return scclSuccess;
}
// Recursively tries every permutation of GPU indices g[n..last], returning
// true as soon as `topo` matches the reference model `ref` under that mapping:
// same NUMA placement (unless ignore_numa), identical XGMI connection matrix,
// consistent bus-id ordering, and — when `nbio` is set — matching NBIO
// (same-0xf0000-block) pairing. On success the winning permutation is left in
// g[]; *time counts the number of recursive calls made.
static bool permuteGpuIds(int* g, int n, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool nbio, bool ignore_numa)
{
    (*time)++;
    if (n != last) {
        // Place each remaining candidate at position n and recurse.
        for (int k = n; k <= last; k++) {
            std::swap(g[n], g[k]);
            if (permuteGpuIds(g, n + 1, last, ref, topo, time, nbio, ignore_numa)) {
                return true;  // keep the successful arrangement in g[]
            }
            std::swap(g[n], g[k]);
        }
        return false;
    }

    // A complete permutation: validate it against the reference model.
    // 1) NUMA placement of every GPU must agree.
    if (!ignore_numa) {
        for (int a = 0; a < ref->nGpus; a++) {
            if (ref->gpuNuma[a] != topo->gpuNuma[g[a]]) return false;
        }
    }
    // 2) XGMI connectivity must be identical, and bus-id order must be
    //    consistent (the id-difference products must not have opposite signs).
    for (int a = 0; a < ref->nGpus; a++) {
        for (int b = 0; b < ref->nGpus; b++) {
            if (ref->connMatrix[a * ref->nGpus + b] != topo->connMatrix[g[a] * ref->nGpus + g[b]])
                return false;
            if ((ref->gpuIds[a] - ref->gpuIds[b]) * (topo->gpuIds[g[a]] - topo->gpuIds[g[b]]) < 0)
                return false;
        }
    }
    // 3) Optionally, NBIO pairing: GPUs sharing the same 0xf0000 bus-id block
    //    in the reference must share one in the candidate too, with consistent
    //    ordering inside each pair.
    if (nbio) {
        for (int a = 0; a < ref->nGpus; a++) {
            for (int b = 0; b < ref->nGpus; b++) {
                if (a == b) continue;
                bool nbio_ref = (ref->gpuIds[a] & 0xf0000) == (ref->gpuIds[b] & 0xf0000);
                bool nbio_topo = (topo->gpuIds[g[a]] & 0xf0000) == (topo->gpuIds[g[b]] & 0xf0000);
                if (nbio_ref != nbio_topo) return false;
                if (nbio_ref &&
                    ((ref->gpuIds[a] - ref->gpuIds[b]) * (topo->gpuIds[g[a]] - topo->gpuIds[g[b]]) < 0))
                    return false;
            }
        }
    }
    return true;
}
// Recursively tries every permutation of NIC indices n[s..last], returning
// true when `topo`'s NICs match the reference model `ref` under the already
// chosen GPU mapping g[]: same NUMA placement (unless ignore_numa) and an
// identical NIC-to-GPU path-type (GDR level) matrix. On success the winning
// permutation is left in n[]; *time counts the number of recursive calls.
static bool permuteNetIds(int* n, int* g, int s, int last, struct scclRomeModel* ref, struct scclRomeModel* topo, int* time, bool ignore_numa)
{
    (*time)++;
    if (s != last) {
        // Place each remaining candidate NIC at position s and recurse.
        for (int k = s; k <= last; k++) {
            std::swap(n[s], n[k]);
            if (permuteNetIds(n, g, s + 1, last, ref, topo, time, ignore_numa)) {
                return true;  // keep the successful arrangement in n[]
            }
            std::swap(n[s], n[k]);
        }
        return false;
    }

    // A complete permutation: validate it against the reference model.
    if (!ignore_numa) {
        for (int a = 0; a < ref->nNics; a++) {
            if (ref->nicNuma[a] != topo->nicNuma[n[a]]) return false;
        }
    }
    for (int a = 0; a < ref->nNics; a++) {
        for (int b = 0; b < ref->nGpus; b++) {
            if (ref->gdrLevel[a * ref->nGpus + b] != topo->gdrLevel[n[a] * ref->nGpus + g[b]])
                return false;
        }
    }
    return true;
}
// Try to recognize the node as a Rome "4P2H" topology (<= 8 GPUs) by matching it
// against the static romeTopoModels[] table, then instantiate the matching
// pre-tuned ring (and optional tree) graph. Always returns scclSuccess; a
// non-match simply leaves `graph` untouched.
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
  static char ringRemap[64]; // NOTE(review): unused in this function — likely vestigial
  int i;
  int ngpus = system->nodes[GPU].count;
  int ncpus = system->nodes[CPU].count;
  int nnets = system->nodes[NET].count;
  if (ngpus > 8) return scclSuccess; // 4P2H models only cover up to 8 GPUs
  // only valid on Rome
  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
  // number of GPUs and NICs on each numa node is used as first screening pattern
  struct scclRomeModel romeTopo;
  char pattern[256];
  SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
  // recognize system as Rome 4P2H even if no matching model
  if (ngpus > 4 && romeTopo.nLinks) system->type |= RCCL_TOPO_4P2H_ROME;
  int g[SCCL_TOPO_MAX_NODES], n[SCCL_TOPO_MAX_NODES];
  int time = 0; // permutation-step counter for the (commented-out) diagnostics
  struct timeval tvs, tve;
  gettimeofday(&tvs, NULL);
  // check if GPUs are directly connected to CPU (path length <= 2 hops)
  bool match_nbio = true;
  for (i = 0; i < romeTopo.nGpus; i++) {
    int cpu, gpu;
    SCCLCHECK(scclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu));
    SCCLCHECK(scclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu));
    if (system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2) break;
  }
  if (i < romeTopo.nGpus) match_nbio = false;
  // Screen each reference model, then search for a GPU (and NIC) permutation
  // that makes the detected topology identical to it.
  for (i = 0; i < sizeof(romeTopoModels) / sizeof(romeTopoModels[0]); i++) {
    bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
    if (!ignore_cpu && (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME)) continue;
    bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
    if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
    if (romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
    if (!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern)) continue;
    // permute GPU IDs (seed with a rotated identity so the search starts off-identity)
    for (int j = 0; j < ngpus; j++) g[j] = (j + 2) % ngpus;
    if (!permuteGpuIds(g, 0, ngpus - 1, romeTopoModels + i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa)) continue;
    if (nnets > 1) {
      // permute NET IDs
      for (int j = 0; j < nnets; j++) n[j] = (j + 2) % nnets;
      if (permuteNetIds(n, g, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, ignore_numa)) break;
    } else break;
  }
  gettimeofday(&tve, NULL);
  // Elapsed ms; only consumed by the commented-out diagnostics below.
  float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
  if (i >= sizeof(romeTopoModels) / sizeof(romeTopoModels[0])) {
    // printf("No solution in %.2fms (%d iter)\n", t, time);
    return scclSuccess; // no reference model matched — not an error
  }
  char line[1024];
  // sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
  sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
  int offset = strlen(line);
  for (int k = 0; k < ngpus; k++) {
    sprintf(line + offset, "%d ", g[k]);
    offset = strlen(line);
  }
  if (nnets > 1) {
    sprintf(line + offset, "NET mapping: ");
    offset = strlen(line);
    for (int k = 0; k < nnets; k++) {
      sprintf(line + offset, "%d ", n[k]);
      offset = strlen(line);
    }
  }
  INFO(SCCL_GRAPH, "%s", line);
  parseOptions(system, romeTopoModels[i].options);
  // create 4P2H based on reference and remapped ids
  SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL));
  if (romeTopoModels[i].treeBase != nullptr)
    SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
  return scclSuccess;
}
// Try to recognize the node as a 16-GPU / 4-NUMA "1H16P" topology by matching it
// against the romeTopoModels[] table. GPUs are permuted per NUMA node (4! = 24
// orderings each, 24^4 combinations total), the combined permutation is matched
// against the model's XGMI connectivity matrix, and NIC ids are then permuted to
// match gdrLevel. On a match the pre-tuned ring/tree graphs are instantiated.
// Always returns scclSuccess; a non-match leaves `graph` untouched.
//
// Fixes vs. previous revision:
//  - all_gpu_permutations leaked on the "no matching model" early return and on
//    any SCCLCHECK early return after the match; the buffer is now freed on
//    every exit path (the matched mapping is copied to a stack array first).
//  - a NULL malloc result is now handled (detection is skipped, best-effort).
//  - removed the unused `static char ringRemap[256]`.
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUMA_CPUS 4
#define NUMA_GPUS 4
#define NUMA_PERMUTE_COUNT 24
#define TOTAL_PERMUTE_COUNT (NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT * NUMA_PERMUTE_COUNT)
  int i;
  int ngpus = system->nodes[GPU].count;
  int ncpus = system->nodes[CPU].count;
  int nnets = system->nodes[NET].count;
  // only valid on Rome
  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
  if (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME) return scclSuccess;
  // number of GPUs and NICs on each numa node is used as first screening pattern
  struct scclRomeModel romeTopo;
  char pattern[256];
  SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
  // only match for system with 16 GPUs
  if (ngpus != 16 || ncpus != NUMA_CPUS) return scclSuccess;
  int gcnt = 0;
  int* g16 = NULL;                    // points into all_gpu_permutations once a match is found
  int n[SCCL_TOPO_MAX_NODES];         // NIC permutation (filled only when nnets > 1)
  // Scratch table holding every combined GPU permutation (16 ints each).
  int* all_gpu_permutations = (int*)malloc(TOTAL_PERMUTE_COUNT * NUMA_CPUS * NUMA_GPUS * sizeof(int));
  if (all_gpu_permutations == NULL) return scclSuccess; // best-effort detection: skip on OOM
  struct timeval tvs, tve;
  gettimeofday(&tvs, NULL);
  for (i = 0; i < sizeof(romeTopoModels) / sizeof(romeTopoModels[0]); i++) {
    if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus ||
        romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
    if (strcmp(romeTopoModels[i].pattern, pattern)) continue;
    int j, r[ngpus], g[ngpus];
    int numa_gpu_permutations[NUMA_CPUS][NUMA_PERMUTE_COUNT][NUMA_GPUS];
    // permute GPUs for each CPU NUMA node independently
    for (j = 0; j < ncpus; j++) {
      int ngpusPerNuma = 0, cnt = 0, npermute = 0;
      for (int k = 0; k < ngpus; k++) {
        if (romeTopoModels[i].gpuNuma[k] != j) continue;
        r[ngpusPerNuma++] = k;
      }
      if (ngpusPerNuma == 0) continue;
      if (ngpusPerNuma != NUMA_GPUS) break; // model must have exactly 4 GPUs per NUMA node
      gcnt++;
      // init GPU mapping with the detected GPUs on this NUMA node (rotated by 2)
      for (int k = 0; k < ngpus; k++) {
        if (romeTopo.gpuNuma[k] != j) continue;
        g[(2 + cnt++) % ngpusPerNuma] = k;
      }
      // Enumerate all 4! = 24 orderings of this NUMA node's GPUs.
      std::sort(g, g + ngpusPerNuma);
      do {
        for (int m = 0; m < ngpusPerNuma; m++) numa_gpu_permutations[j][npermute][m] = g[m];
        npermute++;
      } while (std::next_permutation(g, g + ngpusPerNuma));
      if (npermute != NUMA_PERMUTE_COUNT) break;
    }
    if (j < ncpus) continue; // some NUMA node did not fit the model
    // Combine per-NUMA permutations into every possible 16-GPU mapping.
    for (int a = 0; a < NUMA_PERMUTE_COUNT; a++) {
      for (int b = 0; b < NUMA_PERMUTE_COUNT; b++) {
        for (int c = 0; c < NUMA_PERMUTE_COUNT; c++) {
          for (int d = 0; d < NUMA_PERMUTE_COUNT; d++) {
            uint64_t offset = ((a * NUMA_PERMUTE_COUNT + b) * NUMA_PERMUTE_COUNT + c) * NUMA_PERMUTE_COUNT + d;
            // offset = (offset+TOTAL_PERMUTE_COUNT/2)%TOTAL_PERMUTE_COUNT;
            offset *= (NUMA_CPUS * NUMA_GPUS);
            memcpy(all_gpu_permutations + offset, &numa_gpu_permutations[0][a][0], NUMA_GPUS * sizeof(int));
            memcpy(all_gpu_permutations + offset + NUMA_GPUS, &numa_gpu_permutations[1][b][0], NUMA_GPUS * sizeof(int));
            memcpy(all_gpu_permutations + offset + NUMA_GPUS * 2, &numa_gpu_permutations[2][c][0], NUMA_GPUS * sizeof(int));
            memcpy(all_gpu_permutations + offset + NUMA_GPUS * 3, &numa_gpu_permutations[3][d][0], NUMA_GPUS * sizeof(int));
          }
        }
      }
    }
    // match all GPUs' XGMI connection matrix against the model
    int p;
    for (p = 0; p < TOTAL_PERMUTE_COUNT; p++) {
      g16 = all_gpu_permutations + p * NUMA_CPUS * NUMA_GPUS;
      int k;
      for (k = 0; k < romeTopoModels[i].nGpus; k++) {
        int m;
        for (m = 0; m < romeTopoModels[i].nGpus; m++) {
          if (romeTopoModels[i].connMatrix[k * romeTopoModels[i].nGpus + m] !=
              romeTopo.connMatrix[g16[k] * romeTopoModels[i].nGpus + g16[m]]) break;
        }
        if (m < romeTopoModels[i].nGpus) break;
      }
      if (k < romeTopoModels[i].nGpus) continue;
      if (nnets > 1) {
        // permute NET IDs so gdrLevel matches too
        int time = 0;
        for (int m = 0; m < nnets; m++) n[m] = (m + 2) % nnets;
        if (permuteNetIds(n, g16, 0, nnets - 1, romeTopoModels + i, &romeTopo, &time, false)) break;
      } else break;
    }
    if (p < TOTAL_PERMUTE_COUNT) break; // full match found
  }
  gettimeofday(&tve, NULL);
  // Elapsed ms; only consumed by the commented-out diagnostics below.
  float t = (tve.tv_sec - tvs.tv_sec) * 1E3 + (tve.tv_usec - tvs.tv_usec) / 1E3;
  if (i >= sizeof(romeTopoModels) / sizeof(romeTopoModels[0])) {
    // printf("No solution in %.2fms\n", t);
    free(all_gpu_permutations); // was leaked here before
    return scclSuccess;
  }
  // Copy the winning mapping out of the scratch buffer so it can be freed now,
  // making the SCCLCHECK early-return paths below leak-free.
  int gmap[NUMA_CPUS * NUMA_GPUS];
  memcpy(gmap, g16, sizeof(gmap));
  free(all_gpu_permutations);
  char line[1024];
  // sprintf(line, "Found matching Rome model index %d in %.2fms with GPU mapping: ", i, t);
  sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
  int offset = strlen(line);
  for (int k = 0; k < ngpus; k++) {
    sprintf(line + offset, "%d ", gmap[k]);
    offset = strlen(line);
  }
  if (nnets > 1) {
    sprintf(line + offset, "NET mapping: ");
    offset = strlen(line);
    for (int k = 0; k < nnets; k++) {
      sprintf(line + offset, "%d ", n[k]);
      offset = strlen(line);
    }
  }
  INFO(SCCL_GRAPH, "%s", line);
  system->type |= RCCL_TOPO_16P1H;
  parseOptions(system, romeTopoModels[i].options);
  // create 16P1H based on reference and remapped ids
  SCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, gmap, nnets > 1 ? n : NULL));
  if (romeTopoModels[i].treeBase != nullptr)
    SCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, gmap));
  return scclSuccess;
}
// Try to recognize the node as a "4H4P" topology: 16 GPUs grouped into 4 hives of
// 4 XGMI-connected GPUs, with one NIC per GPU. GPUs are clustered greedily by the
// connMatrix, NICs are assigned by gdrLevel == 3, and on success the rome_model_68
// reference graph is instantiated with the remapped ids. Always returns
// scclSuccess; any mismatch along the way silently leaves `graph` untouched.
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
#define NUM_HIVES 4
#define HIVE_GPUS 4
  static char ringRemap[256]; // NOTE(review): unused in this function — likely vestigial
  int ngpus = system->nodes[GPU].count;
  int nnets = system->nodes[NET].count;
  // only valid on Rome
  int arch, vendor, model;
  SCCLCHECK(scclTopoCpuType(system, &arch, &vendor, &model));
  if (arch != SCCL_TOPO_CPU_ARCH_X86 || vendor != SCCL_TOPO_CPU_VENDOR_AMD || model != SCCL_TOPO_CPU_TYPE_ROME) return scclSuccess;
  // number of GPUs and NICs on each numa node is used as first screening pattern
  struct scclRomeModel romeTopo;
  char pattern[256];
  SCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));
  // only match for system with 16 GPUs and 16 NICs
  if (ngpus != NUM_HIVES * HIVE_GPUS || nnets != NUM_HIVES * HIVE_GPUS) return scclSuccess;
  int g_hives[ngpus], n_hives[nnets]; // hive-slot -> detected GPU / NIC index
  int ng_hives[NUM_HIVES];            // GPUs placed so far in each hive
  // try to sort GPUs into hives
  for (int i = 0; i < NUM_HIVES; i++) ng_hives[i] = 0;
  for (int i = 0; i < nnets; i++) n_hives[i] = -1;
  for (int i = 0; i < ngpus; i++) g_hives[i] = -1;
  for (int i = 0; i < ngpus; i++) {
    int j, h;
    // Join the first non-empty hive whose seed GPU this GPU is XGMI-connected to.
    for (j = 0; j < NUM_HIVES; j++) {
      if (ng_hives[j]) {
        if (romeTopo.connMatrix[i * ngpus + g_hives[j * HIVE_GPUS]]) {
          g_hives[j * HIVE_GPUS + ng_hives[j]] = i;
          ng_hives[j]++;
          break;
        }
      }
    }
    if (j >= NUM_HIVES) {
      // Not connected to any existing hive: seed a new one.
      for (h = 0; h < NUM_HIVES; h++) {
        if (ng_hives[h] == 0) {
          g_hives[h * HIVE_GPUS] = i;
          ng_hives[h]++;
          break;
        }
      }
      if (h >= NUM_HIVES) return scclSuccess; // more than 4 clusters — not 4H4P
    }
  }
  for (int i = 0; i < NUM_HIVES; i++)
    if (ng_hives[i] != 4) return scclSuccess; // every hive must hold exactly 4 GPUs
  // remap NET ids: NIC i serves the hive slot whose GPU it reaches at gdrLevel 3
  for (int i = 0; i < nnets; i++) {
    int j;
    for (j = 0; j < ngpus; j++) {
      // NOTE(review): gdrLevel row stride is nnets here vs. nGpus elsewhere;
      // equal only because nnets == ngpus == 16 on this path — confirm.
      if (romeTopo.gdrLevel[i * nnets + g_hives[j]] == 3) {
        n_hives[j] = i;
        break;
      }
    }
    if (j >= ngpus) return scclSuccess;
  }
  // validation: every slot must have been assigned
  for (int i = 0; i < nnets; i++)
    if (n_hives[i] == -1) return scclSuccess;
  for (int i = 0; i < ngpus; i++)
    if (g_hives[i] == -1) return scclSuccess;
  char line[1024];
  sprintf(line, "Found matching Rome model 4P4H with GPU mapping: ");
  int offset = strlen(line);
  for (int k = 0; k < ngpus; k++) {
    sprintf(line + offset, "%d ", g_hives[k]);
    offset = strlen(line);
  }
  if (nnets > 1) {
    sprintf(line + offset, "NET mapping: ");
    offset = strlen(line);
    for (int k = 0; k < nnets; k++) {
      sprintf(line + offset, "%d ", n_hives[k]);
      offset = strlen(line);
    }
  }
  INFO(SCCL_GRAPH, "%s", line);
  // Redundant with the early-return above, but kept as-is.
  if (arch == SCCL_TOPO_CPU_ARCH_X86 && vendor == SCCL_TOPO_CPU_VENDOR_AMD && model == SCCL_TOPO_CPU_TYPE_ROME)
    system->type |= RCCL_TOPO_4P2H_ROME;
  parseOptions(system, rome_model_68.options);
  // create 4P4H based on reference and remapped ids
  SCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
  return scclSuccess;
}
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/rome_models.h
deleted
100644 → 0
View file @
d9d23f34
#ifndef SCCL_ROME_MODELS_H_
#define SCCL_ROME_MODELS_H_
namespace sccl {
namespace hardware {
namespace topology {
namespace detect {
// Instantiate a ring graph from a textual base description `str`, remapping GPU
// ids via gpu_map and (optionally) NET ids via net_map (may be NULL).
scclResult_t parseGraph(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map, int* net_map);
// Same as parseGraph but for the tree base description; no NET remapping.
scclResult_t parseGraphLight(const char* str, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* gpu_map);
// Topology detectors: each matches `system` against a known reference layout and,
// on success, fills `graph`. All return scclSuccess even when nothing matches.
scclResult_t parseRome4P2H(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parseChordalRing(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parse1H16P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
scclResult_t parse4H4P(struct scclTopoSystem* system, struct scclTopoGraph* graph);
} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl
#endif
\ No newline at end of file
src/hardware/graph/sccl_bfloat16.h
deleted
100644 → 0
View file @
d9d23f34
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief sccl_bfloat16.h provides struct for sccl_bfloat16 typedef
*/
#ifndef _SCCL_BFLOAT16_H_
#define _SCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of sccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
namespace
sccl
{
typedef
struct
{
uint16_t
data
;
}
sccl_bfloat16
;
}
// namespace sccl
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
namespace
sccl
{
// 16-bit brain floating point (1 sign, 8 exponent, 7 mantissa bits), stored as
// the upper half of an IEEE binary32. Conversions use type-punning unions.
struct sccl_bfloat16 {
    uint16_t data;
    // Tag type selecting the truncating (non-rounding) float constructor.
    enum truncate_t { truncate };
    __host__ __device__ sccl_bfloat16() = default;
    // round upper 16 bits of IEEE float to convert to bfloat16
    explicit __host__ __device__ sccl_bfloat16(float f) : data(float_to_bfloat16(f)) {}
    // Truncating conversion: explicit tag keeps it distinct from the rounding ctor.
    explicit __host__ __device__ sccl_bfloat16(float f, truncate_t) : data(truncate_float_to_bfloat16(f)) {}
    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
    __host__ __device__ operator float() const {
        union {
            uint32_t int32;
            float fp32;
        } u = {uint32_t(data) << 16};
        return u.fp32;
    }
private:
    static __host__ __device__ uint16_t float_to_bfloat16(float f) {
        union {
            float fp32;
            uint32_t int32;
        } u = {f};
        if (~u.int32 & 0x7f800000) {
            // Exponent not all-ones: zero/normal/subnormal. Add 0x7fff plus the
            // bfloat16 mantissa's LSB so the low 16 bits round to nearest, ties
            // to even. Carries propagate naturally: a full mantissa bumps the
            // exponent, and 0xFE/0x7F rounds up to Inf — the next representable
            // value in each case.
            u.int32 += 0x7fff + ((u.int32 >> 16) & 1);
            // Round to nearest, round to even
        } else if (u.int32 & 0xffff) {
            // Exponent all-ones: Inf (zero mantissa) or NaN. If any low mantissa
            // bit is set, force a bit into the kept mantissa so a signaling NaN
            // does not collapse to Inf when the low 16 bits are dropped.
            u.int32 |= 0x10000;
            // Preserve signaling NaN
        }
        return uint16_t(u.int32 >> 16);
    }
    // Truncate instead of rounding, preserving SNaN
    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f) {
        union {
            float fp32;
            uint32_t int32;
        } u = {f};
        // OR-in bit 0 when the input is NaN (all-ones exponent, nonzero low
        // mantissa) so truncation cannot turn a NaN into Inf.
        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
    }
};
// C-compatible mirror of sccl_bfloat16; layout equivalence is enforced below.
typedef struct {
    uint16_t data;
} sccl_bfloat16_public;
static_assert(std::is_standard_layout<sccl_bfloat16>{},
              "sccl_bfloat16 is not a standard layout type, and thus is "
              "incompatible with C.");
static_assert(std::is_trivial<sccl_bfloat16>{},
              "sccl_bfloat16 is not a trivial type, and thus is "
              "incompatible with C.");
static_assert(sizeof(sccl_bfloat16) == sizeof(sccl_bfloat16_public) &&
              offsetof(sccl_bfloat16, data) == offsetof(sccl_bfloat16_public, data),
              "internal sccl_bfloat16 does not match public sccl_bfloat16");
// Stream output prints the float value.
inline std::ostream& operator<<(std::ostream& os, const sccl_bfloat16& bf16) { return os << float(bf16); }
// Unary plus/minus; negation just flips the sign bit.
inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a) { return a; }
inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a) {
    a.data ^= 0x8000;
    return a;
}
// Arithmetic is performed in float and rounded back to bfloat16.
inline __host__ __device__ sccl_bfloat16 operator+(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) + float(b)); }
inline __host__ __device__ sccl_bfloat16 operator-(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) - float(b)); }
inline __host__ __device__ sccl_bfloat16 operator*(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) * float(b)); }
inline __host__ __device__ sccl_bfloat16 operator/(sccl_bfloat16 a, sccl_bfloat16 b) { return sccl_bfloat16(float(a) / float(b)); }
// Comparisons delegate to float; <, == are primary, the rest are derived.
inline __host__ __device__ bool operator<(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) < float(b); }
inline __host__ __device__ bool operator==(sccl_bfloat16 a, sccl_bfloat16 b) { return float(a) == float(b); }
inline __host__ __device__ bool operator>(sccl_bfloat16 a, sccl_bfloat16 b) { return b < a; }
inline __host__ __device__ bool operator<=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a > b); }
inline __host__ __device__ bool operator!=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a == b); }
inline __host__ __device__ bool operator>=(sccl_bfloat16 a, sccl_bfloat16 b) { return !(a < b); }
// Compound assignment in terms of the binary operators above.
inline __host__ __device__ sccl_bfloat16& operator+=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a + b; }
inline __host__ __device__ sccl_bfloat16& operator-=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a - b; }
inline __host__ __device__ sccl_bfloat16& operator*=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a * b; }
inline __host__ __device__ sccl_bfloat16& operator/=(sccl_bfloat16& a, sccl_bfloat16 b) { return a = a / b; }
// Pre/post increment and decrement (by 1.0f).
inline __host__ __device__ sccl_bfloat16& operator++(sccl_bfloat16& a) { return a += sccl_bfloat16(1.0f); }
inline __host__ __device__ sccl_bfloat16& operator--(sccl_bfloat16& a) { return a -= sccl_bfloat16(1.0f); }
inline __host__ __device__ sccl_bfloat16 operator++(sccl_bfloat16& a, int) {
    sccl_bfloat16 orig = a;
    ++a;
    return orig;
}
inline __host__ __device__ sccl_bfloat16 operator--(sccl_bfloat16& a, int) {
    sccl_bfloat16 orig = a;
    --a;
    return orig;
}
// Classification and math helpers for sccl_bfloat16. NOTE(review): this block is
// nested inside namespace sccl (its closing brace precedes the sccl closer), so
// it declares sccl::std::*, not additions to the global ::std — calls resolve
// via unqualified lookup from within sccl. Confirm that is intentional.
namespace std {
// All-ones exponent (0x7f80) with zero mantissa (0x7f) => infinity.
constexpr __host__ __device__ bool isinf(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && !(a.data & 0x7f); }
// All-ones exponent with nonzero mantissa => NaN.
constexpr __host__ __device__ bool isnan(sccl_bfloat16 a) { return !(~a.data & 0x7f80) && +(a.data & 0x7f); }
// Everything but the sign bit clear => +/-0.
constexpr __host__ __device__ bool iszero(sccl_bfloat16 a) { return !(a.data & 0x7fff); }
// Trig via single-precision, rounded back to bfloat16.
inline sccl_bfloat16 sin(sccl_bfloat16 a) { return sccl_bfloat16(sinf(float(a))); }
inline sccl_bfloat16 cos(sccl_bfloat16 a) { return sccl_bfloat16(cosf(float(a))); }
}
// namespace std
}
// namespace sccl
#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
#endif // _SCCL_BFLOAT16_H_
src/hardware/graph/search.cc
deleted
100644 → 0
View file @
d9d23f34
#include "core.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include "rome_models.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
SCCL_PARAM
(
CrossNic
,
"CROSS_NIC"
,
2
);
// Initialize system->maxBw. This is the per-channel (i.e. per-SM)
// max bw.
// Highest per-channel path bandwidth from `gpu` to any reachable node of the
// given type (NET when NICs exist, otherwise GPU). Unreachable targets
// (empty path) are ignored.
static float getMaxBw(struct scclTopoSystem* system, struct scclTopoNode* gpu, int type) {
  float best = 0.0;
  const int nTargets = system->nodes[type].count;
  for (int t = 0; t < nTargets; t++) {
    struct scclTopoLinkList* route = gpu->paths[type] + t;
    float routeBw = route->bw;
    if (route->count == 0) continue; // no route to this target
    best = std::max(best, routeBw);
  }
  return best;
}
// Aggregate local bandwidth of one GPU: the sum of all its NVLink/XGMI link
// bandwidths, or its PCI link bandwidth — whichever is larger.
static float getTotalBw(struct scclTopoSystem* system, struct scclTopoNode* gpu) {
  float nvlSum = 0.0;
  float pciLast = 0.0;
  for (int idx = 0; idx < gpu->nlinks; idx++) {
    struct scclTopoLink* lnk = gpu->links + idx;
    switch (lnk->type) {
      case LINK_NVL:
        nvlSum += lnk->bw; // NVLinks accumulate
        break;
      case LINK_PCI:
        pciLast = lnk->bw; // PCI: last link's bandwidth wins
        break;
      default:
        break;
    }
  }
  return std::max(pciLast, nvlSum);
}
// Initialize system->maxBw (best per-channel path bandwidth over all GPUs, toward
// NET when NICs exist, else toward other GPUs) and system->totalBw (best
// aggregate local bandwidth over all GPUs). A single-GPU, NIC-less system is
// special-cased to LOC_BW.
scclResult_t scclTopoSearchInit(struct scclTopoSystem* system) {
  system->maxBw = 0.0;
  system->totalBw = 0.0;
  int inter = system->nodes[NET].count; // nonzero => rank inter-node paths via NET
  if (inter == 0 && system->nodes[GPU].count == 1) {
    system->maxBw = LOC_BW;
    return scclSuccess;
  }
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    struct scclTopoNode* gpu = system->nodes[GPU].nodes + g;
    system->maxBw = std::max(system->maxBw, getMaxBw(system, gpu, inter ? NET : GPU));
    system->totalBw = std::max(system->totalBw, getTotalBw(system, gpu));
  }
  return scclSuccess;
}
// Locate the link on node2 that points back to node1 and store it in *revLink.
// Returns scclInternalError (after a WARN) when node2 has no such link.
static scclResult_t findRevLink(struct scclTopoNode* node1, struct scclTopoNode* node2, struct scclTopoLink** revLink) {
  const int nLinks = node2->nlinks;
  for (int idx = 0; idx < nLinks; idx++) {
    struct scclTopoLink* candidate = node2->links + idx;
    if (candidate->remNode != node1) continue;
    *revLink = candidate;
    return scclSuccess;
  }
  WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
  return scclInternalError;
}
// This is unfortunately needed since manipulating floats often results in rounding errors.
#define SUB_ROUND(a, b) (a = roundf((a - b) * 1000) / 1000)
// Walk up to maxSteps links of `path` starting at `start`, subtracting `bw` from
// each link's remaining bandwidth (and, where required, a reverse-direction
// charge found via findRevLink). Stops early when a link lacks capacity; *steps
// receives how many links were actually charged so the caller can rewind by
// invoking this again with a negative bw. PCI hops through an Intel root complex
// are charged at the inflated INTEL_P2P_OVERHEAD rate.
static scclResult_t followPath(struct scclTopoLinkList* path, struct scclTopoNode* start, int maxSteps, float bw, int* steps) {
  float pciBw = bw;
  // First pass: detect a GPU path crossing an Intel CPU root complex and
  // inflate the PCI charge accordingly.
  for (int step = 0; step < path->count; step++) {
    struct scclTopoNode* node = path->list[step]->remNode;
    if (node->type == CPU) {
      // Account for P2P inefficiency through Intel CPU RC
      if (path->type == PATH_PHB && start->type == GPU && node->cpu.arch == SCCL_TOPO_CPU_ARCH_X86 && node->cpu.vendor == SCCL_TOPO_CPU_VENDOR_INTEL) {
        pciBw = INTEL_P2P_OVERHEAD(bw);
      }
    }
  }
  struct scclTopoNode* node = start;
  for (int step = 0; step < maxSteps; step++) {
    struct scclTopoLink* link = path->list[step];
    struct scclTopoLink* revLink = NULL;
    float fwBw = link->type == LINK_PCI ? pciBw : bw;
    float revBw = 0;
    // Entering a pre-Ampere GPU (cudaCompCap < 80) from a non-GPU costs 1/8 of
    // the forward bandwidth on the reverse link.
    if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
      if (revLink == NULL) SCCLCHECK(findRevLink(node, link->remNode, &revLink));
      revBw += fwBw / 8;
    }
    // NVLink into a CPU charges the full forward bandwidth on the reverse link.
    if (link->remNode->type == CPU && link->type == LINK_NVL) {
      if (revLink == NULL) SCCLCHECK(findRevLink(node, link->remNode, &revLink));
      revBw += fwBw;
    }
    if (link->bw < fwBw || (revBw && revLink->bw < revBw)) {
      // Out of capacity: report how far we got; caller rewinds.
      *steps = step;
      return scclSuccess;
    }
    SUB_ROUND(link->bw, fwBw);
    if (revBw) SUB_ROUND(revLink->bw, revBw);
    node = link->remNode;
  }
  *steps = maxSteps;
  return scclSuccess;
}
// Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
// On success *node is set to the destination node; *node is left NULL when the
// path exists but lacks bandwidth or has a worse link type than the graph allows
// (callers use a NULL *node as "this step is not viable"). type1 == -1 means
// "no origin": just return the destination.
static scclResult_t scclTopoFollowPath(struct scclTopoSystem* system, struct scclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct scclTopoNode** node) {
  // First handle easy cases
  *node = system->nodes[type2].nodes + index2;
  if (type1 == -1) return scclSuccess;
  struct scclTopoNode* node1 = system->nodes[type1].nodes + index1;
  struct scclTopoLinkList* path = node1->paths[type2] + index2;
  struct scclTopoNode* node2 = system->nodes[type2].nodes + index2;
  struct scclTopoLinkList* revPath = node2->paths[type1] + index1;
  if (path == NULL) {
    WARN("No path computed to go from %s/%d to %s/%d", topoNodeTypeStr[type1], index1, topoNodeTypeStr[type2], index2);
    return scclInternalError;
  }
  if (path->count == 0) return scclSuccess; // same node: nothing to charge
  // Now check link type
  *node = NULL;
  int intra = (type1 == GPU || type1 == NVS) && (type2 == GPU || type2 == NVS);
  float bw = intra ? graph->bwIntra : graph->bwInter;
  int type = intra ? graph->typeIntra : graph->typeInter;
  // When counting (mult == 1), refuse paths of a worse type than the graph
  // allows; tree patterns additionally require the reverse path to qualify.
  if (mult == 1 && (path->type > type)) return scclSuccess;
  if (mult == 1 && (graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == SCCL_TOPO_PATTERN_TREE || graph->pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) && (revPath->type > type)) return scclSuccess;
  bw *= mult; // negative bw undoes a previous charge
  // Check there is enough bandwidth on paths.
  int step = 0;
  SCCLCHECK(followPath(path, node1, path->count, bw, &step));
  if (step < path->count) goto rewind;
  // Enough bandwidth : return destination node.
  graph->nHops += mult * path->count;
  *node = system->nodes[type2].nodes + index2;
  return scclSuccess;
rewind:
  // Not enough bandwidth : rewind and exit.
  SCCLCHECK(followPath(path, node1, step, -bw, &step));
  return scclSuccess;
}
// Bandwidth of the PCI hop directly attached to `gpu`: the minimum of the GPU's
// outbound PCI link and the PCI switch's matching return link. Returns -1 when
// no such link pair exists.
// Fix: the inner loop previously reused `l`, shadowing the outer loop variable;
// renamed to `m`. Behavior is unchanged.
// NOTE(review): std::min of two float bandwidths is narrowed to the int return
// type here (used only as a sort score) — presumably intentional; confirm.
static int gpuPciBw(struct scclTopoNode* gpu) {
  for (int l = 0; l < gpu->nlinks; l++) {
    struct scclTopoLink* gpuLink = gpu->links + l;
    if (gpuLink->type != LINK_PCI) continue;
    struct scclTopoNode* pci = gpuLink->remNode;
    for (int m = 0; m < pci->nlinks; m++) {
      struct scclTopoLink* pciLink = pci->links + m;
      if (pciLink->remNode != gpu) continue;
      return std::min(gpuLink->bw, pciLink->bw);
    }
  }
  return -1;
}
/* Choose the order in which we try next GPUs. This is critical for the search
to quickly converge to the best solution even if it eventually times out. */
struct
scclGpuScore
{
int
g
;
// Retain the index
int
startIndex
;
// Least important
int
intraNhops
;
int
intraBw
;
int
interNhops
;
int
interPciBw
;
int
interBw
;
// Most important
};
static
int
cmpScore
(
const
void
*
g1
,
const
void
*
g2
)
{
struct
scclGpuScore
*
s1
=
(
struct
scclGpuScore
*
)
g1
;
struct
scclGpuScore
*
s2
=
(
struct
scclGpuScore
*
)
g2
;
int
d
;
if
((
d
=
(
s2
->
interBw
-
s1
->
interBw
)))
return
d
;
if
((
d
=
(
s2
->
interPciBw
-
s1
->
interPciBw
)))
return
d
;
if
((
d
=
(
s1
->
interNhops
-
s2
->
interNhops
)))
return
d
;
if
((
d
=
(
s2
->
startIndex
-
s1
->
startIndex
)))
return
d
;
if
((
d
=
(
s2
->
intraBw
-
s1
->
intraBw
)))
return
d
;
if
((
d
=
(
s1
->
intraNhops
-
s2
->
intraNhops
)))
return
d
;
return
s1
->
startIndex
-
s2
->
startIndex
;
}
// Return 0 when all entries share the first entry's intra-node score
// (intraBw, intraNhops); 1 as soon as any entry differs.
static int cmpIntraScores(struct scclGpuScore* scores, int count) {
  const int refBw = scores[0].intraBw;
  const int refHops = scores[0].intraNhops;
  for (int idx = 1; idx < count; idx++) {
    const int sameBw = (scores[idx].intraBw == refBw);
    const int sameHops = (scores[idx].intraNhops == refHops);
    if (!sameBw || !sameHops) return 1;
  }
  return 0;
}
// Linear search for the GPU node whose gpu.rank equals `rank`; stores its index
// in *index. WARNs and returns scclInternalError when absent.
static scclResult_t getGpuIndex(struct scclTopoSystem* system, int rank, int* index) {
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
      *index = g;
      return scclSuccess;
    }
  }
  WARN("Could not find gpu rank %d", rank);
  return scclInternalError;
}
// Linear search for the NET node with the given 64-bit id; stores its index in
// *index. WARNs and returns scclInternalError when absent.
static scclResult_t getNetIndex(struct scclTopoSystem* system, int64_t id, int* index) {
  const int nNets = system->nodes[NET].count;
  for (int pos = 0; pos < nNets; pos++) {
    if (system->nodes[NET].nodes[pos].id != id) continue;
    *index = pos;
    return scclSuccess;
  }
  WARN("Could not find net id %lx", id);
  return scclInternalError;
}
// Resolve the NET endpoint chosen for the channel currently being built
// (graph->inter[nChannels*2]) and expose its precomputed path table toward
// every GPU via *netPaths.
static scclResult_t getNetPaths(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoLinkList** netPaths) {
  int netId = graph->inter[graph->nChannels * 2];
  int n;
  SCCLCHECK(getNetIndex(system, netId, &n));
  *netPaths = system->nodes[NET].nodes[n].paths[GPU];
  return scclSuccess;
}
// Build the ordered candidate list of next GPUs to try after `gpu` for the
// current channel. Candidates are every reachable, not-yet-used GPU (the used
// bitmask bit for this channel), scored by scclGpuScore and sorted with
// cmpScore. sortNet != 0 additionally ranks by path quality toward the
// channel's chosen NET endpoint; sortNet == -1 reverses the order when all
// intra-node scores tie (to diversify across channels). Outputs the candidate
// GPU indices in next[] and their count in *countPtr.
scclResult_t scclTopoSearchNextGpuSort(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
  const uint64_t flag = 1ULL << (graph->nChannels); // this channel's bit in node->used
  int ngpus = system->nodes[GPU].count;
  struct scclTopoLinkList* paths = gpu->paths[GPU];
  struct scclTopoLinkList* netPaths = NULL;
  if (sortNet) SCCLCHECK(getNetPaths(system, graph, &netPaths));
  struct scclGpuScore scores[SCCL_TOPO_MAX_NODES];
  memset(scores, 0, ngpus * sizeof(struct scclGpuScore));
  int start = gpu - system->nodes[GPU].nodes; // index of the current GPU
  int count = 0;
  // Scan the other GPUs starting just after `start`, wrapping around.
  for (int i = 1; i < ngpus; i++) {
    int g = (start + i) % ngpus;
    if (paths[g].count == 0) continue;
    // There is no path to that GPU
    if (system->nodes[GPU].nodes[g].used & flag) continue; // already in this channel
    scores[count].g = g;
    scores[count].startIndex = i;
    scores[count].intraNhops = paths[g].count;
    scores[count].intraBw = paths[g].bw;
    if (netPaths) {
      scores[count].interNhops = netPaths[g].count;
      scores[count].interPciBw = gpuPciBw(system->nodes[GPU].nodes + g);
      scores[count].interBw = netPaths[g].bw;
    }
    count++;
  }
  // Sort GPUs
  qsort(scores, count, sizeof(struct scclGpuScore), cmpScore);
  // Check if all have the same intra-node score in which case we go reverse for sortNet = -1
  if (sortNet == -1 && cmpIntraScores(scores, count) == 0) {
    for (int i = 0; i < count; i++) next[i] = scores[count - 1 - i].g;
  } else {
    for (int i = 0; i < count; i++) next[i] = scores[i].g;
  }
  *countPtr = count;
  return scclSuccess;
}
scclResult_t
scclTopoSearchRec
(
struct
scclTopoSystem
*
system
,
struct
scclTopoGraph
*
graph
,
struct
scclTopoGraph
*
saveGraph
,
int
*
time
);
// Try to keep all searchs within one second
#define SCCL_SEARCH_GLOBAL_TIMEOUT (5ULL << 16)
#define SCCL_SEARCH_TIMEOUT (1 << 14)
#define SCCL_SEARCH_TIMEOUT_TREE (1 << 14)
#define SCCL_SEARCH_TIMEOUT_SAMECHANNELS (1 << 8)
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
// Replay helper: return in *g the index of the GPU that occupied position
// step+1 in the most recently completed channel, so a new channel can copy
// the previous channel's GPU order. Fails with scclInternalError when no
// channel exists yet or the recorded rank cannot be matched to a GPU node.
scclResult_t scclTopoReplayGetGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, int step, int* g) {
  *g = -1;
  if (graph->nChannels == 0) return scclInternalError;
  const int ngpus = system->nodes[GPU].count;
  // Rank recorded right after "step" in the last channel of graph->intra.
  const int wantedRank = graph->intra[(graph->nChannels - 1) * ngpus + step + 1];
  for (int idx = 0; idx < ngpus; idx++) {
    if (system->nodes[GPU].nodes[idx].gpu.rank == wantedRank) {
      *g = idx;
      return scclSuccess;
    }
  }
  return scclInternalError;  // rank not present among the GPU nodes
}
scclResult_t
scclTopoSearchRecGpu
(
struct
scclTopoSystem
*
system
,
struct
scclTopoGraph
*
graph
,
struct
scclTopoGraph
*
saveGraph
,
struct
scclTopoNode
*
gpu
,
int
step
,
int
backToNet
,
int
backToFirstRank
,
int
forcedOrder
,
int
*
time
);
// Attempt to extend the current channel with GPU "g", reached from node
// (type, index). If the path has enough bandwidth (scclTopoFollowPath with
// mult=1 succeeds), mark the GPU as used for this channel, recurse, then
// undo both the usage mark and the bandwidth reservation (mult=-1).
scclResult_t scclTopoSearchTryGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph,
                                  int step, int backToNet, int backToFirstRank, int forcedOrder, int* time,
                                  int type, int index, int g) {
  struct scclTopoNode* node = NULL;
  SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, 1, &node));
  if (node == NULL) return scclSuccess;  // not enough bandwidth along that path

  const uint64_t channelBit = 1ULL << (graph->nChannels);
  node->used ^= channelBit;  // mark as part of this channel
  SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, node, step, backToNet, backToFirstRank, forcedOrder, time));
  node->used ^= channelBit;  // unmark
  // Release the bandwidth reserved above.
  SCCLCHECK(scclTopoFollowPath(system, graph, type, index, GPU, g, -1, &node));
  return scclSuccess;
}
// Count, across all channels of "graph", the number of consecutive ring hops
// (GPU i -> GPU i+1, wrapping) that are direct single-hop LINK_NVL (XGMI)
// connections. Used to break ties between otherwise equivalent graphs.
static int scclTopoCountXGMI(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
  const int ngpus = system->nodes[GPU].count;
  int xgmiLinks = 0;
  for (int c = 0; c < graph->nChannels; c++) {
    for (int i = 0; i < ngpus; i++) {
      const int curRank = graph->intra[ngpus * c + i];
      const int nextRank = graph->intra[ngpus * c + ((i + 1) % ngpus)];
      // Locate the topology node holding curRank.
      int j = 0;
      while (j < ngpus && system->nodes[GPU].nodes[j].gpu.rank != curRank) j++;
      if (j == ngpus) continue;  // rank not found; nothing to count
      struct scclTopoNode* node = system->nodes[GPU].nodes + j;
      for (int k = 0; k < system->nodes[GPU].count; k++) {
        if (node->paths[GPU][k].count != 1) continue;  // only direct (1-hop) paths
        struct scclTopoLink* link = node->paths[GPU][k].list[0];
        if (link->remNode->gpu.rank == nextRank && link->type == LINK_NVL) xgmiLinks++;
      }
    }
  }
  return xgmiLinks;
}
// Try to complete an NVLS channel rooted at GPU "g": reserve NVS->GPU
// bandwidth for every GPU (weight 2 for g, 1 otherwise), then GPU->NVS
// bandwidth the same way. Only if both directions fit does the search recurse.
// All reservations made here are released (negative multiplier) before
// returning, whether or not the recursion happened.
scclResult_t scclTopoSearchTryNvls(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph,
                                   int g, int ngpus, int* time) {
  struct scclTopoNode* nvs;
  struct scclTopoNode* gpu;
  int d0 = 0;
  // See if there is enough bandwidth for NVS->GPU traffic
  do {
    SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? 2 : 1, &gpu));
    d0++;
  } while (gpu && d0 < system->nodes[GPU].count);
  if (gpu == NULL) {
    // Last follow failed and reserved nothing: don't revert it below.
    d0--;
  } else {
    int d1 = 0;
    // See if there is enough bandwidth for GPU->NVS traffic
    do {
      SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? 2 : 1, &nvs));
      d1++;
    } while (nvs && d1 < system->nodes[GPU].count);
    if (nvs == NULL) {
      // Last follow failed and reserved nothing: don't revert it below.
      d1--;
    } else {
      // Both directions worked. Move on to the next path.
      SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
    }
    // Release the GPU->NVS reservations made above.
    while (d1) {
      d1--;
      SCCLCHECK(scclTopoFollowPath(system, graph, GPU, d1, NVS, 0, d1 == g ? -2 : -1, &nvs));
    }
  }
  // Release the NVS->GPU reservations made above.
  while (d0) {
    d0--;
    SCCLCHECK(scclTopoFollowPath(system, graph, NVS, 0, GPU, d0, d0 == g ? -2 : -1, &gpu));
  }
  return scclSuccess;
}
// Compare the candidate "graph" against the best graph found so far
// ("refGraph") and set *copy = 1 when the candidate should become the new
// reference. Criteria, in order: minimum channel count, NVLS channel count,
// aggregate intra bandwidth (with a 15% bonus for not crossing NICs), hop
// count, then number of XGMI links.
scclResult_t scclTopoCompareGraphs(struct scclTopoSystem* system, struct scclTopoGraph* graph,
                                   struct scclTopoGraph* refGraph, int* copy) {
  // 1. Try to get the same nChannels between Rings and Trees
  if (graph->nChannels < graph->minChannels) return scclSuccess;

  if (graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
    // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
    const int ngpus = system->nodes[GPU].count;
    if (graph->nChannels > refGraph->nChannels && graph->nChannels <= ngpus) *copy = 1;
    return scclSuccess;
  }

  // 2. Try to get better bandwidth
  // Give a 15% perf bonus to paths not crossing nics
  float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
  if (graph->nChannels * graph->bwIntra > refGraph->nChannels * refGraph->bwIntra * target) {
    *copy = 1;
    return scclSuccess;
  }
  if (graph->nChannels * graph->bwIntra < refGraph->nChannels * refGraph->bwIntra * target) return scclSuccess;

  // 3. Less hops
  if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) {
    *copy = 1;
  }
  // 4. Prefer graph with more XGMI connections
  if (graph->nChannels == refGraph->nChannels &&
      scclTopoCountXGMI(system, refGraph) < scclTopoCountXGMI(system, graph)) {
    *copy = 1;
  }
  return scclSuccess;
}
// Build a list of the best NETs to try.
//
// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
// index when trying to get back to the NIC.
//
// The list is built the following way:
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
// might have been choosen by GPU 0 (case with multiple independent communicators per node)
// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
// Select the NETs to try, closest (lowest path type <= typeInter) first.
// gpu == -1 builds a list suitable for all GPUs; otherwise only paths from
// that GPU are considered. Per-GPU NIC lists are rotated by the GPU's NVML
// device number so co-located GPUs prefer different NICs (see the comment
// block above). nets[] receives the deduplicated NIC indices, *netCountRet
// their count.
scclResult_t scclTopoSelectNets(struct scclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
  int netCount = 0;
  int localNetCount;
  int* localNets;
  SCCLCHECK(scclCalloc(&localNets, system->nodes[NET].count));
  for (int t = 0; t <= typeInter; t++) {
    for (int g = 0; g < system->nodes[GPU].count; g++) {
      if (gpu != -1 && gpu != g) continue;
      localNetCount = 0;
      // Renamed from "gpu" to avoid shadowing the int parameter of the same name.
      struct scclTopoNode* gpuNode = system->nodes[GPU].nodes + g;
      struct scclTopoLinkList* paths = gpuNode->paths[NET];
      for (int n = 0; n < system->nodes[NET].count; n++) {
        if (paths[n].type == t) localNets[localNetCount++] = n;
      }
      if (localNetCount == 0) continue;
      // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
      // with multiple NICs don't use the same one as first choice.
      for (int r = 0; r < system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
        int net0 = localNets[0];
        for (int i = 0; i < localNetCount - 1; i++) localNets[i] = localNets[i + 1];
        localNets[localNetCount - 1] = net0;
      }
      // Append NICs to list (deduplicated).
      for (int i = 0; i < localNetCount; i++) {
        int n = localNets[i];
        int found = 0;
        // FIX: test the bound before dereferencing. The original condition
        // "nets[found] != n && found < netCount" read nets[netCount] (one past
        // the logical end, out of bounds when the list is full) before the
        // bounds check.
        while (found < netCount && nets[found] != n) found++;
        if (found == netCount) nets[netCount++] = n;
      }
    }
  }
  *netCountRet = netCount;
  free(localNets);
  return scclSuccess;
}
// Recursive core of the channel search, invoked once per GPU position "step".
// When step == ngpus the channel is complete: it is compared against the best
// graph so far and the search recurses to add further channels. Otherwise the
// current GPU is recorded in graph->intra and the search branches to:
//  - the return NIC (step == backToNet),
//  - the NVLS probe (pattern SCCL_TOPO_PATTERN_NVLS),
//  - the next GPU (forced PCI order, replay of the last channel, or sorted
//    candidates from scclTopoSearchNextGpuSort),
//  - back to the first GPU (step == backToFirstRank), or
//  - directly to the next channel.
// *time is a decrementing search budget; it is set to -1 to signal "stop,
// solution is good enough".
scclResult_t scclTopoSearchRecGpu(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph,
                                  struct scclTopoNode* gpu, int step, int backToNet, int backToFirstRank,
                                  int forcedOrder, int* time) {
  if ((*time) <= 0) return scclSuccess;  // search budget exhausted
  (*time)--;

  int ngpus = system->nodes[GPU].count;
  if (step == ngpus) {
    // Determine whether we found a better solution or not
    int copy = 0;
    graph->nChannels++;
    SCCLCHECK(scclTopoCompareGraphs(system, graph, saveGraph, &copy));
    if (copy) {
      memcpy(saveGraph, graph, sizeof(struct scclTopoGraph));
      // Reached the channel ceiling: signal the whole search to stop.
      if (graph->nChannels == graph->maxChannels) *time = -1;
    }
    if (graph->nChannels < graph->maxChannels) {
      SCCLCHECK(scclTopoSearchRec(system, graph, saveGraph, time));
    }
    graph->nChannels--;  // backtrack
    return scclSuccess;
  }

  // Record this GPU at position "step" of the channel under construction.
  graph->intra[graph->nChannels * ngpus + step] = gpu->gpu.rank;
  int g = gpu - system->nodes[GPU].nodes;
  if (step == backToNet) {
    // first get back to NIC
    if (system->nodes[NET].count) {
      int startNetIndex;
      SCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels * 2], &startNetIndex));
      struct scclTopoNode* startNet = system->nodes[NET].nodes + startNetIndex;
      int netcount;
      int* nets;
      SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
      SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
      for (int i = 0; i < netcount; i++) {
        int n = nets[i];
        struct scclTopoNode* net = system->nodes[NET].nodes + n;
        // Trees are symmetric: must exit through the entry NIC.
        if (graph->pattern == SCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue;
        // Without crossNic, the exit NIC must share asic and port with the entry NIC.
        if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;

        // Balanced Tree : count half of the bandwidth on first two GPUs
        int nextBackToNet = -1;
        float bwInterSave = graph->bwInter;
        if (graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) {
          // Count half of the bandwidth on each of the first two GPUs
          if (step == 0) nextBackToNet = 1;
          else if (net->id != graph->inter[graph->nChannels * 2 + 1]) continue;
          graph->bwInter /= 2;
        }
        SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
        graph->bwInter = bwInterSave;
        if (net) {
          graph->inter[graph->nChannels * 2 + 1] = net->id;  // record exit NIC
          SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
          // Release the path with the same (halved) bandwidth it was reserved with.
          if (graph->pattern == SCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
          SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
          graph->bwInter = bwInterSave;
        }
      }
      free(nets);
    }
  } else if (graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
    SCCLCHECK(scclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
  } else if (step < system->nodes[GPU].count - 1) {
    // Go to next GPU
    int next[SCCL_TOPO_MAX_NODES];
    int count;
    if (forcedOrder == FORCED_ORDER_PCI) {
      // Try the PCI order
      next[0] = step + 1;
      count = 1;
    } else if (forcedOrder == FORCED_ORDER_REPLAY) {
      // Try last channel order
      SCCLCHECK(scclTopoReplayGetGpu(system, graph, step, next));
      count = 1;
    } else {
      // Normal search; sortNet = 1 only when the next GPU must return to the NIC.
      SCCLCHECK(scclTopoSearchNextGpuSort(system, graph, gpu, next, &count,
                                          backToNet == -1 ? 0 : backToNet == step + 1 ? 1 : -1));
    }
    for (int i = 0; i < count; i++) {
      SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, step + 1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
    }
  } else if (step == backToFirstRank) {
    // Find first GPU and loop back to it
    int p;
    SCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels * ngpus], &p));
    struct scclTopoNode* firstGpu;
    SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
    if (firstGpu) {
      SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step + 1, backToNet, -1, forcedOrder, time));
      SCCLCHECK(scclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
    }
  } else {
    // Next path
    SCCLCHECK(scclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
  }
  return scclSuccess;
}
// Start the channel search from a NET node: for each candidate NIC (selected
// by scclTopoSelectNets), reserve "bw" on all NICs sharing its asic/port, try
// the possible first GPUs (NVLS balance, replay of the last channel, PCI
// order, then the GPUs with the best bandwidth/hops to the NIC), and release
// the reserved bandwidth before moving to the next NIC.
scclResult_t scclTopoSearchRecNet(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph,
                                  int backToNet, int backToFirstRank, int* time) {
  // FIX: bwInter is a float and the speed arrays contain fractional candidates
  // (2.4, 1.2, 0.24, 0.12). The original "const int bw" truncated those toward
  // zero, which disabled the net->net.bw < bw filter (bw == 0) and made the
  // reserve/release accounting below deduct the wrong amount.
  const float bw = graph->bwInter;
  int* nets;
  SCCLCHECK(scclCalloc(&nets, system->nodes[NET].count));
  int netcount;
  SCCLCHECK(scclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
  for (int i = 0; i < netcount; i++) {
    int n = nets[i];
    struct scclTopoNode* net = system->nodes[NET].nodes + n;
    struct scclTopoNode* gpu;
    if (graph->collNet && net->net.collSupport == 0) continue;  // collNet requires NIC support
    if (net->net.bw < bw) continue;                             // not enough NIC bandwidth left

    graph->inter[graph->nChannels * 2] = net->id;  // record entry NIC
    graph->latencyInter = net->net.latency;

    // Reserve bw on every NIC sharing this NIC's asic and port.
    for (int j = 0; j < system->nodes[NET].count; j++) {
      if ((system->nodes[NET].nodes[j].net.asic == net->net.asic) &&
          (system->nodes[NET].nodes[j].net.port == net->net.port)) {
        system->nodes[NET].nodes[j].net.bw -= bw;
      }
    }

    // NVLS needs to balance on all NICs
    if (graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
      SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, nets[graph->nChannels]));
    } else {
      if (graph->nChannels > 0) {
        // Try to replay the last channel
        int g;
        SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
        SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
      }
      if (graph->nChannels == 0 || graph->sameChannels == 0) {
        if (graph->nChannels == 0) {
          // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
          struct scclTopoLinkList* paths = net->paths[GPU];
          int f = 0, f_gdr = 0;
          // find the first GPU that is closest to NIC
          // (renamed loop counter to avoid shadowing the outer "i")
          for (int gi = 0; gi < system->nodes[GPU].count; gi++) {
            if (paths[gi].count <= paths[f].count) {
              // prefer GPU direct RDMA
              int gdr;
              SCCLCHECK(scclTopoCheckGdr(system, system->nodes[GPU].nodes[gi].id, net->id, 0, &gdr));
              if (paths[gi].count < paths[f].count || (paths[gi].count == paths[f].count && !f_gdr && gdr)) {
                f = gi;
                f_gdr = gdr;
              }
            }
          }
          int t = 1 << 10;  // small private budget for the PCI-order probe
          SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
          if (t == -1) *time = -1;  // probe found a perfect solution
        }
        // Then try the most local GPUs
        float maxBw = 0;
        int minHops = 0xfffffff;
        struct scclTopoLinkList* paths = net->paths[GPU];
        for (int g = 0; g < system->nodes[GPU].count; g++) {
          if (paths[g].bw > maxBw) {
            maxBw = paths[g].bw;
            minHops = paths[g].count;
          } else if (paths[g].bw == maxBw && paths[g].count < minHops) {
            minHops = paths[g].count;
          }
        }
        if (maxBw >= bw) {
          // In the first loop, avoid using GPUs in both directions between channels (one channel
          // sending from that GPU and one channel receiving to that GPU), since that usually leads
          // to lower BW.
          for (int tryGpuBidir = 0; tryGpuBidir < 2; tryGpuBidir++) {
            for (int g = 0; g < system->nodes[GPU].count; g++) {
              if (paths[g].bw == maxBw && paths[g].count == minHops) {
                gpu = system->nodes[GPU].nodes + g;
                int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
                if (tryGpuBidir == gpuUsed) {
                  SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
                }
              }
            }
          }
        }
      }
    }

    // Release the bandwidth reserved above on the matching NICs.
    for (int j = 0; j < system->nodes[NET].count; j++) {
      if ((system->nodes[NET].nodes[j].net.asic == net->net.asic) &&
          (system->nodes[NET].nodes[j].net.port == net->net.port)) {
        system->nodes[NET].nodes[j].net.bw += bw;
      }
    }
  }
  free(nets);
  return scclSuccess;
}
/* Search Patterns
*
* Intra-node
* Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a
* (=Split Tree Loop)
* Tree : GPU a -> GPU b -> .. -> GPU x
* (=Split Tree)
*
* Inter-node
* Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
* Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
* `--> NET n (or m if crossNic)
* Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
* `--> NET n (or m if crossNic)
*/
// Derive the two search termination points from the requested pattern (see
// the "Search Patterns" comment above): *backToNet is the GPU step at which
// the channel must return to a NIC (-1 for none) and *backToFirstRank the
// step at which a ring loops back to its first GPU (-1 for none).
scclResult_t scclTopoSearchParams(struct scclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
  const int ngpus = system->nodes[GPU].count;
  const int multiNode = system->nodes[NET].count && ngpus != system->nRanks;
  if (multiNode) {
    // Inter-node: channels end at a NIC and never loop back to the first rank.
    if (pattern == SCCL_TOPO_PATTERN_RING) {
      *backToNet = ngpus - 1;
    } else if (pattern == SCCL_TOPO_PATTERN_SPLIT_TREE) {
      *backToNet = 1;
    } else {
      *backToNet = 0;
    }
    *backToFirstRank = -1;
  } else {
    // Intra-node only: no NIC step; rings close the loop on the first GPU.
    *backToNet = -1;
    *backToFirstRank = (pattern == SCCL_TOPO_PATTERN_RING) ? ngpus - 1 : -1;
  }
  return scclSuccess;
}
// Top-level entry of the recursive search for one more channel. Inter-node
// searches start from a NET node; intra-node searches try (in order) the NVLS
// probe, the PCI order (first channel), a replay of the previous channel, and
// finally every GPU as a starting point unless identical channels are forced.
scclResult_t scclTopoSearchRec(struct scclTopoSystem* system, struct scclTopoGraph* graph, struct scclTopoGraph* saveGraph, int* time) {
  int backToNet, backToFirstRank;
  SCCLCHECK(scclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
  if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
    // Start from NET
    // FIX: this was the only call in the file whose scclResult_t was silently
    // dropped; wrap it in SCCLCHECK like every other call so errors propagate.
    SCCLCHECK(scclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time));
  } else {
    // Intra-node only.
    if (graph->pattern == SCCL_TOPO_PATTERN_NVLS) {
      SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, graph->nChannels));
      return scclSuccess;
    } else if (graph->nChannels == 0) {
      // Try PCI order first
      SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
    } else {
      // Also try to replay previous channel
      int g;
      SCCLCHECK(scclTopoReplayGetGpu(system, graph, -1, &g));
      SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
    }
    if (graph->sameChannels == 0 || graph->nChannels == 0) {
      // Finally, try all other possibilities unless we are forced to use the same channels
      for (int g = 0; g < system->nodes[GPU].count; g++) {
        SCCLCHECK(scclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
      }
    }
  }
  return scclSuccess;
}
/************************************/
/* User defined graph from XML file */
/************************************/
// Mapping between the XML "typeintra"/"typeinter" attribute strings and the
// PATH_* link-type constants, terminated by a {NULL, 0} sentinel. Consumed by
// kvConvertToInt (import) and kvConvertToStr (export) below.
struct kvDict kvDictLinkType[] = {{"LOC", PATH_LOC}, {"NVL", PATH_NVL}, {"NVB", PATH_NVB}, {"PIX", PATH_PIX},
                                  {"PXB", PATH_PXB}, {"PXN", PATH_PXN}, {"PHB", PATH_PHB}, {"SYS", PATH_SYS},
                                  {NULL, 0}};
// Import channel "c" from an XML <channel> node into graph->inter/intra:
// <net dev=...> children fill the channel's inter slots directly, while
// <gpu dev=...> children are translated from device number to rank.
// Returns scclSystemError when a device number matches no known GPU.
scclResult_t scclTopoGetChannelFromXml(struct scclXmlNode* xmlChannel, int c, struct scclTopoSystem* system, struct scclTopoGraph* graph) {
  const int ngpus = system->nodes[GPU].count;
  int* interSlots = graph->inter + 2 * c;
  int* intraSlots = graph->intra + ngpus * c;
  int nNet = 0;
  int nGpu = 0;
  for (int s = 0; s < xmlChannel->nSubs; s++) {
    struct scclXmlNode* sub = xmlChannel->subs[s];
    int dev;
    SCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
    if (strcmp(sub->name, "net") == 0) {
      interSlots[nNet++] = dev;
    } else if (strcmp(sub->name, "gpu") == 0) {
      // Translate the device number into the matching rank.
      int rank = -1;
      for (int idx = 0; idx < ngpus; idx++) {
        if (system->nodes[GPU].nodes[idx].gpu.dev == dev) rank = system->nodes[GPU].nodes[idx].gpu.rank;
      }
      if (rank == -1) {
        WARN("XML Import Channel : dev %d not found.", dev);
        return scclSystemError;
      }
      intraSlots[nGpu++] = rank;
    }
  }
  return scclSuccess;
}
// Import one XML <graph> node into "graph", but only when its "id" attribute
// matches graph->id (otherwise return success without touching anything).
// Graphs with crossnic=1 are also skipped when SCCL_CROSS_NIC disallows it.
// Reads the scalar attributes, then imports each <channel> child; *nChannels
// receives the number of channel sub-nodes.
scclResult_t scclTopoGetGraphFromXmlSub(struct scclXmlNode* xmlGraph, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
  int id;
  SCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
  if (graph->id != id) return scclSuccess;  // not the graph we are looking for

  int crossNic;
  SCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
  if (scclParamCrossNic() == 0 && crossNic == 1) return scclSuccess;  // crossNic disabled
  graph->crossNic = crossNic;

  SCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
  SCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
  SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra));
  SCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter));
  // "latencyinter" is optional; default to 0.0 when absent.
  if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != scclSuccess) graph->latencyInter = 0.0;

  // Link types are stored as strings (see kvDictLinkType).
  const char* str;
  SCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
  SCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
  SCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
  SCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType));
  SCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));

  for (int s = 0; s < xmlGraph->nSubs; s++) {
    SCCLCHECK(scclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
  }
  *nChannels = xmlGraph->nSubs;
  return scclSuccess;
}
// Walk every <graph> child under the <graphs> root and import the one whose
// id matches graph->id (non-matching children are skipped by the callee).
scclResult_t scclTopoGetGraphFromXml(struct scclXmlNode* xmlGraphs, struct scclTopoSystem* system, struct scclTopoGraph* graph, int* nChannels) {
  const int nSubs = xmlGraphs->nSubs;
  for (int child = 0; child < nSubs; child++) {
    struct scclXmlNode* sub = xmlGraphs->subs[child];
    SCCLCHECK(scclTopoGetGraphFromXmlSub(sub, system, graph, nChannels));
  }
  return scclSuccess;
}
/* And the reverse : graph->xml */
// Export channel "c" of "graph" as an XML <channel> node under "parent":
// entry NIC (when NETs exist), then one <gpu> per position (rank translated
// back to device number), then the exit NIC. Returns scclInternalError when a
// recorded rank matches no GPU node.
scclResult_t scclTopoGetXmlFromChannel(struct scclTopoGraph* graph, int c, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
  const int ngpus = system->nodes[GPU].count;
  const int hasNet = system->nodes[NET].count;
  int* interRanks = graph->inter + 2 * c;
  int* intraRanks = graph->intra + ngpus * c;

  struct scclXmlNode* channelNode;
  SCCLCHECK(xmlAddNode(xml, parent, "channel", &channelNode));
  struct scclXmlNode* child;
  if (hasNet) {
    // Entry NIC.
    SCCLCHECK(xmlAddNode(xml, channelNode, "net", &child));
    SCCLCHECK(xmlSetAttrInt(child, "dev", interRanks[0]));
  }
  for (int g = 0; g < ngpus; g++) {
    SCCLCHECK(xmlAddNode(xml, channelNode, "gpu", &child));
    // Translate the recorded rank back to its device number.
    int dev = -1;
    for (int i = 0; i < ngpus; i++) {
      if (system->nodes[GPU].nodes[i].gpu.rank == intraRanks[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
    }
    if (dev == -1) {
      WARN("XML Export Channel : rank %d not found.", intraRanks[g]);
      return scclInternalError;
    }
    SCCLCHECK(xmlSetAttrInt(child, "dev", dev));
  }
  if (hasNet) {
    // Exit NIC.
    SCCLCHECK(xmlAddNode(xml, channelNode, "net", &child));
    SCCLCHECK(xmlSetAttrInt(child, "dev", interRanks[1]));
  }
  return scclSuccess;
}
// Export "graph" as an XML <graph> node under "parent": scalar attributes
// first (link types converted back to strings via kvDictLinkType), then one
// <channel> child per channel.
scclResult_t scclTopoGetXmlFromGraph(struct scclTopoGraph* graph, struct scclTopoSystem* system, struct scclXml* xml, struct scclXmlNode* parent) {
  struct scclXmlNode* xmlGraph;
  SCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph));
  SCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id));
  SCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern));
  SCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic));
  SCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
  SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->bwIntra));
  SCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->bwInter));
  SCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
  // Link types are exported as strings (inverse of the import path).
  const char* str;
  SCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
  SCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
  SCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType));
  SCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str));
  SCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels));
  for (int c = 0; c < graph->nChannels; c++) {
    SCCLCHECK(scclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph));
  }
  return scclSuccess;
}
// Export all "ngraphs" graphs into "xml" under a versioned <graphs> root
// (inverse of scclTopoGetGraphFromXml).
scclResult_t scclTopoGetXmlFromGraphs(int ngraphs, struct scclTopoGraph** graphs, struct scclTopoSystem* system, struct scclXml* xml) {
  xml->maxIndex = 0;  // start from an empty XML tree
  struct scclXmlNode* root;
  SCCLCHECK(xmlAddNode(xml, NULL, "graphs", &root));
  SCCLCHECK(xmlSetAttrInt(root, "version", SCCL_GRAPH_XML_VERSION));
  for (int idx = 0; idx < ngraphs; idx++) {
    SCCLCHECK(scclTopoGetXmlFromGraph(graphs[idx], system, xml, root));
  }
  return scclSuccess;
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// HIP/AMD builds: candidate per-channel bandwidth values tried by the search,
// in decreasing order (the search walks down these arrays until a graph fits).
float speedArrayIntra[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
float speedArrayInter[] = {48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
#else
// Non-HIP builds: separate decreasing candidate lists for intra-node and
// inter-node links.
float speedArrayIntra[] = {40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0};
float speedArrayInter[] = {48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA (sizeof(speedArrayIntra) / sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter) / sizeof(float))
// Candidate lists for sm90-class devices (selected elsewhere; not referenced
// in this chunk).
float sm90SpeedArrayIntra[] = {60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0};
float sm90SpeedArrayInter[] = {48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12};
#define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra) / sizeof(float))
#define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter) / sizeof(float))
#endif
// Tunable parameters read from the environment (rcclParamModelMatchingDisable
// is consulted by scclTopoCompute below; default 0 = model matching enabled).
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
RCCL_PARAM(NChannels, "NCHANNELS", 0);
scclResult_t
scclTopoCompute
(
scclTopoSystem
*
system
,
struct
scclTopoGraph
*
graph
)
{
int
ngpus
=
system
->
nodes
[
GPU
].
count
;
graph
->
crossNic
=
scclParamCrossNic
();
int
crossNic
=
(
system
->
nodes
[
NET
].
count
>
1
)
&&
graph
->
crossNic
&&
(
graph
->
pattern
==
SCCL_TOPO_PATTERN_RING
||
graph
->
pattern
==
SCCL_TOPO_PATTERN_BALANCED_TREE
||
graph
->
pattern
==
SCCL_TOPO_PATTERN_SPLIT_TREE
)
?
1
:
0
;
graph
->
bwIntra
=
graph
->
bwInter
=
0
;
graph
->
latencyInter
=
0
;
if
(
graph
->
crossNic
==
2
)
graph
->
crossNic
=
0
;
graph
->
typeIntra
=
ngpus
==
1
?
PATH_LOC
:
PATH_NVL
;
graph
->
typeInter
=
PATH_PIX
;
graph
->
nChannels
=
0
;
graph
->
nIntraChannels
=
0
;
memset
(
graph
->
intraNets
,
0
,
MAXCHANNELS
*
SCCL_TOPO_MAX_NODES
*
2
*
sizeof
(
int
));
int
trySameChannels
=
graph
->
pattern
==
SCCL_TOPO_PATTERN_NVLS
?
0
:
1
;
graph
->
sameChannels
=
trySameChannels
;
char
*
str
=
getenv
(
"SCCL_GRAPH_FILE"
);
if
(
str
)
{
INFO
(
SCCL_ENV
,
"SCCL_GRAPH_FILE set by environment to %s"
,
str
);
struct
scclXml
*
xml
;
SCCLCHECK
(
scclCalloc
(
&
xml
,
1
));
SCCLCHECK
(
scclTopoGetXmlGraphFromFile
(
str
,
xml
));
int
nChannels
;
SCCLCHECK
(
scclTopoGetGraphFromXml
(
xml
->
nodes
,
system
,
graph
,
&
nChannels
));
INFO
(
SCCL_GRAPH
,
"Search %d : %d channels loaded from XML graph"
,
graph
->
id
,
nChannels
);
free
(
xml
);
if
(
graph
->
nChannels
>
0
)
return
scclSuccess
;
}
str
=
getenv
(
"SCCL_RINGS"
);
char
*
strTrees
=
getenv
(
"RCCL_TREES"
);
if
(
str
||
strTrees
)
{
// user supplied topo
if
(
strTrees
)
{
SCCLCHECK
(
parseGraphLight
(
strTrees
,
system
,
graph
,
NULL
));
system
->
treeDefined
=
true
;
}
else
{
SCCLCHECK
(
parseGraph
(
str
,
system
,
graph
,
NULL
,
NULL
));
int
arch
,
vendor
,
model
;
SCCLCHECK
(
scclTopoCpuType
(
system
,
&
arch
,
&
vendor
,
&
model
));
if
(
graph
->
nChannels
&&
arch
==
SCCL_TOPO_CPU_ARCH_X86
&&
vendor
==
SCCL_TOPO_CPU_VENDOR_AMD
&&
model
==
SCCL_TOPO_CPU_TYPE_ROME
)
{
system
->
type
|=
RCCL_TOPO_4P2H_ROME
;
}
}
}
else
if
(
!
rcclParamModelMatchingDisable
()
&&
!
graph
->
collNet
)
{
// try to match 8P6L
SCCLCHECK
(
parseChordalRing
(
system
,
graph
));
if
(
graph
->
nChannels
)
return
scclSuccess
;
// try to match Rome 4P2H
SCCLCHECK
(
parseRome4P2H
(
system
,
graph
));
if
(
graph
->
nChannels
)
return
scclSuccess
;
// try to match 1H16P
SCCLCHECK
(
parse1H16P
(
system
,
graph
));
if
(
graph
->
nChannels
)
return
scclSuccess
;
// try to match 4H4P
SCCLCHECK
(
parse4H4P
(
system
,
graph
));
}
if
(
graph
->
nChannels
)
return
scclSuccess
;
if
((
graph
->
pattern
==
SCCL_TOPO_PATTERN_RING
)
&&
(
system
->
type
&
RCCL_TOPO_4P2H_ROME
)
&&
(
ngpus
==
system
->
nRanks
))
{
// limit single node max channels when searching ring graph on Rome
graph
->
maxChannels
=
2
;
}
if
(
ngpus
==
1
)
if
(
graph
->
pattern
!=
SCCL_TOPO_PATTERN_RING
)
graph
->
pattern
=
SCCL_TOPO_PATTERN_TREE
;
int
ccMin
;
SCCLCHECK
(
scclTopoGetCompCap
(
system
,
&
ccMin
,
NULL
));
if
(
graph
->
pattern
==
SCCL_TOPO_PATTERN_NVLS
&&
(
system
->
nodes
[
NVS
].
count
==
0
||
ccMin
<
90
))
return
scclSuccess
;
if
(
ngpus
==
1
)
if
(
graph
->
pattern
!=
SCCL_TOPO_PATTERN_RING
)
graph
->
pattern
=
SCCL_TOPO_PATTERN_TREE
;
if
(
system
->
nodes
[
NET
].
count
==
0
&&
graph
->
pattern
==
SCCL_TOPO_PATTERN_NVLS
)
{
// Force intra-node NVLS algorithm to pull evenly from all GPUs.
graph
->
minChannels
=
graph
->
maxChannels
=
system
->
nodes
[
GPU
].
count
;
}
struct
scclTopoGraph
tmpGraph
;
memcpy
(
&
tmpGraph
,
graph
,
sizeof
(
struct
scclTopoGraph
));
// First try crossnic, then decrease bw and finally increase bwIntra.
int
nspeeds
=
0
;
float
*
speedArray
=
NULL
;
if
(
system
->
nodes
[
NET
].
count
==
0
)
{
nspeeds
=
NSPEEDSINTRA
;
speedArray
=
speedArrayIntra
;
}
else
{
nspeeds
=
NSPEEDSINTER
;
speedArray
=
speedArrayInter
;
}
int
pass
=
1
;
int
speedIndex
=
0
;
float
maxBw
=
system
->
maxBw
;
float
totalBw
=
system
->
totalBw
;
if
(
ngpus
==
1
||
graph
->
pattern
!=
SCCL_TOPO_PATTERN_RING
)
totalBw
*=
ngpus
*
1.0
/
(
ngpus
-
1
);
while
((
speedArray
[
speedIndex
]
>
maxBw
||
speedArray
[
speedIndex
]
*
graph
->
minChannels
>
totalBw
)
&&
speedIndex
<
nspeeds
-
1
)
speedIndex
++
;
tmpGraph
.
bwIntra
=
tmpGraph
.
bwInter
=
speedArray
[
speedIndex
];
int64_t
globalTimeout
=
SCCL_SEARCH_GLOBAL_TIMEOUT
;
search:
int
time
=
tmpGraph
.
sameChannels
?
SCCL_SEARCH_TIMEOUT_SAMECHANNELS
:
tmpGraph
.
pattern
==
SCCL_TOPO_PATTERN_TREE
?
SCCL_SEARCH_TIMEOUT_TREE
:
SCCL_SEARCH_TIMEOUT
;
tmpGraph
.
nChannels
=
0
;
globalTimeout
-=
time
;
SCCLCHECK
(
scclTopoSearchRec
(
system
,
&
tmpGraph
,
graph
,
&
time
));
#if 0
printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[c*2+0], graph->inter[c*2+1]);
printf("\n");
}
#endif
// Optimal solution, stop here
if
(
time
==
-
1
)
goto
done
;
if
(
graph
->
nChannels
*
graph
->
bwInter
>=
system
->
totalBw
)
goto
done
;
if
(
pass
==
1
)
{
// First pass, we don't have a solution yet ; try other options
// Try having different channels
if
(
tmpGraph
.
sameChannels
==
1
)
{
tmpGraph
.
sameChannels
=
0
;
goto
search
;
}
tmpGraph
.
sameChannels
=
trySameChannels
;
if
(
time
!=
-
1
)
globalTimeout
+=
time
;
else
globalTimeout
=
SCCL_SEARCH_GLOBAL_TIMEOUT
;
if
(
globalTimeout
<
0
&&
graph
->
nChannels
)
goto
done
;
tmpGraph
.
pattern
=
graph
->
pattern
;
int
maxTypeIntra
=
system
->
nodes
[
NET
].
count
>
0
?
tmpGraph
.
typeInter
:
PATH_SYS
;
if
(
tmpGraph
.
typeIntra
<
maxTypeIntra
&&
(
graph
->
nChannels
==
0
||
tmpGraph
.
typeIntra
<
graph
->
typeIntra
))
{
tmpGraph
.
typeIntra
+=
1
;
goto
search
;
}
tmpGraph
.
typeIntra
=
ngpus
==
1
?
PATH_LOC
:
PATH_NVL
;
if
(
system
->
nodes
[
NET
].
count
>
0
&&
tmpGraph
.
typeInter
<
PATH_SYS
&&
(
graph
->
nChannels
==
0
||
tmpGraph
.
typeInter
<
graph
->
typeInter
||
tmpGraph
.
typeInter
<
PATH_PXN
))
{
tmpGraph
.
typeInter
+=
1
;
goto
search
;
}
tmpGraph
.
typeInter
=
PATH_PIX
;
if
(
crossNic
&&
tmpGraph
.
crossNic
==
0
)
{
// Try again with crossNic if permitted
tmpGraph
.
crossNic
=
crossNic
;
goto
search
;
}
tmpGraph
.
crossNic
=
0
;
// Decrease bw until we find a solution
if
((
speedIndex
<
nspeeds
-
1
)
&&
(
graph
->
nChannels
==
0
||
(
speedArray
[
speedIndex
+
1
]
/
graph
->
bwInter
>
.49
)))
{
tmpGraph
.
bwInter
=
tmpGraph
.
bwIntra
=
speedArray
[
++
speedIndex
];
goto
search
;
}
speedIndex
=
0
;
while
(
speedArray
[
speedIndex
]
>
maxBw
&&
speedIndex
<
nspeeds
-
1
)
speedIndex
++
;
tmpGraph
.
bwIntra
=
tmpGraph
.
bwInter
=
speedArray
[
speedIndex
];
}
done:
// We have a solution. Start from that solution and move to pass 2.
if
(
pass
==
1
)
{
time
=
-
1
;
memcpy
(
&
tmpGraph
,
graph
,
sizeof
(
tmpGraph
));
speedIndex
=
0
;
while
(
speedArray
[
speedIndex
]
>
graph
->
bwInter
&&
speedIndex
<
nspeeds
-
1
)
speedIndex
++
;
tmpGraph
.
bwIntra
=
tmpGraph
.
bwInter
=
speedArray
[
speedIndex
];
tmpGraph
.
minChannels
=
graph
->
nChannels
;
pass
=
2
;
}
// 3. See if we can increase bwIntra for trees (2 nodes or collnet)
if
(
pass
==
2
)
{
if
(
time
!=
0
&&
graph
->
pattern
!=
SCCL_TOPO_PATTERN_RING
&&
tmpGraph
.
bwIntra
==
graph
->
bwIntra
&&
tmpGraph
.
bwIntra
<
tmpGraph
.
bwInter
*
2
&&
speedIndex
>
0
)
{
tmpGraph
.
bwIntra
=
speedArray
[
--
speedIndex
];
goto
search
;
}
time
=
-
1
;
memcpy
(
&
tmpGraph
,
graph
,
sizeof
(
tmpGraph
));
}
if
(
graph
->
nChannels
==
0
&&
graph
->
collNet
==
0
&&
graph
->
pattern
!=
SCCL_TOPO_PATTERN_NVLS
)
{
WARN
(
"Could not find a path for pattern %d, falling back to simple order"
,
graph
->
pattern
);
for
(
int
i
=
0
;
i
<
ngpus
;
i
++
)
graph
->
intra
[
i
]
=
system
->
nodes
[
GPU
].
nodes
[
i
].
gpu
.
rank
;
graph
->
inter
[
0
]
=
graph
->
inter
[
1
]
=
0
;
graph
->
bwIntra
=
graph
->
bwInter
=
0.1
;
graph
->
typeIntra
=
graph
->
typeInter
=
PATH_SYS
;
graph
->
nChannels
=
1
;
}
if
(
graph
->
nChannels
==
0
)
return
scclSuccess
;
if
(
graph
->
pattern
==
SCCL_TOPO_PATTERN_NVLS
)
return
scclSuccess
;
if
(
graph
->
bwIntra
<
25.0
)
return
scclSuccess
;
if
(
ccMin
>
80
&&
graph
->
bwIntra
<
50.0
&&
graph
->
nChannels
>
4
)
return
scclSuccess
;
int
dupChannels
=
std
::
min
(
graph
->
nChannels
*
2
,
graph
->
maxChannels
);
memcpy
(
graph
->
intra
+
graph
->
nChannels
*
ngpus
,
graph
->
intra
,
(
dupChannels
-
graph
->
nChannels
)
*
ngpus
*
sizeof
(
int
));
memcpy
(
graph
->
inter
+
graph
->
nChannels
*
2
,
graph
->
inter
,
(
dupChannels
-
graph
->
nChannels
)
*
2
*
sizeof
(
int
));
graph
->
bwIntra
/=
DIVUP
(
dupChannels
,
graph
->
nChannels
);
graph
->
bwInter
/=
DIVUP
(
dupChannels
,
graph
->
nChannels
);
graph
->
nChannels
=
dupChannels
;
int
nc
=
rcclParamNChannels
();
if
(
graph
->
nChannels
>
0
&&
nc
>
0
&&
nc
<=
MAXCHANNELS
/
2
&&
nc
>
graph
->
nChannels
)
{
int
nChannels
=
nc
-
graph
->
nChannels
;
int
nnets
=
system
->
nodes
[
NET
].
count
;
if
(
nnets
<=
2
)
{
for
(
int
i
=
0
;
i
<
nChannels
;
++
i
)
{
memcpy
(
graph
->
intra
+
graph
->
nChannels
*
ngpus
,
graph
->
intra
,
ngpus
*
sizeof
(
int
));
memcpy
(
graph
->
inter
+
graph
->
nChannels
*
2
,
graph
->
inter
,
2
*
sizeof
(
int
));
memcpy
(
graph
->
intraNets
+
graph
->
nChannels
*
ngpus
*
2
,
graph
->
intraNets
,
2
*
ngpus
*
sizeof
(
int
));
graph
->
nChannels
++
;
}
}
else
{
typedef
struct
{
int
id
;
int
used
;
}
Net
;
Net
nets
[
nnets
];
auto
sortFunc
=
[](
const
void
*
a
,
const
void
*
b
)
->
int
{
return
((
Net
*
)
a
)
->
used
-
((
Net
*
)
b
)
->
used
;
};
memset
(
nets
,
0
,
nnets
*
sizeof
(
Net
));
for
(
int
i
=
0
;
i
<
nnets
;
++
i
)
{
nets
[
i
].
id
=
system
->
nodes
[
NET
].
nodes
[
i
].
id
;
}
for
(
int
i
=
0
;
i
<
graph
->
nChannels
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nnets
;
++
j
)
{
if
(
nets
[
j
].
id
==
*
(
graph
->
inter
+
i
*
2
)
||
nets
[
j
].
id
==
*
(
graph
->
inter
+
i
*
2
+
1
))
{
nets
[
j
].
used
++
;
}
}
}
for
(
int
i
=
0
;
i
<
nChannels
;
++
i
)
{
memcpy
(
graph
->
intra
+
graph
->
nChannels
*
ngpus
,
graph
->
intra
,
ngpus
*
sizeof
(
int
));
qsort
(
nets
,
nnets
,
sizeof
(
Net
),
sortFunc
);
*
(
graph
->
inter
+
graph
->
nChannels
*
2
)
=
nets
[
0
].
id
;
nets
[
0
].
used
++
;
qsort
(
nets
,
nnets
,
sizeof
(
Net
),
sortFunc
);
if
(
graph
->
crossNic
==
0
||
graph
->
crossNic
==
2
)
{
*
(
graph
->
inter
+
graph
->
nChannels
*
2
+
1
)
=
nets
[
0
].
id
;
nets
[
0
].
used
++
;
qsort
(
nets
,
nnets
,
sizeof
(
Net
),
sortFunc
);
}
else
{
nets
[
0
].
used
++
;
qsort
(
nets
,
nnets
,
sizeof
(
Net
),
sortFunc
);
*
(
graph
->
inter
+
graph
->
nChannels
*
2
+
1
)
=
nets
[
0
].
id
;
}
nets
[
0
].
used
++
;
memcpy
(
graph
->
intraNets
+
graph
->
nChannels
*
ngpus
*
2
,
graph
->
intraNets
,
2
*
ngpus
*
sizeof
(
int
));
graph
->
nChannels
++
;
}
}
graph
->
bwIntra
/=
DIVUP
(
nc
,
graph
->
nChannels
);
graph
->
bwInter
/=
DIVUP
(
nc
,
graph
->
nChannels
);
}
return
scclSuccess
;
}
// Log a human-readable summary of a search-result graph: one INFO line with
// the global parameters, then one line per channel listing its GPU (and,
// where applicable, NET) hops in order.
scclResult_t scclTopoPrintGraph(struct scclTopoSystem* system, struct scclTopoGraph* graph) {
  INFO(SCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, bw %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->bwIntra, graph->bwInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
  int ngpus = system->nodes[GPU].count;
  // NOTE(review): fixed-size buffer with unbounded sprintf appends below could
  // overflow for very large GPU counts -- consider snprintf; confirm limits.
  char line[1024];
  for (int c = 0; c < graph->nChannels; c++) {
    sprintf(line, "%2d :", c);
    int offset = strlen(line);
    // Multi-node case (local GPU count != nRanks) without per-channel intra
    // NICs: print the channel's first NET device before the GPU chain.
    if (system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
      sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c]);
      offset = strlen(line);
    }
    for (int i = 0; i < ngpus; i++) {
      // intraNets stores NET indices biased by 'N'; a value outside
      // [0, NET count) means "no NIC at this hop".
      int n = graph->intraNets[(ngpus * c + i) * 2] - 'N';
      if (n >= 0 && n < system->nodes[NET].count) {
        sprintf(line + offset, " NET/%d", n);
        offset = strlen(line);
      }
      sprintf(line + offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus * c + i]);
      offset = strlen(line);
      n = graph->intraNets[(ngpus * c + i) * 2 + 1] - 'N';
      if (n >= 0 && n < system->nodes[NET].count) {
        sprintf(line + offset, " NET/%d", n);
        offset = strlen(line);
      }
    }
    // Second NET device of the channel, mirroring the block above.
    if (system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
      sprintf(line + offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2 * c + 1]);
      offset = strlen(line);
    }
    INFO(SCCL_GRAPH, "%s", line);
  }
  return scclSuccess;
}
// Optionally dump the computed graphs to an XML file.
// The target path is taken from the SCCL_GRAPH_DUMP_FILE environment
// variable; when it is unset this function is a no-op.
scclResult_t scclTopoDumpGraphs(struct scclTopoSystem* system, int ngraphs, struct scclTopoGraph** graphs) {
  char* str = getenv("SCCL_GRAPH_DUMP_FILE");
  if (str) {
    INFO(SCCL_ENV, "SCCL_GRAPH_DUMP_FILE set by environment to %s", str);
    struct scclXml* xml;
    SCCLCHECK(scclCalloc(&xml, 1));
    // Free xml on every path: the previous code returned through SCCLCHECK on
    // failure of either call below and leaked the allocation.
    scclResult_t ret = scclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml);
    if (ret == scclSuccess) ret = scclTopoDumpXmlToFile(str, xml);
    free(xml);
    if (ret != scclSuccess) return ret;
  }
  return scclSuccess;
}
#include "comm.h"
// NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
// Find the NIC associated with our rank being the head of an NVLS channel.
// On success *dev receives the channel's first inter device; if this rank
// heads no channel, warn and report an internal error.
scclResult_t getNvlsNetDev(struct scclComm* comm, struct scclTopoGraph* graph, int* dev) {
  const int nLocalGpus = comm->topo->nodes[GPU].count;
  int chan = 0;
  while (chan < graph->nChannels) {
    const bool isHead = (graph->intra[chan * nLocalGpus] == comm->rank);
    if (isHead) {
      *dev = graph->inter[chan * 2];
      return scclSuccess;
    }
    chan++;
  }
  WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
  return scclInternalError;
}
// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
SCCL_PARAM
(
P2pPxnLevel
,
"P2P_PXN_LEVEL"
,
2
);
// Select the network device (*dev) and proxy rank (*proxyRank) that `rank`
// should use for `channelId`.
//
// Three cases:
//  * graph != NULL: honor the NIC recorded in the search graph (NVLS graphs
//    resolve the NIC via getNvlsNetDev rather than the inter[] table).
//  * graph == NULL and peerRank == -1: invalid request.
//  * graph == NULL: point-to-point case -- start from our local NIC, then
//    possibly adopt the peer's preferred NIC (PXN) per SCCL_P2P_PXN_LEVEL.
scclResult_t scclTopoGetNetDev(struct scclComm* comm, int rank, struct scclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
  if (graph) {
    // Honor the net device in the graph
    int channel = channelId % graph->nChannels;
    int ngpus = comm->topo->nodes[GPU].count;
    // index 0 when this rank is the channel's first GPU, 1 otherwise.
    int index = graph->intra[channel * ngpus] == rank ? 0 : 1;
    if (graph->pattern != SCCL_TOPO_PATTERN_NVLS) {
      *dev = graph->inter[channel * 2 + index];
    } else {
      SCCLCHECK(getNvlsNetDev(comm, graph, dev));
    }
    SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
  } else if (peerRank == -1) {
    return scclInternalError;
  } else {
    // Start with our local NIC and local Rank
    SCCLCHECK(scclTopoGetLocalNet(comm->topo, rank, channelId, dev));
    *proxyRank = rank;
    int pxnLevel = scclPxnDisable(comm) == 1 ? 0 : scclParamP2pPxnLevel();
    // See whether we can use the remote rank preferred device.
    if (scclParamCrossNic() == 0 || (pxnLevel != 0)) {
      // Find local NIC number close to local cudaDev
      int cudaDev = comm->peerInfo[peerRank].cudaDev;
      int localRank;
      // Peer GPU not found in our topology: keep the local NIC chosen above.
      if (scclTopoDevToRank(comm->topo, cudaDev, &localRank) != scclSuccess) return scclSuccess;
      int netDev;
      SCCLCHECK(scclTopoGetLocalNet(comm->topo, localRank, channelId, &netDev));
      int n;
      // Check that device exists on our node
      if (scclParamCrossNic() == 0) {
        if (scclTopoIdToIndex(comm->topo, NET, netDev, &n) != scclSuccess) {
          WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
          return scclInvalidUsage;
        }
        *dev = netDev;
      }
      if (pxnLevel == 1) {
        // PXN level 1: adopt the peer's NIC only if our own GPU can reach it
        // over a PATH_PXN-or-better path.
        int g, n;
        SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g));
        SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
        struct scclTopoNode* gpu = comm->topo->nodes[GPU].nodes + g;
        if (gpu->paths[NET][n].type <= PATH_PXN) {
          *dev = netDev;
          SCCLCHECK(scclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
        }
      } else if (pxnLevel == 2) {
        // Check which local GPU corresponds to that NIC and see if we can use PXN.
        int n, g1, g2;
        SCCLCHECK(scclTopoIdToIndex(comm->topo, NET, netDev, &n));
        SCCLCHECK(scclTopoRankToIndex(comm->topo, rank, &g1));
        SCCLCHECK(scclTopoGetLocalGpu(comm->topo, netDev, &g2));
        if (g2 != -1) {
          struct scclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes + g2;
          // Proxy through the GPU attached to the NIC when it is NVLink-close
          // to us and PXB-or-better to the NIC.
          if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
            *proxyRank = peerGpu->gpu.rank;
            *dev = netDev;
            return scclSuccess;
          }
        }
      }
    }
  }
  return scclSuccess;
}
// Look up the intra-node NET device assigned to `rank` on an intra channel.
// `type` selects the first (0) or second (1) NIC slot of the GPU's entry in
// graph->intraNets. *dev is set to -1 when no valid NIC is recorded.
scclResult_t scclTopoGetIntraNetDev(struct scclTopoSystem* system, int rank, struct scclTopoGraph* graph, int channelId, int type, int* dev) {
  *dev = -1;
  if (!graph || !graph->nIntraChannels) return scclSuccess;

  const int gpuCount = system->nodes[GPU].count;
  const int netCount = system->nodes[NET].count;
  const int chan = channelId % graph->nIntraChannels;

  // Find this rank's slot in the channel and decode its NIC ('N'-biased).
  int candidate = -1;
  for (int g = 0; g < gpuCount; ++g) {
    const int slot = gpuCount * chan + g;
    if (graph->intra[slot] != rank) continue;
    candidate = graph->intraNets[slot * 2 + type] - 'N';
    break;
  }
  if (candidate >= 0 && candidate < netCount) *dev = candidate;
  return scclSuccess;
}
// Determine whether cudaDev1 and cudaDev2 are connected over XGMI, optionally
// searching through chains of intermediate GPUs.
//
//   maxInter  maximum number of intermediate GPUs to try (0 = direct only)
//   nInter    number of intermediate GPUs already fixed in inter[]
//   inter     chain of intermediate devices built up by the recursion
//             (NOTE(review): some calls below pass fewer arguments, so
//             maxInter/nInter/inter presumably have defaults in the header
//             declaration -- confirm)
//
// The result is reported through *isXGMI; the function always returns
// scclSuccess, including when the chain search fails.
scclResult_t scclTopoGetLinkType(struct scclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int* inter) {
  int interGpus[MAX_XGMI_INTER_GPUS + 1];
  int ngpus = system->nodes[GPU].count;
  *isXGMI = false;
  // check for direct XGMI connection
  for (int i = 0; i < ngpus; i++) {
    if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
      struct scclTopoNode* node = system->nodes[GPU].nodes + i;
      for (int k = 0; k < system->nodes[GPU].count; k++) {
        // Only single-hop paths can be a direct link.
        if (node->paths[GPU][k].count == 1) {
          struct scclTopoLink* link = node->paths[GPU][k].list[0];
          struct scclTopoNode* remNode = link->remNode;
          if (remNode->gpu.dev == cudaDev2) {
            *isXGMI = (link->type == LINK_NVL);
            if (*isXGMI) return scclSuccess;
          }
        }
      }
    }
  }
  // try intermediate GPUs
  if (maxInter) {
    // check if there are intermediate GPUs that are connected to both
    bool res1, res2, res3;
    int j;
    // Verify every consecutive pair of the existing chain is directly linked.
    for (j = 0; j < nInter; j++) {
      scclTopoGetLinkType(system, inter[j], inter[j + 1], &res1, 0);
      if (!res1) break;
    }
    if (j < nInter) return scclSuccess; // chain broken: *isXGMI stays false
    if (nInter > 0 && inter != nullptr) {
      // Does the end of the chain reach the destination directly?
      scclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
      if (res2) {
        *isXGMI = true;
        return scclSuccess;
      }
      memcpy(interGpus + 1, inter + 1, sizeof(int) * nInter);
    }
    interGpus[0] = cudaDev1;
    // add one more intermediate GPU recursively until reaching max depth
    nInter++;
    if (nInter + 2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return scclSuccess;
    for (int i = 0; i < ngpus; i++) {
      int dev = system->nodes[GPU].nodes[i].gpu.dev;
      // skip duplicated GPU
      if (dev == cudaDev2) continue;
      for (j = 0; j < nInter; j++)
        if (dev == interGpus[j]) break;
      if (j < nInter) continue; // already part of the chain
      // check connectivity with intermediate GPUs
      interGpus[nInter] = dev;
      scclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
      if (res3) {
        *isXGMI = true;
        return scclSuccess;
      }
    }
  }
  return scclSuccess;
}
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/trees.cc
deleted
100644 → 0
View file @
d9d23f34
#include "sccl.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
#define RANK_TO_INDEX(r) (rank > root ? rank - 1 : rank)
/* Btree which alternates leaves and nodes.
* Assumes root is 0, which conveniently builds a tree on powers of two,
* (because we have pow2-1 ranks) which lets us manipulate bits.
* Find first non-zero bit, then :
* Find the parent :
* xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
* xx11[0] -> xx10[0] (3,7,11 below)
* Find the children :
* xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
* xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
*
* Illustration :
* 0---------------8
* ______/ \______
* 4 12
* / \ / \
* 2 6 10 \
* / \ / \ / \ \
* 1 3 5 7 9 11 13
*/
// Compute this rank's parent (*u) and children (*d0, *d1) in the binary tree
// rooted at rank 0 described in the comment block above. *parentChildType is
// 0 when this rank is its parent's first child, 1 when it is the second.
// Any relation that does not exist is reported as -1.
scclResult_t scclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
  int up, down0, down1;
  int bit;
  // Find the lowest set bit of rank; for rank 0 this leaves bit at the first
  // power of two >= nranks.
  for (bit = 1; bit < nranks; bit <<= 1) {
    if (bit & rank) break;
  }

  if (rank == 0) {
    *u = -1;
    *d0 = -1;
    // Child rank is > 0 so it has to be our child 1, not 0.
    *d1 = nranks > 1 ? bit >> 1 : -1;
    // NOTE(review): *parentChildType is left unset for the root -- callers
    // must not read it in that case; confirm.
    return scclSuccess;
  }

  up = (rank ^ bit) | (bit << 1);
  // if smaller than the parent, we are his first child, otherwise we're his second
  if (up >= nranks) up = (rank ^ bit);
  *parentChildType = (rank < up) ? 0 : 1;
  *u = up;

  int lowbit = bit >> 1;
  // down0 is always within bounds
  down0 = lowbit == 0 ? -1 : rank - lowbit;
  down1 = lowbit == 0 ? -1 : rank + lowbit;
  // Make sure down1 is within bounds by halving lowbit until rank+lowbit fits
  // (or lowbit reaches 0, meaning no second child). The previous loop
  // recomputed down1 with an unchanged lowbit on its first iteration -- a
  // wasted pass; shifting first yields the same final value.
  while (down1 >= nranks) {
    lowbit >>= 1;
    down1 = lowbit == 0 ? -1 : rank + lowbit;
  }
  *d0 = down0;
  *d1 = down1;
  return scclSuccess;
}
/* Build a double binary tree. Take the previous tree for the first tree.
* For the second tree, we use a mirror tree (if nranks is even)
*
* 0---------------8 3----------------11
* ______/ \ / \______
* 4 \ / 7
* / \ \ / / \
* 2 6 10 1 5 9
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 0 2 4 6 8 10
*
* or shift it by one rank (if nranks is odd).
*
* 0---------------8 1---------------9
* ______/ \______ ______/ \______
* 4 12 5 0
* / \ / / \ /
* 2 6 10 3 7 11
* / \ / \ / \ / \ / \ / \
* 1 3 5 7 9 11 2 4 6 8 10 12
*/
// Build a double binary tree (see the illustration above). Tree 0 is the
// plain btree; tree 1 is the same btree shifted by one rank (odd nranks) or
// mirrored (even nranks), so inner ranks of one tree are leaves of the other.
scclResult_t scclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
  // First tree: a plain btree.
  scclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);

  // Second tree: query the btree at a transformed rank, then map the results
  // back through the inverse transform.
  int up, child0, child1;
  if (nranks % 2 == 1) {
    // Odd rank count: shift by one.
    const int shifted = (rank - 1 + nranks) % nranks;
    scclGetBtree(nranks, shifted, &up, &child0, &child1, parentChildType1);
    auto unshift = [nranks](int r) { return r == -1 ? -1 : (r + 1) % nranks; };
    *s1 = unshift(up);
    *d1_0 = unshift(child0);
    *d1_1 = unshift(child1);
  } else {
    // Even rank count: mirror.
    scclGetBtree(nranks, nranks - 1 - rank, &up, &child0, &child1, parentChildType1);
    auto mirror = [nranks](int r) { return r == -1 ? -1 : nranks - 1 - r; };
    *s1 = mirror(up);
    *d1_0 = mirror(child0);
    *d1_1 = mirror(child1);
  }
  return scclSuccess;
}
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/graph/tuning.cc
deleted
100644 → 0
View file @
d9d23f34
#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"
namespace
sccl
{
namespace
hardware
{
namespace
topology
{
namespace
detect
{
SCCL_PARAM
(
Nthreads
,
"NTHREADS"
,
-
2
);
SCCL_PARAM
(
Ll128Nthreads
,
"LL128_NTHREADS"
,
-
2
);
// Validate a thread-count override taken from the environment.
// A non-positive `env` means "unset" and yields `def`; otherwise the value is
// clamped into [min, max], with non-multiples of WarpSize forced to max.
// A WARN is emitted for every rejected value.
static int getNthreads(const char* name, int env, int min, int max, int def, int WarpSize) {
  if (env <= 0) return def;

  int threads = env;
  if (threads % WarpSize != 0) {
    WARN("Invalid %s %d (must be a multiple of %d)", name, threads, WarpSize);
    threads = max;
  } else if (threads > max) {
    WARN("Invalid %s %d (maximum %d).", name, threads, max);
    threads = max;
  } else if (threads < min) {
    WARN("Invalid %s %d (minimum %d).", name, threads, min);
    threads = min;
  }
  return threads;
}
// Parse a comma-separated selection string into a 0/1 flag per element.
// Matching is case-insensitive against elems[]. A leading '^' inverts the
// selection: all flags default to 1 and listed elements are cleared, instead
// of defaulting to 0 with listed elements set.
scclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
  int def, set;
  if (str[0] == '^') {
    def = 1;
    set = 0;
    str++;
  } else {
    def = 0;
    set = 1;
  }
  for (int i = 0; i < nelems; i++) list[i] = def;
  // Tokenize a private copy: strtok_r mutates its input.
  char* tokStr = strdup(str);
  // OOM guard: the previous code dereferenced a NULL strdup result.
  if (tokStr == NULL) return scclInternalError;
  char* tmpStr;
  char* token = strtok_r(tokStr, ",", &tmpStr);
  while (token) {
    for (int i = 0; i < nelems; i++)
      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
    token = strtok_r(NULL, ",", &tmpStr);
  }
  free(tokStr);
  return scclSuccess;
}
// Latencies in us, Bandwidths in GB/s
// Base launch latency per [algorithm][protocol {LL, LL128, Simple}].
static const float baseLat[SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS] = {
    {12.0, 12.0, 17.0}, {12.0, 12.0, 17.0},  // Tree, Ring
    {12.0, 12.0, 17.0}, {12.0, 12.0, 17.0},  // Collnet Direct, Chain
    {0, 0, 0}, {0, 0, 0}};                   // NVLS, NVLS Tree
// NVLink, PCI, Network
#define SCCL_HW_NVLINK 0
#define SCCL_HW_PCI 1
#define SCCL_HW_NET 2
// Tuning tables for one platform model (see the tuning_model_N instances and
// the rcclTuningModel array below).
struct tuningModel {
  // Hardware latency per [SCCL_HW_NVLINK|SCCL_HW_PCI|SCCL_HW_NET][algorithm][protocol].
  float hwLat[3][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
  // Bandwidth ratio per [0: 2 nodes, 1: more than 2 nodes][algorithm][protocol],
  // per the comments in the initializers below.
  float bwRatio[2][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
  // Per-protocol correction factors over 27 buckets.
  // NOTE(review): the bucket-to-message-size mapping is not visible here -- confirm.
  float treeCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
  float ringCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
};
static
struct
tuningModel
tuning_model_0
{
.
hwLat
=
{
/* NVLINK */
{
/* Tree (LL/LL128/Simple)*/
{
0.8
,
1.4
,
2.5
},
/* Ring (LL/LL128/Simple)*/
{
0.8
,
2.2
,
3.6
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
0.8
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
1.4
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* PCI */
{
/* Tree (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* Ring (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* NET */
{
/* Tree (LL/LL128/Simple)*/
{
11.8
,
18.2
,
20.8
},
/* Ring (LL/LL128/Simple)*/
{
9.5
,
19.8
,
15.1
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
11.8
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
18.2
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
bwRatio
=
{
/* 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.28
,
0.22
,
0.91
},
/* Ring (LL/LL128/Simple)*/
{
0.31
,
0.34
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* more than 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.04
,
0.22
,
0.95
},
/* Ring (LL/LL128/Simple)*/
{
0.04
,
0.34
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
treeCorrectionFactor
=
{
{
0.1
,
0.2
,
0.1
,
0.1
,
0.9
,
0.3
,
0.4
,
0.1
,
0.2
,
0.4
,
0.2
,
0.1
,
0.3
,
0.3
,
0.2
,
0.2
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
},
{
0.1
,
0.3
,
1.0
,
0.1
,
0.5
,
1.0
,
0.9
,
1.0
,
1.0
,
1.0
,
0.3
,
0.1
,
0.4
,
0.5
,
0.5
,
0.4
,
0.4
,
0.3
,
0.3
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
},
// { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, },
{
0.2
,
1.0
,
0.1
,
0.1
,
0.7
,
0.2
,
0.4
,
0.1
,
0.1
,
0.3
,
0.4
,
0.3
,
0.6
,
0.8
,
1.0
,
1.0
,
1.0
,
1.0
,
0.9
,
0.4
,
0.4
,
0.4
,
0.4
,
0.4
,
0.4
,
0.4
,
0.4
,
},
},
.
ringCorrectionFactor
=
{
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.2
,
0.4
,
0.2
,
0.3
,
0.5
,
0.3
,
0.1
,
0.5
,
0.5
,
0.3
,
0.2
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
},
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.3
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.8
,
0.7
,
0.5
,
0.4
,
0.4
,
0.3
,
0.3
,
0.3
,
0.3
,
0.3
,
0.3
,
},
{
1.0
,
0.8
,
0.2
,
1.0
,
1.0
,
0.3
,
1.0
,
0.1
,
0.1
,
0.2
,
0.2
,
0.1
,
0.5
,
1.0
,
0.8
,
0.8
,
1.0
,
0.9
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
},
},
};
static
struct
tuningModel
tuning_model_1
{
.
hwLat
=
{
/* NVLINK */
{
/* Tree (LL/LL128/Simple)*/
{
1.5
,
1.5
,
4.5
},
/* Ring (LL/LL128/Simple)*/
{
1.5
,
1.5
,
4.5
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
4.5
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
4.5
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* PCI */
{
/* Tree (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* Ring (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* NET */
{
/* Tree (LL/LL128/Simple)*/
{
33.0
,
33.0
,
15.8
},
/* Ring (LL/LL128/Simple)*/
{
5.1
,
5.1
,
68.8
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
15.8
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
15.8
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
bwRatio
=
{
/* 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.30
,
1.00
,
0.99
},
/* Ring (LL/LL128/Simple)*/
{
0.31
,
1.00
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* more than 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.15
,
1.00
,
0.42
},
/* Ring (LL/LL128/Simple)*/
{
0.20
,
1.00
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
treeCorrectionFactor
=
{
{
0.5
,
0.4
,
0.7
,
0.6
,
1.0
,
1.0
,
0.5
,
0.4
,
0.1
,
0.5
,
0.4
,
0.6
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.8
,
0.6
,
0.5
,
0.4
,
0.4
,
0.3
,
0.2
,
0.1
,
0.1
,
0.1
,
},
{
0.5
,
0.4
,
0.7
,
0.6
,
1.0
,
1.0
,
0.5
,
0.4
,
0.1
,
0.5
,
0.4
,
0.6
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.8
,
0.6
,
0.5
,
0.4
,
0.4
,
0.3
,
0.2
,
0.1
,
0.1
,
0.1
,
},
// { 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.3, 0.3, },
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.3
,
0.4
,
0.5
,
0.1
,
0.6
,
1.0
,
1.0
,
1.0
,
0.6
,
0.5
,
0.7
,
1.0
,
1.0
,
1.0
,
0.4
,
0.4
,
0.4
,
0.4
,
0.3
,
0.2
,
0.1
,
0.1
,
},
},
.
ringCorrectionFactor
=
{
{
1.0
,
0.5
,
1.0
,
1.0
,
0.6
,
0.7
,
1.0
,
1.0
,
0.2
,
1.0
,
0.9
,
0.7
,
1.0
,
1.0
,
1.0
,
0.9
,
0.9
,
0.8
,
0.8
,
0.7
,
0.6
,
0.5
,
0.5
,
0.3
,
0.2
,
0.1
,
0.1
,
},
{
1.0
,
0.5
,
1.0
,
1.0
,
0.6
,
0.7
,
1.0
,
1.0
,
0.2
,
1.0
,
0.9
,
0.7
,
1.0
,
1.0
,
1.0
,
0.9
,
0.9
,
0.8
,
0.8
,
0.7
,
0.6
,
0.5
,
0.5
,
0.3
,
0.2
,
0.1
,
0.1
,
},
{
0.3
,
1.0
,
0.3
,
0.1
,
0.1
,
0.1
,
0.3
,
0.7
,
1.0
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.2
,
0.3
,
0.5
,
0.9
,
1.0
,
1.0
,
1.0
,
1.0
,
},
},
};
static
struct
tuningModel
tuning_model_2
{
.
hwLat
=
{
/* NVLINK */
{
/* Tree (LL/LL128/Simple)*/
{
1.5
,
1.5
,
4.5
},
/* Ring (LL/LL128/Simple)*/
{
1.5
,
1.5
,
4.5
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
4.5
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
4.5
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* PCI */
{
/* Tree (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* Ring (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* NET */
{
/* Tree (LL/LL128/Simple)*/
{
27.9
,
27.9
,
15.8
},
/* Ring (LL/LL128/Simple)*/
{
12.1
,
12.1
,
68.8
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
15.8
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
15.8
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
bwRatio
=
{
/* 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.30
,
1.00
,
0.99
},
/* Ring (LL/LL128/Simple)*/
{
0.31
,
1.00
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* more than 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.07
,
1.00
,
0.42
},
/* Ring (LL/LL128/Simple)*/
{
0.08
,
1.00
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
treeCorrectionFactor
=
{
{
0.1
,
0.4
,
0.3
,
0.3
,
0.2
,
0.4
,
0.5
,
0.1
,
0.1
,
0.6
,
0.7
,
0.7
,
0.8
,
1.0
,
0.9
,
0.7
,
0.6
,
0.5
,
0.4
,
0.3
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
},
{
0.1
,
0.4
,
0.3
,
0.3
,
0.2
,
0.4
,
0.5
,
0.1
,
0.1
,
0.6
,
0.7
,
0.7
,
0.8
,
1.0
,
0.9
,
0.7
,
0.6
,
0.5
,
0.4
,
0.3
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
},
// { 1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.9, 0.9, 1.0, 1.0, 1.0, },
{
1.0
,
0.1
,
0.1
,
0.1
,
0.1
,
0.2
,
0.3
,
0.5
,
0.1
,
0.6
,
0.9
,
0.8
,
0.7
,
0.9
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.4
,
0.4
,
0.3
,
0.4
,
0.4
,
0.4
,
0.4
,
0.4
,
},
},
.
ringCorrectionFactor
=
{
{
0.1
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.4
,
1.0
,
1.0
,
1.0
,
1.0
,
0.7
,
0.6
,
0.5
,
0.4
,
0.3
,
0.2
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
},
{
0.1
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.4
,
1.0
,
1.0
,
1.0
,
1.0
,
0.7
,
0.6
,
0.5
,
0.4
,
0.3
,
0.2
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
},
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
1.0
,
0.2
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.2
,
0.4
,
0.5
,
0.6
,
0.9
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
},
},
};
static
struct
tuningModel
tuning_model_3
{
.
hwLat
=
{
/* NVLINK */
{
/* Tree (LL/LL128/Simple)*/
{
0.8
,
0.0
,
2.5
},
/* Ring (LL/LL128/Simple)*/
{
0.8
,
0.0
,
3.6
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
0.8
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
0.0
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* PCI */
{
/* Tree (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* Ring (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* NET */
{
/* Tree (LL/LL128/Simple)*/
{
12.5
,
0.0
,
22.4
},
/* Ring (LL/LL128/Simple)*/
{
9.5
,
0.0
,
19.8
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
12.5
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
0.0
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
bwRatio
=
{
/* 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.20
,
0.00
,
1.75
},
/* Ring (LL/LL128/Simple)*/
{
0.20
,
0.00
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* more than 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.20
,
0.00
,
0.96
},
/* Ring (LL/LL128/Simple)*/
{
0.20
,
0.00
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
treeCorrectionFactor
=
{
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
1.0
,
1.0
,
0.2
,
1.0
,
0.9
,
1.0
,
0.6
,
0.4
,
0.6
,
0.4
,
0.3
,
0.3
,
0.3
,
0.3
,
0.3
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
},
{
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
},
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.8, 0.9, 0.7, 0.7, },
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.1
,
0.1
,
0.1
,
0.2
,
1.0
,
0.8
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.4
,
0.4
,
0.3
,
0.3
,
0.3
,
0.4
,
0.3
,
0.3
,
},
},
.
ringCorrectionFactor
=
{
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.3
,
0.1
,
0.2
,
0.1
,
0.4
,
0.4
,
0.2
,
0.2
,
0.3
,
0.7
,
0.5
,
0.4
,
0.3
,
0.3
,
0.3
,
0.3
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
},
{
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
0.0
,
},
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.5
,
1.0
,
0.1
,
0.3
,
0.1
,
0.1
,
0.1
,
0.2
,
0.2
,
0.2
,
0.3
,
0.4
,
0.7
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
},
},
};
static
struct
tuningModel
tuning_model_4
{
.
hwLat
=
{
/* NVLINK */
{
/* Tree (LL/LL128/Simple)*/
{
0.8
,
1.4
,
2.5
},
/* Ring (LL/LL128/Simple)*/
{
0.8
,
2.2
,
3.6
},
/* CollNetDirect (Simple)*/
{
0.8
,
1.4
,
2.5
},
/* CollNetChain (Simple)*/
{
0.8
,
1.4
,
2.5
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* PCI */
{
/* Tree (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* Ring (LL/LL128/Simple)*/
{
2.2
,
2.2
,
5.7
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
5.7
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* NET */
{
/* Tree (LL/LL128/Simple)*/
{
32.2
,
34.4
,
47.6
},
/* Ring (LL/LL128/Simple)*/
{
35.4
,
87.8
,
209.2
},
/* CollNetDirect (Simple)*/
{
0.0
,
0.0
,
47.6
},
/* CollNetChain (Simple)*/
{
0.0
,
0.0
,
47.6
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
bwRatio
=
{
/* 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.16
,
1.09
,
1.61
},
/* Ring (LL/LL128/Simple)*/
{
0.15
,
0.41
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
/* more than 2 nodes */
{
/* Tree (LL/LL128/Simple)*/
{
0.16
,
1.09
,
1.08
},
/* Ring (LL/LL128/Simple)*/
{
0.15
,
0.41
,
1.00
},
/* CollNetDirect (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* CollNetChain (Simple)*/
{
0.00
,
0.00
,
1.00
},
/* NVLS */
{
0
,
0
,
0
},
/* NVLS Tree */
{
0
,
0
,
0
}},
},
.
treeCorrectionFactor
=
{
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
1.0
,
0.1
,
0.1
,
0.2
,
0.4
,
0.6
,
0.5
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
},
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.2
,
0.1
,
0.1
,
0.2
,
1.0
,
0.5
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
0.2
,
},
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.5, 0.6, 0.6, 0.5, 0.6, 0.6, 0.6, 0.7, },
// { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, },
},
.
ringCorrectionFactor
=
{
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.2
,
0.2
,
0.1
,
0.3
,
0.1
,
0.1
,
0.1
,
0.2
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
},
{
0.4
,
0.5
,
0.5
,
0.4
,
0.4
,
0.4
,
0.4
,
0.2
,
0.2
,
0.1
,
0.3
,
1.0
,
1.0
,
0.7
,
0.8
,
0.5
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.9
,
0.8
,
0.5
,
0.4
,
0.3
,
0.3
,
},
{
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
0.1
,
1.0
,
1.0
,
0.8
,
0.5
,
0.1
,
0.7
,
0.2
,
0.4
,
0.4
,
0.6
,
0.7
,
0.9
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
},
},
};
// Collection of the tuning models defined above, indexed 0..4.
static struct tuningModel rcclTuningModel[] = {
    tuning_model_0,
    tuning_model_1,
    tuning_model_2,
    tuning_model_3,
    tuning_model_4,
};
/* Array indexes used below */
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2
// LL128 max BW per channel
static
const
double
llMaxBws
[
3
][
3
]
=
{
/* Volta-N1/Intel-N2/Intel-N4) */
{
39.0
,
39.0
,
20.4
},
/* Ampere-N1/AMD-N2/AMD-N4) */
{
87.7
,
22.5
/*avg of ring & tree*/
,
19.0
},
/* Hopper-N1/AMD-N2/AMD-N4) */
{
87.7
,
22.5
/*avg of ring & tree*/
,
19.0
}};
static
const
double
perChMaxRingLL128Bws
[
3
][
3
]
=
{
/* Volta (N1/N2/N4) */
{
20.0
,
20.0
,
20.0
},
/* Ampere (N1/N2/N4) */
{
20.0
,
20.0
,
20.0
},
/* Hopper (N1/N2/N4) */
{
36.7
,
36.7
,
36.7
},
};
static
const
double
perChMaxTreeLL128Bws
[
3
][
3
]
=
{
/* Volta (N1/N2/N4) */
{
20.0
,
20.0
,
20.0
},
/* Ampere (N1/N2/N4) */
{
20.0
,
20.0
,
20.0
},
/* Hopper (N1/N2/N4) */
{
36.7
,
36.7
,
29.0
},
};
static
const
double
perChMaxTreeBws
[
3
][
3
]
=
{
/* Volta (N1/N2/N4) */
{
26.5
,
18.5
,
10.0
},
/* Ampere (N1/N2/N4) */
{
24.0
,
23.6
,
17.8
},
/* Hopper (N1/N2/N4) */
{
38.7
,
41.4
,
36.0
},
};
// Network post overhead in ns (1000 = 1 us)
// Declares the SCCL "NET_OVERHEAD" environment knob read via scclParamNetOverhead();
// the default of -2 means "unset" and triggers CPU-based auto-detection in getNetOverhead().
SCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
// Returns the per-message network post overhead in microseconds.
// An explicit SCCL_NET_OVERHEAD env value (ns) wins; otherwise the value is
// derived from the host CPU vendor (AMD x86 gets a larger overhead).
static float getNetOverhead(struct scclComm* comm)
{
    // Environment override takes precedence; -2 is the "unset" sentinel.
    if (scclParamNetOverhead() != -2) return scclParamNetOverhead() * .001;

    int cpuArch, cpuVendor, cpuModel;
    SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));

    // Default overhead is 1.0 us (covers Intel x86 and everything else);
    // AMD x86 hosts are modeled with twice that.
    float overheadUs = 1.0;
    if (cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD) overheadUs = 2.0;
    return overheadUs;
}
// Populates comm's tuning tables (maxThreads, bandwidths, latencies,
// threadThresholds) from the topology graphs and the selected rcclTuningModel.
// Called once during communicator init; honors SCCL_PROTO / SCCL_ALGO /
// SCCL_NTHREADS / SCCL_THREAD_THRESHOLDS environment overrides.
scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs)
{
    // Use fewer Simple-protocol threads when the ring's aggregate intra-node
    // bandwidth fits within PCI bandwidth.
    int simpleDefaultThreads = (graphs[SCCL_ALGO_RING]->bwIntra * graphs[SCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : SCCL_SIMPLE_MAX_NTHREADS;
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] =
        getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] =
        getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL] =
        comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_LL] =
            getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL128] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL128] =
        getNthreads("SCCL_LL128_NTHREADS", scclParamLl128Nthreads(), 4 * comm->WarpSize, SCCL_LL128_MAX_NTHREADS, SCCL_LL128_MAX_NTHREADS, comm->WarpSize);

    int nNodes = comm->nNodes;
    int nRanks = comm->nRanks;
    // Nothing to tune for a single-rank communicator.
    if (nRanks <= 1) return scclSuccess;

    // Pick the GPU generation row for the per-channel bandwidth tables.
    int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
    int cpuArch, cpuVendor, cpuModel;
    SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
    // Column: 1 node -> 0, 2 nodes -> 1, more -> 2.
    int index2 = nNodes <= 2 ? nNodes - 1 : 2;
    // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
    int index1 = nNodes == 1 ? compCapIndex : cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
    // NOTE(review): the four values below are computed but not consumed in this
    // function — presumably retained for parity with the upstream tuning code.
    double llMaxBw = llMaxBws[index1][index2];
    double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
    double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
    double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
    // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
    // if (cpuArch == SCCL_TOPO_CPU_ARCH_POWER) hwLat[SCCL_HW_PCI][SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = hwLat[SCCL_HW_PCI][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE];
    float ppn = (float)nRanks / nNodes;  // ranks per node (currently only informational here)
    // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount

    // Classify the link type per algorithm: NVLink vs PCI within a node, NET across nodes.
    int intraHw[SCCL_NUM_ALGORITHMS], hw[SCCL_NUM_ALGORITHMS];
    for (int a = 0; a < SCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? SCCL_HW_NVLINK : SCCL_HW_PCI;
    for (int a = 0; a < SCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : SCCL_HW_NET;

    for (int coll = 0; coll < SCCL_NUM_FUNCTIONS; coll++) {
        // Total algorithm steps and how many of them cross the network.
        int nsteps = coll == scclFuncAllReduce                                  ? 2 * (nRanks - 1)
                     : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nRanks - 1
                                                                                 : nRanks;
        int nInterSteps = coll == scclFuncAllReduce                                  ? (nNodes > 1 ? 2 * nNodes : 0)
                          : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nNodes - 1
                                                                                       : nNodes;
        for (int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
            // These collectives are only modeled for the Ring algorithm.
            if (coll == scclFuncBroadcast && a != SCCL_ALGO_RING) continue;
            if (coll == scclFuncReduce && a != SCCL_ALGO_RING) continue;
            if (coll == scclFuncReduceScatter && a != SCCL_ALGO_RING) continue;
            if (coll == scclFuncAllGather && a != SCCL_ALGO_RING) continue;
            for (int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                // NVLS variants only support the Simple protocol.
                if ((a == SCCL_ALGO_NVLS || a == SCCL_ALGO_NVLS_TREE) && p != SCCL_PROTO_SIMPLE) continue;
                int collnet = (a == SCCL_ALGO_COLLNET_DIRECT || a == SCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
                // Small clusters / collnet are bounded by intra-node bandwidth.
                float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
                float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
                // INFO(SCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", scclAlgoStr[a], scclProtoStr[p], busBw,
                // comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);
                // Various model refinements
                if (nNodes <= 2)
                    busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
                else
                    busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
                if (a == SCCL_ALGO_COLLNET_DIRECT && p == SCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;

                // Convert bus BW to algorithm BW
                float ratio;
                if (a == SCCL_ALGO_RING)
                    ratio = (1.0 * nRanks) / nsteps;
                else if (a == SCCL_ALGO_NVLS)
                    ratio = 5.0 / 6.0;
                else if (a == SCCL_ALGO_NVLS_TREE)
                    ratio = .70 * nNodes / (2 * (nNodes - 1));
                else
                    ratio = .5;
                comm->bandwidths[coll][a][p] = busBw * ratio;

                // Latency model: base latency plus per-step hardware latencies.
                comm->latencies[coll][a][p] = baseLat[a][p];
                float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
                float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter
                                                         : rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
                // if (nNodes > 1 && p == SCCL_PROTO_LL) intraLat *= 1.8;
                if (p == SCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
                if (a == SCCL_ALGO_RING) {
                    float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
                    if ((coll == scclFuncReduce || coll == scclFuncBroadcast)) {
                        if (graphs[a]->sameChannels) {
                            comm->latencies[coll][a][p] += lat;
                        } else {
                            if (p == SCCL_PROTO_SIMPLE)
                                lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][SCCL_ALGO_TREE][p];
                            // Add some chunk latency, waiting for proper chunk modeling
                            comm->latencies[coll][a][p] += nsteps * lat;
                        }
                    } else {
                        // Inter-node rings still have to launch nsteps * net overhead.
                        float netOverhead = 0.0;
                        if (nNodes > 1) {
                            netOverhead = getNetOverhead(comm);
                            if (p == SCCL_PROTO_SIMPLE) netOverhead *= 3;
                        }
                        intraLat = std::max(intraLat, netOverhead);
                        comm->latencies[coll][a][p] += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat;
                    }
                } else if (a == SCCL_ALGO_TREE) {
                    // Up-down tree: intra-node chain plus log2(nNodes) network hops, both ways.
                    comm->latencies[coll][a][p] += 2 * ((nRanks / nNodes - 1) * intraLat + log2i(nNodes) * interLat);
                } else if (a == SCCL_ALGO_COLLNET_DIRECT) {
                    comm->latencies[coll][a][p] +=
                        2 * (std::min(1, (nRanks / nNodes - 1)) * intraLat + (nRanks / nNodes - 1) * 0.5) + interLat;
                    // Add 0.5 arity serialization latency
                } else if (a == SCCL_ALGO_COLLNET_CHAIN) {
                    comm->latencies[coll][a][p] += 2 * (nRanks / nNodes - 1) * intraLat + interLat;
                } else if (a == SCCL_ALGO_NVLS) {
                    if (nNodes > 1) comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
                } else if (a == SCCL_ALGO_NVLS_TREE) {
                    comm->latencies[coll][a][p] +=
                        2 * (nNodes - 1) * rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
                }
            }
        }
    }

    // Protocols/Algorithms enable/disable, and user overrides.
    // All are enabled except ll128 which is enabled by default only in certain cases.
    int protoEnable[SCCL_NUM_PROTOCOLS] = {1, 2, 1};  // 2 == "auto" for LL128
    int algoEnable[SCCL_NUM_ALGORITHMS] = {1, 1, 1, 1, 1, 1};
    const char* protoStr = getenv("SCCL_PROTO");
    if (protoStr) {
        INFO(SCCL_ENV, "SCCL_PROTO set by environment to %s", protoStr);
        SCCLCHECK(parseList(protoStr, scclProtoStr, SCCL_NUM_PROTOCOLS, protoEnable));
    }
    const char* algoStr = getenv("SCCL_ALGO");
    if (algoStr) {
        INFO(SCCL_ENV, "SCCL_ALGO set by environment to %s", algoStr);
        SCCLCHECK(parseList(algoStr, scclAlgoStr, SCCL_NUM_ALGORITHMS, algoEnable));
    }
    if (comm->nNodes == 1) algoEnable[SCCL_ALGO_NVLS_TREE] = 0;
    // Disable CollNet if it is not supported
    if (comm->collNetSupport == 0) {
        algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
        algoEnable[SCCL_ALGO_COLLNET_CHAIN] = 0;
        if (comm->nNodes > 1) algoEnable[SCCL_ALGO_NVLS] = 0;
        // If user has hard set SCCL_ALGO=COLLNET, ignore it
        if (algoEnable[SCCL_ALGO_RING] == 0 && algoEnable[SCCL_ALGO_TREE] == 0 && algoEnable[SCCL_ALGO_NVLS] == 0 &&
            algoEnable[SCCL_ALGO_NVLS_TREE] == 0) {
            algoEnable[SCCL_ALGO_RING] = algoEnable[SCCL_ALGO_TREE] = 1;
            if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring SCCL_ALGO=COLLNET");
        }
    } else {
        // Disable CollNet+Direct if not on an NVSwitch system
        int nvsCount = 0;
        SCCLCHECK(scclTopoGetNvsCount(comm->topo, &nvsCount));
        if (nvsCount == 0) algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
    }

    // Zero out bandwidth entries for disabled protocol/algorithm combinations.
    for (int c = 0; c < SCCL_NUM_FUNCTIONS; c++)
        for (int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
            for (int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                // Disable LL protocol on gfx11xx
                int pEnable = protoEnable[p];
                if (pEnable == 2 && p == SCCL_PROTO_LL128) {
#if defined(ENABLE_LL128)
                    // Enable LL128 by default only on gfx90a with available tuning table
                    pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
                                      (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled)
                                  ? 1
                                  : 0;
#else
                    pEnable = 0;
#endif
                }
                if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
                // Never disable ring for non-allreduce operations. That allows to run real apps with SCCL_ALGO=TREE.
                if (a == SCCL_ALGO_RING && c != scclFuncAllReduce) continue;
                if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
            }

    // Rank 0 logs the tuning tables (two half-width blocks of algorithms).
    if (comm->rank == 0) {
        char line[1024];
        for (int block = 0; block < 2; block++) {
            sprintf(line, " Algorithm |");
            for (int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                sprintf(line + strlen(line), " %14s %14s %14s |", "", scclAlgoStr[a], "");
            }
            INFO(SCCL_TUNING, "%s", line);
            sprintf(line, " Protocol |");
            for (int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                for (int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                    sprintf(line + strlen(line), " %14s |", scclProtoStr[p]);
                }
            }
            INFO(SCCL_TUNING, "%s", line);
            sprintf(line, " Max NThreads |");
            for (int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                for (int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                    sprintf(line + strlen(line), " %14d |", comm->maxThreads[a][p]);
                }
            }
            INFO(SCCL_TUNING, "%s", line);
            for (int c = 0; c < SCCL_NUM_FUNCTIONS; c++) {
                sprintf(line, "%13s |", scclFuncStr[c]);
                for (int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                    int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                    for (int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                        sprintf(line + strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
                    }
                }
                INFO(SCCL_TUNING, "%s", line);
            }
        }
    }

    // Set per-thread amount of work before we increase nThreads and nChannels
    for (int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
        comm->threadThresholds[a][SCCL_PROTO_LL] = SCCL_LL_THREAD_THRESHOLD;
        comm->threadThresholds[a][SCCL_PROTO_LL128] = SCCL_LL128_THREAD_THRESHOLD;
        comm->threadThresholds[a][SCCL_PROTO_SIMPLE] = SCCL_SIMPLE_THREAD_THRESHOLD;
    }
    comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL] *= nRanks;
    comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] = 256;
    comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE] = 256;

    // Override defaults with user env
    char* str = getenv("SCCL_THREAD_THRESHOLDS");
    if (str) {
        INFO(SCCL_ENV, "SCCL_THREAD_THRESHOLDS set by environment to %s", str);
        // Two rows (Tree, Ring) x three protocols; -2 marks "not provided".
        ssize_t t[2][SCCL_NUM_PROTOCOLS] = {{-2, -2, -2}, {-2, -2, -2}};
        sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0] + 1, t[0] + 2, t[1], t[1] + 1, t[1] + 2);
        for (int a = 0; a < 2; a++) {
            for (int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
            }
        }
    }
    INFO(SCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL],
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL128],
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL128],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]);
    return scclSuccess;
}
// Estimates the execution time (microseconds) of a collective described by
// `info` when run with the given algorithm/protocol, writing it to *time.
// A disabled combination (bandwidth 0) yields *time = -1.0.
scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time)
{
    struct scclComm* comm = info->comm;
    float bw = comm->bandwidths[info->coll][algorithm][protocol];
    const float lat = comm->latencies[info->coll][algorithm][protocol];
    if (bw == 0) {
        *time = -1.0;  // sentinel: combination unavailable
        return scclSuccess;
    }

    // Message-size bucket: log2 of the 64-byte chunk count, capped at the
    // last entry (index 26) of the correction-factor tables.
    const int logSize = log2i(info->nBytes >> 6);
    const int sizeIdx = logSize < 27 ? logSize : 26;
    if (algorithm == SCCL_ALGO_TREE) {
        bw *= rcclTuningModel[comm->topo->tuning].treeCorrectionFactor[protocol][sizeIdx];
    } else if (algorithm == SCCL_ALGO_RING && comm->nNodes > 1) {
        bw *= rcclTuningModel[comm->topo->tuning].ringCorrectionFactor[protocol][sizeIdx];
    }

    // Tree pipelining saves latency in aggregation cases
    int latCount;
    if (algorithm == SCCL_ALGO_RING) {
        latCount = numPipeOps;
    } else {
        latCount = DIVUP(numPipeOps, SCCL_MAX_WORK_ELEMENTS);
    }
    *time = lat * latCount + (info->nBytes) / (1000 * bw);
    return scclSuccess;
}
}
// namespace detect
}
// namespace topology
}
// namespace hardware
}
// namespace sccl
src/hardware/hardware_utils.cpp
0 → 100644
View file @
a4ac3320
#include <stdint.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include "base.h"
#include "hardware_utils.h"
namespace
sccl
{
namespace
hardware
{}
// namespace hardware
}
// namespace sccl
src/hardware/hardware_utils.h
View file @
a4ac3320
...
...
@@ -2,7 +2,13 @@
#include <stdint.h>
#include "base.h"
#include "comm.h"
namespace
sccl
{
namespace
hardware
{}
// namespace hardware
namespace
hardware
{
namespace
ops
{
////
}
// namespace ops
}
// namespace hardware
}
// namespace sccl
src/hardware/net/device/net_ib.h
deleted
100644 → 0
View file @
d9d23f34
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "ibvwrap.h"
#include "net_utils.h"
namespace sccl
{
namespace hardware
{
namespace net
{
namespace device
{
//////////////////////////////////
// InfiniBand network backend descriptor; the definition lives in the
// corresponding implementation file.
extern scclNet_t scclNetIb;
}  // namespace device
}  // namespace net
}  // namespace hardware
}  // namespace sccl
src/hardware/net/host/net_socket.h
deleted
100644 → 0
View file @
d9d23f34
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "base.h"
#include "net_utils.h"
namespace sccl
{
namespace hardware
{
namespace net
{
namespace host
{
//////////////////////////////////
// Host TCP/IP socket network backend descriptor; the definition lives in the
// corresponding implementation file.
extern scclNet_t scclNetSocket;
}  // namespace host
}  // namespace net
}  // namespace hardware
}  // namespace sccl
src/hardware/net/ipc_socket/ipc_socket.cpp
0 → 100644
View file @
a4ac3320
#include <pthread.h>
#include <stdlib.h>
#include <poll.h>
#include <limits.h>
#include <fcntl.h>
#include <thread> // 为了使用 std::this_thread::sleep_for
#include "ipc_socket.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
ipc_socket
{
//////////////////////////////////////// scclIpcSocket调用的函数 ////////////////////////////////////////
// Constructs the per-rank IPC socket wrapper: allocates the socket handle,
// spins up a thread pool (one send + one receive worker per local rank), and
// binds the abstract-namespace Unix datagram socket.
// On init failure the object is still constructed; only a warning is logged.
scclIpcSocket::scclIpcSocket(int localRank, int localRanks, uint64_t hash, volatile uint32_t* abortFlag)
    : localRank(localRank), localRanks(localRanks), ipc_hash(hash)
{
    scclResult_t res;
    // Fix: give pthread_pool a defined value even when localRanks <= 0 —
    // the destructor unconditionally inspects/deletes it, and the header may
    // not default-initialize the member.
    pthread_pool = nullptr;
    handle = new struct scclIpcSocketHandle();
    if (localRanks > 0) {
        // Half of the workers are used for sending, the other half for receiving.
        pthread_pool = new ThreadPool(localRanks * 2);
    }
    SCCLCHECKGOTO(scclIpcSocketInit(abortFlag), res, failure);
    return;
failure:
    WARN("scclIpcSocket init failed");
    return;
}
// Destructor: tears down the worker pool, removes the socket name, closes the
// descriptor, and frees the handle allocated in the constructor.
scclIpcSocket::~scclIpcSocket()
{
    // Release the thread pool first so no worker touches the socket afterwards.
    if (pthread_pool != nullptr) {
        delete pthread_pool;
    }
    // Then release the socket handle and its resources.
    // NOTE(review): the socket is bound in the abstract namespace
    // (sun_path[0] == '\0'), so this unlink of a filesystem path is presumably
    // a harmless no-op — confirm no filesystem-bound mode exists.
    if (handle->socketName[0] != '\0') {
        unlink(handle->socketName);
    }
    if (handle->fd >= 0) {
        close(handle->fd);
    }
    delete handle;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Creates and binds this rank's abstract-namespace Unix datagram socket and
// records it in `handle`. When abortFlag is non-null the descriptor is put in
// non-blocking mode so send/receive loops can observe the flag.
//
// @param abortFlag optional abort flag shared with callers; may be null.
// @return scclSuccess, scclSystemError on socket()/bind() failure, or
//         scclInternalError when the generated socket name does not fit.
scclResult_t scclIpcSocket::scclIpcSocketInit(volatile uint32_t* abortFlag)
{
    // Scratch variables.
    int fd = -1;
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    // Initialize the handle members to "empty".
    handle->fd = -1;
    handle->socketName[0] = '\0';
    // Create a Unix-domain datagram socket (connectionless, message-oriented).
    if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
        WARN("UDS: Socket creation error : %d", errno);
        return scclSystemError;
    }
    // Zero the address structure so no stale bytes leak into the name.
    bzero(&my_cliaddr, sizeof(my_cliaddr));
    my_cliaddr.sun_family = AF_UNIX;
    // Build a unique name for this rank's socket from (localRank, ipc_hash).
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, localRank, ipc_hash);
    if (len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot bind provided name to socket. Name too large");
        close(fd);  // Fix: don't leak the freshly created descriptor on this error path.
        return scclInternalError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Creating socket %s", temp_addr);
    // Install the socket path.
    strncpy(my_cliaddr.sun_path, temp_addr, len);
    my_cliaddr.sun_path[0] = '\0';  // Linux abstract-socket trick: no filesystem entry.
    // Bind the socket to the abstract name.
    if (bind(fd, (struct sockaddr*)&my_cliaddr, sizeof(my_cliaddr)) < 0) {
        WARN("UDS: Binding to socket %s failed : %d", temp_addr, errno);
        close(fd);
        return scclSystemError;
    }
    // Publish the bound socket in the handle.
    handle->fd = fd;
    strcpy(handle->socketName, temp_addr);
    // Record the abort flag.
    handle->abortFlag = abortFlag;
    // Mark the socket non-blocking when an abort flag is in use, so blocking
    // syscall loops can poll the flag between retries.
    if (handle->abortFlag) {
        int flags;
        EQCHECK(flags = fcntl(fd, F_GETFL), -1);
        SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
    }
    return scclSuccess;
}
/**
 * Installs (or clears) the abort flag and synchronizes the socket's blocking
 * mode with it.
 *
 * @param flag pointer to the abort flag; non-null switches the socket to
 *             non-blocking mode, null restores blocking mode.
 * @note No-op when the handle is absent.
 */
scclResult_t scclIpcSocket::setAbortFlag(volatile uint32_t* flag)
{
    if (handle) {
        handle->abortFlag = flag;
        int currentFlags;
        EQCHECK(currentFlags = fcntl(handle->fd, F_GETFL), -1);
        // With an abort flag the socket must not block; without one it may.
        const int desiredFlags = flag ? (currentFlags | O_NONBLOCK) : (currentFlags & ~O_NONBLOCK);
        SYSCHECK(fcntl(handle->fd, F_SETFL, desiredFlags), "fcntl");
    }
    return scclSuccess;
}
// Accessor for the currently installed abort flag (null when no handle exists).
volatile uint32_t* scclIpcSocket::getAbortFlag() const
{
    if (handle == nullptr) {
        return nullptr;
    }
    return handle->abortFlag;
}
/**
 * Sets the poll timeout used by the blocking send/receive data paths.
 *
 * @param timeout_ms timeout in milliseconds (passed directly to poll(2);
 *                   a negative value therefore means "wait forever").
 * @return scclSuccess always.
 */
scclResult_t scclIpcSocket::setTimeout(int timeout_ms)
{
    timeoutMs = timeout_ms;
    return scclSuccess;
}
// Returns the worker thread pool created by the constructor
// (may be null when the socket was built with localRanks <= 0).
ThreadPool* scclIpcSocket::getPthreadPool()
{
    return pthread_pool;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Sends a file descriptor over the Unix domain socket.
 *
 * @param sendFd file descriptor to send
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (address too long or abort flag set)
 *         - scclSystemError: system call error
 *
 * @note Uses the Linux abstract-socket trick (sun_path[0] set to '\0').
 *       The descriptor travels as SCM_RIGHTS ancillary data.
 *       The function retries in a loop until success or a hard error.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendFd(const int sendFd, int dst_rank)
{
    // Build the destination socket's abstract name from (dst_rank, ipc_hash).
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    // Reject names that would not fit in sun_path.
    if (len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending fd %d to UDS socket %s/fd:%d", sendFd, temp_addr, handle->fd);
    // Message header and payload descriptor.
    struct msghdr msg;
    struct iovec iov[1];
    // Union guarantees proper alignment for the control (ancillary) buffer.
    union
    {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    struct sockaddr_un cliaddr;
    // Build the destination address.
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0';  // Linux abstract-socket trick
    // Attach the SCM_RIGHTS control message carrying the descriptor.
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    cmptr = CMSG_FIRSTHDR(&msg);
    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
    cmptr->cmsg_level = SOL_SOCKET;
    cmptr->cmsg_type = SCM_RIGHTS;
    // Copy the descriptor into the control payload.
    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
    // Destination address.
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(struct sockaddr_un);
    // One dummy payload byte — ancillary data needs at least one data byte.
    iov[0].iov_base = (void*)"";
    iov[0].iov_len = 1;
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Retry until the message is sent.
    // NOTE(review): on EAGAIN this loop spins without poll() — unlike the data
    // path below; presumably acceptable for the rare fd handoff, confirm.
    while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // Any error other than EAGAIN/EWOULDBLOCK/EINTR is fatal.
        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp_addr, errno);
            return scclSystemError;
        }
        // Bail out when the abort flag was raised.
        if (handle->abortFlag && *handle->abortFlag) return scclInternalError;
    }
    return scclSuccess;
}
/**
 * @brief Receives a file descriptor over the IPC socket.
 *
 * Uses recvmsg to read one datagram and extracts the descriptor from its
 * SCM_RIGHTS ancillary data; retries until a message arrives or an error occurs.
 *
 * @param recvFd out-parameter for the received file descriptor
 * @return scclResult_t operation result:
 *         - scclSuccess: descriptor received
 *         - scclSystemError: system call failure or malformed control message
 *         - scclInternalError: aborted via the abort flag
 *
 * @note EAGAIN, EWOULDBLOCK and EINTR are retried; any other errno is fatal.
 *       The control message must be SOL_SOCKET / SCM_RIGHTS.
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvFd(int* recvFd)
{
    // Message header and payload descriptor (fully zero-initialized).
    struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
    struct iovec iov[1];
    // Union guarantees proper alignment for the control (ancillary) buffer.
    union
    {
        struct cmsghdr cm;
        char control[CMSG_SPACE(sizeof(int))];
    } control_un;
    struct cmsghdr* cmptr;
    char dummy_buffer[1];  // the sender transmits one dummy payload byte
    int ret;
    // Wire up the control buffer and the 1-byte payload buffer.
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);
    iov[0].iov_base = (void*)dummy_buffer;
    iov[0].iov_len = sizeof(dummy_buffer);
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Retry until a datagram is received.
    // NOTE(review): recvmsg() == 0 (zero-length datagram) re-enters the loop
    // and tests a possibly stale errno — confirm peers never send empty datagrams.
    while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        // Any error other than EAGAIN/EWOULDBLOCK/EINTR is fatal.
        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        // Bail out when the abort flag was raised.
        if (handle->abortFlag && *handle->abortFlag) return scclInternalError;
    }
    // Validate the received control message.
    if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
        // Wrong level/type means the datagram did not carry a descriptor.
        if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
            WARN("UDS: Receiving data over socket failed");
            return scclSystemError;
        }
        // Extract the descriptor into the caller's out-parameter.
        memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
    } else {
        // No (or truncated) control message — treat as failure.
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
    return scclSuccess;
}
/**
 * @brief Sends a datagram to the given destination rank over the IPC socket.
 *
 * @param data pointer to the payload to send
 * @param dataLen payload length in bytes
 * @param dst_rank destination rank
 * @return scclResult_t status code:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (socket name too long or abort flag set)
 *         - scclSystemError: system call error (poll timeout/failure or sendmsg failure)
 *
 * @note Uses the Linux abstract-socket namespace; poll() is used to wait for
 *       writability (bounded by timeoutMs) before each send attempt.
 *       EAGAIN/EWOULDBLOCK/EINTR are retried.
 */
scclResult_t scclIpcSocket::scclIpcSocketSendData(const void* data, size_t dataLen, int dst_rank)
{
    // Build the destination socket's abstract name.
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    if (len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Unable to connect to the provided socket name. Name too long");
        return scclInternalError;
    }
    // Assemble the message: destination address plus a single iovec payload.
    struct msghdr msg;
    struct iovec iov[1];
    struct sockaddr_un cliaddr;
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0';  // Linux abstract-socket trick
    iov[0].iov_base = (void*)data;
    iov[0].iov_len = dataLen;
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(cliaddr);
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    msg.msg_control = NULL;  // no ancillary data on the plain data path
    msg.msg_controllen = 0;
    msg.msg_flags = 0;
    // Wait (up to timeoutMs) for the socket to become writable.
    struct pollfd pfd;
    pfd.fd = handle->fd;
    pfd.events = POLLOUT;
    int pollResult = poll(&pfd, 1, timeoutMs);
    if (pollResult <= 0) {
        if (pollResult == 0) {
            WARN("UDS: Timeout occurred while waiting to send data to socket %s", temp_addr);
        } else {
            WARN("UDS: Error occurred while polling socket %s for writability : %d", temp_addr, errno);
        }
        return scclSystemError;
    }
    ssize_t sendResult;
    // Send, re-polling on transient failures until the datagram goes out.
    while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // Any error other than EAGAIN/EWOULDBLOCK/EINTR is fatal.
        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Error occurred while sending data through socket %s : %d", temp_addr, errno);
            return scclSystemError;
        }
        // Bail out when the abort flag was raised.
        if (handle->abortFlag && *handle->abortFlag) return scclInternalError;
        // sendmsg failed with EAGAIN/EWOULDBLOCK: poll again before retrying.
        pollResult = poll(&pfd, 1, timeoutMs);
        if (pollResult <= 0) {
            if (pollResult == 0) {
                WARN("UDS: Timeout occurred while waiting to send data to socket %s", temp_addr);
            } else {
                WARN("UDS: Error occurred while polling socket %s for writability : %d", temp_addr, errno);
            }
            return scclSystemError;
        }
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Successfully sent %zu bytes of data through UDS socket %s", dataLen, temp_addr);
    return scclSuccess;
}
/**
 * @brief Receives a datagram from the IPC socket.
 *
 * Waits (via poll, bounded by timeoutMs) for the socket to become readable,
 * then reads one message with recvmsg. Transient errors (EAGAIN/EWOULDBLOCK/
 * EINTR) are retried after re-polling.
 *
 * @param buffer destination buffer
 * @param bufferLen destination buffer length
 * @param receivedLen out-parameter: number of bytes actually received
 * @return scclResult_t status code:
 *         - scclSuccess: data received
 *         - scclSystemError: system call error or timeout
 *         - scclInternalError: aborted via the abort flag
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvData(void* buffer, size_t bufferLen, size_t* receivedLen)
{
    // Message header with a single iovec pointing at the caller's buffer.
    struct msghdr msg = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = bufferLen;
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    // Wait (up to timeoutMs) for the socket to become readable.
    struct pollfd pfd;
    pfd.fd = handle->fd;
    pfd.events = POLLIN;
    int pollResult = poll(&pfd, 1, timeoutMs);
    if (pollResult <= 0) {
        if (pollResult == 0) {
            WARN("UDS: Timeout occurred while waiting to receive data from socket %s", handle->socketName);
        } else {
            WARN("UDS: Error occurred while polling socket %s for readability : %d", handle->socketName, errno);
        }
        return scclSystemError;
    }
    int ret;
    // Receive, re-polling on transient failures until a datagram arrives.
    while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
        // Any error other than EAGAIN/EWOULDBLOCK/EINTR is fatal.
        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Error occurred while receiving data through socket %s : %d", handle->socketName, errno);
            return scclSystemError;
        }
        // Bail out when the abort flag was raised.
        if (handle->abortFlag && *handle->abortFlag) return scclInternalError;
        // recvmsg failed with EAGAIN/EWOULDBLOCK: poll again before retrying.
        pollResult = poll(&pfd, 1, timeoutMs);
        if (pollResult <= 0) {
            if (pollResult == 0) {
                WARN("UDS: Timeout occurred while waiting to receive data from socket %s", handle->socketName);
            } else {
                WARN("UDS: Error occurred while polling socket %s for readability : %d", handle->socketName, errno);
            }
            return scclSystemError;
        }
    }
    if (ret > 0) {
        *receivedLen = ret;
        // Fix: %zu requires a size_t argument — passing the int `ret` is
        // undefined behavior in variadic calls; use *receivedLen instead.
        INFO(SCCL_LOG_BOOTSTRAP, "UDS: Successfully received %zu bytes of data from socket %s", *receivedLen,
             handle->socketName);
        return scclSuccess;
    } else {
        // Unreachable in practice (the loop only exits with ret > 0), kept as a safety net.
        WARN("UDS: Error occurred while receiving data through socket %s", handle->socketName);
        return scclSystemError;
    }
}
/**
 * @brief Sends a datagram to the given rank, retrying without an overall deadline.
 *
 * @param data pointer to the payload to send
 * @param dataLen payload length in bytes
 * @param dst_rank destination rank
 * @return scclResult_t operation result:
 *         - scclSuccess: sent successfully
 *         - scclInternalError: internal error (address too long or abort flag set)
 *         - scclSystemError: system call error
 *
 * @note Uses the Linux abstract-socket namespace.
 *       Retries until the send succeeds or a hard error occurs; after a
 *       transient failure it blocks in poll() with an infinite timeout until
 *       the socket becomes writable ("non-blocking" refers to the socket's
 *       O_NONBLOCK mode, not to this call returning early).
 */
scclResult_t scclIpcSocket::scclIpcSocketSendDataNonBlocking(const void* data, size_t dataLen, int dst_rank)
{
    // Build the destination socket's abstract name.
    char temp_addr[SCCL_IPC_SOCKNAME_LEN];
    int len = snprintf(temp_addr, SCCL_IPC_SOCKNAME_LEN, SCCL_IPC_SOCKNAME_STR, dst_rank, ipc_hash);
    // Reject names that would not fit in sun_path.
    if (len > (sizeof(my_cliaddr.sun_path) - 1)) {
        WARN("UDS: Cannot connect to provided name for socket. Name too large");
        return scclInternalError;
    }
    INFO(SCCL_LOG_BOOTSTRAP, "UDS: Sending %zu bytes of data to UDS socket %s", dataLen, temp_addr);
    // Assemble the message: destination address plus a single iovec payload.
    struct msghdr msg;
    struct iovec iov[1];
    struct sockaddr_un cliaddr;
    bzero(&cliaddr, sizeof(cliaddr));
    cliaddr.sun_family = AF_UNIX;
    strncpy(cliaddr.sun_path, temp_addr, len);
    cliaddr.sun_path[0] = '\0';  // Linux abstract-socket trick
    iov[0].iov_base = (void*)data;
    iov[0].iov_len = dataLen;
    msg.msg_name = (void*)&cliaddr;
    msg.msg_namelen = sizeof(cliaddr);
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    msg.msg_control = NULL;  // no ancillary data on the plain data path
    msg.msg_controllen = 0;
    msg.msg_flags = 0;
    ssize_t sendResult;
    // Try to send; on transient failure wait until the socket is writable, then retry.
    while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
        // Any error other than EAGAIN/EWOULDBLOCK/EINTR is fatal.
        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
            WARN("UDS: Sending data over socket %s failed : %d", temp_addr, errno);
            return scclSystemError;
        }
        // Bail out when the abort flag was raised.
        if (handle->abortFlag && *handle->abortFlag) return scclInternalError;
        // Wait for writability with no timeout.
        struct pollfd pfd;
        pfd.fd = handle->fd;
        pfd.events = POLLOUT;
        int pollResult = poll(&pfd, 1, -1);  // wait indefinitely
        if (pollResult <= 0) {
            WARN("UDS: Polling for socket %s to become writable failed : %d", temp_addr, errno);
            return scclSystemError;
        }
    }
    return scclSuccess;
}
/**
 * @brief Non-blocking receive of a datagram from the IPC socket.
 *
 * Attempts recvmsg() and, when no data is available, polls until the socket
 * becomes readable, then retries.
 *
 * @param buffer      destination buffer for received bytes
 * @param bufferLen   capacity of the destination buffer
 * @param receivedLen out-parameter set to the number of bytes actually received
 * @return scclResult_t
 *         - scclSuccess:       data received
 *         - scclSystemError:   recvmsg/poll system call failure
 *         - scclInternalError: abort flag was raised while waiting
 */
scclResult_t scclIpcSocket::scclIpcSocketRecvDataNonBlocking(void *buffer, size_t bufferLen, size_t *receivedLen)
{
    // Scatter/gather setup: a single iovec covering the caller's buffer.
    struct msghdr msg = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = bufferLen;
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;
    int ret; // NOTE(review): recvmsg returns ssize_t; stored in int here — confirm payloads never exceed INT_MAX
    // Retry until a datagram arrives; a zero-length receive is also retried.
    while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0)
    {
        // Any errno other than "try again later" / interruption is fatal.
        // NOTE(review): when ret == 0, errno is not set by recvmsg, so a stale
        // errno value decides this branch — confirm zero-length datagrams cannot occur.
        if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR)
        {
            WARN("UDS: Receiving data over socket failed : %d", errno);
            return scclSystemError;
        }
        // Honor an externally-raised abort request between retries.
        if (handle->abortFlag && *handle->abortFlag)
            return scclInternalError;
        // Block (indefinitely) until the socket becomes readable.
        struct pollfd pfd;
        pfd.fd = handle->fd;
        pfd.events = POLLIN;
        int pollResult = poll(&pfd, 1, -1);
        if (pollResult <= 0)
        {
            WARN("UDS: Polling for socket %s to become readable failed : %d", handle->socketName, errno);
            return scclSystemError;
        }
    }
    // On success record how many bytes arrived.
    // NOTE(review): the loop above only exits with ret > 0, so the else branch
    // below is unreachable dead code — kept for byte-compatibility.
    if (ret > 0)
    {
        *receivedLen = ret;
        INFO(SCCL_LOG_BOOTSTRAP, "UDS: Received %zu bytes of data from socket %s", *receivedLen, handle->socketName);
        return scclSuccess;
    }
    else
    {
        WARN("UDS: Receiving data over socket %s failed", handle->socketName);
        return scclSystemError;
    }
}
/**
 * @brief Allgather over IPC sockets, parallelized with the thread pool.
 *
 * Each rank sends its payload to every other local rank and receives every
 * other rank's payload, placing each one at the sender's slot in recvData.
 *
 * @param sendData source buffer (dataLen bytes)
 * @param recvData destination buffer; must hold localRanks * dataLen bytes
 * @param dataLen  per-rank payload size in bytes
 * @param wait     when true, block until all send/recv tasks complete
 * @return scclResult_t scclSuccess on success, scclInternalError if the
 *         socket/thread pool was not initialized
 *
 * @note 1. The local rank's own slot is filled by a direct memcpy.
 *       2. Wire format per message: [sender rank (int)][payload] — receives
 *          may arrive in any order, the embedded rank selects the slot.
 */
scclResult_t scclIpcSocket::scclIpcSocketAllgather(const void *sendData, void *recvData, size_t dataLen, bool wait)
{
    if (pthread_pool == nullptr || localRanks <= 0)
    {
        WARN("scclIpcSocket init error!");
        return scclInternalError;
    }
    std::vector<std::future<void>> futures;
    // Fan out one send task and one receive task per peer via the thread pool.
    for (int i = 0; i < localRanks; ++i)
    {
        if (i != localRank)
        {
            auto sendTask = [this, sendData, dataLen, i]() {
                // [rank][payload] packed into one RAII-managed buffer
                // (std::vector instead of raw new[]/delete[] — no leak on early exit).
                size_t packageSize = sizeof(int) + dataLen;
                std::vector<char> buffer(packageSize);
                *reinterpret_cast<int *>(buffer.data()) = localRank;
                memcpy(buffer.data() + sizeof(int), sendData, dataLen);
                // Send rank tag and payload in a single datagram.
                scclIpcSocketSendData(buffer.data(), packageSize, i);
            };
            futures.push_back(pthread_pool->enqueue(sendTask));

            auto recvTask = [this, recvData, dataLen, i]() {
                // Receive one [rank][payload] message; the embedded rank, not
                // the loop index, decides which recvData slot is written.
                size_t packageSize = sizeof(int) + dataLen;
                std::vector<char> buffer(packageSize);
                size_t receivedLen;
                scclIpcSocketRecvData(buffer.data(), packageSize, &receivedLen);
                int senderRank = *reinterpret_cast<int *>(buffer.data());
                memcpy(static_cast<char *>(recvData) + senderRank * dataLen,
                       buffer.data() + sizeof(int), dataLen);
            };
            futures.push_back(pthread_pool->enqueue(recvTask));
        }
        else
        {
            // Own contribution: copy straight into the local slot.
            memcpy(static_cast<char *>(recvData) + localRank * dataLen, sendData, dataLen);
        }
    }
    if (wait)
    {
        // Block until every queued send/recv task has finished.
        for (auto &fut : futures)
        {
            fut.get();
        }
    }
    return scclSuccess;
}
/**
 * @brief Allgather over IPC sockets where each peer's data lands at a fixed offset.
 *
 * Copies the local contribution into its own slot, then uses the thread pool
 * to exchange raw payloads with every other local rank: peer i's payload is
 * received directly into slot i of recvData (no rank tag on the wire).
 *
 * @param sendData source buffer (dataLen bytes)
 * @param recvData destination buffer; must hold localRanks * dataLen bytes
 * @param dataLen  per-rank payload size in bytes
 * @param wait     when true, block until every exchange task has completed
 * @return scclResult_t scclSuccess on success, scclInternalError if the
 *         socket/thread pool was not initialized
 */
scclResult_t scclIpcSocket::scclIpcSocketAllgatherSync(const void *sendData, void *recvData, size_t dataLen, bool wait)
{
    if (pthread_pool == nullptr || localRanks <= 0)
    {
        WARN("scclIpcSocket init error!");
        return scclInternalError;
    }

    // Our own slot needs no socket round-trip.
    memcpy(static_cast<char *>(recvData) + localRank * dataLen, sendData, dataLen);

    std::vector<std::future<void>> pending;
    for (int peer = 0; peer < localRanks; ++peer)
    {
        if (peer == localRank)
            continue;

        // One send and one receive task per peer, run on the pool.
        pending.push_back(pthread_pool->enqueue([this, sendData, dataLen, peer]() {
            scclIpcSocketSendData(sendData, dataLen, peer);
        }));
        pending.push_back(pthread_pool->enqueue([this, recvData, dataLen, peer]() {
            size_t got;
            scclIpcSocketRecvData(reinterpret_cast<char *>(recvData) + peer * dataLen, dataLen, &got);
        }));
    }

    if (wait)
    {
        // Drain every queued task before returning.
        for (auto &task : pending)
        {
            task.get();
        }
    }
    return scclSuccess;
}
/**
 * @brief Broadcast over IPC sockets: root sends, everyone else receives.
 *
 * The root rank pushes its buffer to every other local rank via the thread
 * pool; non-root ranks enqueue a single receive from the root. When wait is
 * set, all task results are collected and checked.
 *
 * @param sendData source buffer (used only on the root rank)
 * @param recvData destination buffer (used only on non-root ranks)
 * @param dataLen  payload size in bytes
 * @param root     rank that originates the data
 * @param wait     when true, block until all tasks finish and verify results
 * @return scclResult_t
 *         - scclSuccess:         broadcast completed
 *         - scclInternalError:   uninitialized socket/pool, or a task failed
 *         - scclInvalidArgument: root is outside [0, localRanks)
 */
scclResult_t scclIpcSocket::scclIpcSocketBroadcast(const void *sendData, void *recvData, size_t dataLen, int root, bool wait)
{
    if (pthread_pool == nullptr || localRanks <= 0)
    {
        WARN("scclIpcSocket init error!");
        return scclInternalError;
    }
    if (root < 0 || root >= localRanks)
    {
        WARN("scclIpcSocketBroadcast: Invalid root rank %d", root);
        return scclInvalidArgument;
    }

    // Futures carry each task's scclResult_t so failures can be surfaced.
    std::vector<std::future<scclResult_t>> pending;

    if (localRank == root)
    {
        // Root: one send task per non-root peer.
        for (int peer = 0; peer < localRanks; ++peer)
        {
            if (peer == root)
                continue;
            pending.push_back(pthread_pool->enqueue(
                [this, sendData, dataLen, peer]() -> scclResult_t {
                    return scclIpcSocketSendData(sendData, dataLen, peer);
                }));
        }
    }
    else
    {
        // Non-root: a single receive from the root.
        pending.push_back(pthread_pool->enqueue(
            [this, recvData, dataLen]() -> scclResult_t {
                size_t got;
                return scclIpcSocketRecvData(recvData, dataLen, &got);
            }));
    }

    if (wait)
    {
        // Collect every result; any failure aborts the whole broadcast.
        for (auto &task : pending)
        {
            scclResult_t rc = task.get();
            if (rc != scclSuccess)
            {
                WARN("scclIpcSocketBroadcast: Task failed with error %d", rc);
                return scclInternalError;
            }
        }
    }
    return scclSuccess;
}
}
// namespace ipc_socket
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/ipc_socket/ipc_socket.h
0 → 100644
View file @
a4ac3320
#pragma once
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/un.h>
#include "base.h"
#include "net_utils.h"
#include "socket.h"
#include "thread_pool.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
namespace
ipc_socket
{
#define SCCL_IPC_SOCKNAME_LEN 64
#define SCCL_IPC_SOCKNAME_STR "/tmp/sccl-socket-%d-%lx"
// Handle for one IPC (Unix domain) socket: descriptor, bound name, and abort flag.
struct scclIpcSocketHandle
{
    int fd;                                 // socket file descriptor
    char socketName[SCCL_IPC_SOCKNAME_LEN]; // name this socket was created with
    volatile uint32_t *abortFlag;           // when non-null and set, blocking operations bail out
};
// 封装发送数据,包括rank信息和实际数据的引用
struct
DataPackage
{
int
rank
;
char
data
[];
// 灵活数组成员,用于存储实际数据
};
//////////////////////////////////////////////////////////////////////////////////////////////////////
class scclIpcSocket
{
public:
    // Constructor / destructor. `hash` makes the socket name unique per job.
    scclIpcSocket(int localRank, int localRanks, uint64_t hash, volatile uint32_t *abortFlag = nullptr);
    virtual ~scclIpcSocket();
    // Create and bind the underlying IPC socket.
    scclResult_t scclIpcSocketInit(volatile uint32_t *abortFlag);
    // Set the abort flag used to interrupt blocking operations.
    scclResult_t setAbortFlag(volatile uint32_t *flag);
    // Get the current abort flag.
    volatile uint32_t *getAbortFlag() const;
    // Set the socket timeout in milliseconds.
    scclResult_t setTimeout(int timeout_ms);
    // Access the internal thread pool.
    ThreadPool *getPthreadPool();
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    /*
       In parallel jobs, different processes may need access to the same file or
       network resource. Passing a file descriptor between processes avoids each
       process re-opening the same file or re-establishing the same connection,
       saving resources and time.
    */
    // Send a file descriptor to the given destination rank.
    scclResult_t scclIpcSocketSendFd(const int sendFd, int dst_rank);
    // Receive a file descriptor.
    scclResult_t scclIpcSocketRecvFd(int *fd);
    // Send data to a destination rank over the Unix domain socket (blocking).
    scclResult_t scclIpcSocketSendData(const void *data, size_t dataLen, int dst_rank);
    // Receive data over the Unix domain socket (blocking).
    scclResult_t scclIpcSocketRecvData(void *buffer, size_t bufferLen, size_t *receivedLen);
    // Send data to a destination rank over the Unix domain socket (non-blocking).
    scclResult_t scclIpcSocketSendDataNonBlocking(const void *data, size_t dataLen, int dst_rank);
    // Receive data over the Unix domain socket (non-blocking).
    scclResult_t scclIpcSocketRecvDataNonBlocking(void *buffer, size_t bufferLen, size_t *receivedLen);
    // Intra-node allgather. Original note: "guarantees receive order".
    // NOTE(review): the .cpp implementation tags each message with the sender's
    // rank, which suggests arrival order is NOT relied upon — confirm which of
    // the two allgather comments belongs to which method.
    scclResult_t scclIpcSocketAllgather(const void *sendData, void *recvData, size_t dataLen, bool wait = true);
    // Intra-node allgather. Original note: "for performance, receive order is not
    // guaranteed, so the sender's process ID must be embedded in the message".
    // NOTE(review): the .cpp implementation receives directly into per-peer slots
    // without a rank tag — see the note on scclIpcSocketAllgather above.
    scclResult_t scclIpcSocketAllgatherSync(const void *sendData, void *recvData, size_t dataLen, bool wait = true);
    // Intra-node broadcast from `root`.
    scclResult_t scclIpcSocketBroadcast(const void *sendData, void *recvData, size_t dataLen, int root, bool wait = true);

private:
    // Handle for the IPC socket connection owned by this object.
    struct scclIpcSocketHandle *handle = nullptr;
    // Client address storage (sockaddr_un) for this endpoint.
    struct sockaddr_un my_cliaddr;
    // Hash used to generate a unique socket name.
    const uint64_t ipc_hash;
    // Abort flag supplied at construction (for non-blocking socket loops).
    const volatile uint32_t *my_abortFlag;
    // Local rank identity within the node.
    int localRank = -1;
    int localRanks = 0;
    // Thread pool used by the collective operations.
    ThreadPool *pthread_pool = nullptr;
    // Timeout in milliseconds (default 10000).
    int timeoutMs = 10000;
};
}
// namespace ipc_socket
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/net.cpp
0 → 100644
View file @
a4ac3320
#include <stdint.h>
#include "net.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
/**
 * @brief Print a socket address, framed by separator lines for readability.
 *
 * @param sock_addr pointer to the socket address union to format
 * @param prefix    prefix string printed before the address
 * @return scclResult_t always scclSuccess
 */
scclResult_t printSocketAddr(union net_socket::scclSocketAddress *sock_addr, const char *prefix)
{
    char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
    // Render the address into a human-readable string.
    net::net_socket::scclSocketToString(sock_addr, line);
    printf("\n==========================================\n%s addr: %s"
           "\n==========================================\n",
           prefix, line);
    return scclSuccess;
}
/**
 * @brief Print a socket's full debug state, framed by separator lines.
 *
 * Dumps the descriptor, accept descriptor, retry counters, formatted address,
 * abort/async flags, state, address length, magic, and type.
 *
 * @param sock   pointer to the scclSocket to describe
 * @param prefix prefix string printed before the details
 * @return scclResult_t always scclSuccess
 */
scclResult_t printSocketInfo(struct net_socket::scclSocket *sock, const char *prefix)
{
    char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2];
    // Render the socket's address into a human-readable string.
    net::net_socket::scclSocketToString(&sock->addr, line);
    printf("\n==========================================\n%s: fd: %d, acceptFd: %d, timedOutRetries: %d, refusedRetries: %d,\naddr: %s, abortFlag=%u, "
           "asyncFlag=%d, state=%d, salen=%d, magic=%lu, type=%d"
           "\n==========================================\n",
           prefix, sock->fd, sock->acceptFd, sock->timedOutRetries, sock->refusedRetries, line,
           // abortFlag may be null; print 0 in that case.
           sock->abortFlag != NULL ? *sock->abortFlag : 0,
           sock->asyncFlag, int(sock->state), sock->salen, sock->magic, int(sock->type));
    return scclSuccess;
}
////////////////////////////////////////////////////////////////////////////////////////
// Lifecycle state of each pluggable network backend.
typedef enum scclNetState
{
    scclNetStateInit = 0,    // not yet probed
    scclNetStateEnabled = 1, // init succeeded and at least one device was found
    scclNetStateDisabled = 2 // init failed or no devices available
} scclNetState_t;
// Per-backend state table, indexed like scclNets[]; every entry starts in Init.
scclNetState_t scclNetStates[scclNetTypeNum] = {scclNetStateInit, scclNetStateInit, scclNetStateInit};
/**
 * @brief Return the (lazily probed) state of network backend i.
 *
 * On first query the backend is initialized and its device count checked;
 * the resulting Enabled/Disabled state is cached for later calls.
 *
 * @param i     backend index into scclNets / scclNetStates
 * @param state out-parameter receiving the backend state
 * @return scclResult_t always scclSuccess
 *
 * @note Thread-safe: the shared state table is guarded by netLock.
 */
scclResult_t netGetState(int i, scclNetState_t *state)
{
    pthread_mutex_lock(&netLock);
    if (scclNetStates[i] == scclNetStateInit)
    {
        // First query for this backend: probe it once and cache the outcome.
        scclNetState_t probed = scclNetStateDisabled;
        int ndev = 0;
        // Short-circuit keeps the original order: devices() only runs if init() succeeded.
        if (scclNets[i]->init() == scclSuccess && scclNets[i]->devices(&ndev) == scclSuccess && ndev > 0)
        {
            probed = scclNetStateEnabled;
        }
        scclNetStates[i] = probed;
    }
    *state = scclNetStates[i];
    pthread_mutex_unlock(&netLock);
    return scclSuccess;
}
/**
 * @brief Select and return an enabled network backend by name.
 *
 * Scans the backend table for an entry that is enabled and, when netName is
 * given, whose name matches it (case-insensitively). The first match is
 * assigned to scclNet.
 *
 * @param netName backend name to match, or NULL to accept any enabled backend
 * @param scclNet out-reference receiving the selected backend
 * @return scclResult_t
 *         - scclSuccess:      a matching backend was found
 *         - scclInvalidUsage: no matching backend exists
 *
 * @note CollNet selection was present here historically and is currently disabled.
 */
scclResult_t scclNetInit(const char *netName, scclNet_t *&scclNet)
{
    // Initialize main communication network
    bool found = false;
    for (int i = 0; i < scclNetTypeNum; ++i)
    {
        if (scclNets[i] == nullptr)
            continue;
        // Lazily probe the backend's state (init + device count).
        enum scclNetState state;
        SCCLCHECK(netGetState(i, &state));
        if (state != scclNetStateEnabled)
            continue;
        // NULL netName matches any backend; otherwise compare case-insensitively.
        bool nameMatches = (netName == nullptr) || (strcasecmp(netName, scclNets[i]->name) == 0);
        if (!nameMatches)
            continue;
        scclNet = scclNets[i];
        found = true;
        break;
    }
    if (!found)
    {
        WARN("Error: network %s not found.", netName ? netName : "");
        return scclInvalidUsage;
    }
    return scclSuccess;
}
}
// namespace net
}
// namespace hardware
}
// namespace sccl
src/hardware/net/net.h
View file @
a4ac3320
#pragma once
#include <stdint.h>
#include <memory>
#include "base.h"
#include "net_utils.h"
#include "device/net_ib.h"
#include "host/net_socket.h"
#include "net_socket/socket.h"
#include "net_ib/net_ib.h"
#include "net_socket/net_socket.h"
namespace
sccl
{
namespace
hardware
{
namespace
net
{
//////////////////////////////////
typedef
enum
net_type
:
uint8_t
{
NET_IB
=
0
,
NET_SOCKET
=
1
}
net_type_t
;
//////////////////////////////////
inline
scclResult_t
initNetSpecial
(
scclNet_t
*
net
)
{
int
ndev
;
// 初始化网络,如果初始化失败则返回内部错误
if
(
net
->
init
()
!=
scclSuccess
)
return
scclInternalError
;
// 获取设备数量,如果获取失败则返回内部错误
if
(
net
->
devices
(
&
ndev
)
!=
scclSuccess
)
return
scclInternalError
;
// 如果设备数量小于或等于0,则返回系统错误
if
(
ndev
<=
0
)
return
scclSystemError
;
return
scclSuccess
;
}
// 定义一个静态的pthread互斥锁,用于线程同步
static
pthread_mutex_t
netLock
=
PTHREAD_MUTEX_INITIALIZER
;
/**
* 初始化网络设备
*
* @param net 指向scclNet_t结构体的指针,表示要初始化的网络设备
* @return scclResult_t 返回操作结果:
* - scclSuccess: 初始化成功
* - scclInternalError: 网络初始化或获取设备数量失败
* - scclSystemError: 系统中无可用设备
*/
inline
scclNet_t
*
initNet
(
net_type_t
t
)
{
scclNet_t
*
scclNet
=
NULL
;
//////////////////////////////////// 功能函数 ////////////////////////////////////
// 打印Socket信息
scclResult_t
printSocketAddr
(
union
net_socket
::
scclSocketAddress
*
sock_addr
,
const
char
*
prefix
);
scclResult_t
printSocketInfo
(
struct
net_socket
::
scclSocket
*
sock
,
const
char
*
prefix
);
if
(
t
==
NET_IB
)
{
if
(
initNetSpecial
(
&
(
device
::
scclNetIb
))
==
scclSuccess
)
{
scclNet
=
&
(
device
::
scclNetIb
);
}
}
else
if
(
t
==
NET_SOCKET
)
{
if
(
initNetSpecial
(
&
(
host
::
scclNetSocket
))
==
scclSuccess
)
{
scclNet
=
&
(
host
::
scclNetSocket
);
}
}
else
{
WARN
(
"Unsupported network type."
);
}
//////////////////////////////////// 网络接口 ////////////////////////////////////
// 定义网络类型数量的常量
constexpr
int
scclNetTypeNum
=
3
;
return
scclNet
;
}
// 定义一个内联数组,存储不同类型的sccl网络指针
inline
scclNetBase
*
scclNets
[]
=
{
nullptr
,
new
net_ib
::
scclNetIb
(),
new
net_socket
::
scclNetSocket
()};
//
//////////////////////////////////
inline
scclNe
t_t
*
scclNet
s
[
3
]
=
{
nullptr
,
&
device
::
scclNetIb
,
&
host
::
scclNetSocket
}
;
//
定义初始化sccl网络的函数
scclResul
t_t
scclNet
Init
(
const
char
*
netName
,
scclNet_t
*&
scclNet
)
;
}
// namespace net
}
// namespace hardware
...
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment