#include <iostream>
#include "base.h"
#include "graph.h"

namespace sccl {
namespace hardware {
namespace topology {
namespace graph {

/**
 * @brief Construct a Graph on top of an existing bootstrap object.
 *
 * Caches the rank layout exposed by @p bootstrap (rank, nRanks, localRank, nLocalRanks,
 * interRank, nInterRanks) and sizes the buffer that receives the topology nodes of all ranks.
 *
 * @param bootstrap Bootstrap instance; it is dereferenced here and the pointer is stored in
 *                  sccl_bootstrap for the later all-gather calls, so it must not be null.
 */
Graph::Graph(const Bootstrap* bootstrap)
    : sccl_bootstrap(bootstrap),
      // Read through the constructor argument rather than the sccl_bootstrap member so the
      // initialisers do not depend on the order in which the members are declared in the class.
      rank(bootstrap->rank),
      nRanks(bootstrap->nRanks),
      localRank(bootstrap->localRank),
      nLocalRanks(bootstrap->nLocalRanks),
      interRank(bootstrap->interRank),
      nInterRanks(bootstrap->nInterRanks) {

    // Bytes of node storage contributed by one rank; per the original author's note this must stay
    // consistent with the sizing used inside scclNodeInfo_t.
    // NOTE(review): assumes nLocalRanks > 0 (division) — confirm the bootstrap guarantees it.
    this->node_info_total_bytes = sizeof(scclTopoNode_t) * topoNodeMaxLocalNodes / nLocalRanks;

    // establishGraph() all-gathers directly into node_info_vec.data(), so the elements have to exist:
    // resize() (value-initialised) instead of reserve(), which only grows capacity and leaves size() == 0.
    node_info_vec.resize(nRanks * node_info_total_bytes);
}

// Destructor: performs no custom teardown, so the compiler-generated behaviour is used.
Graph::~Graph() = default;

/**
 * @brief Build the local topology nodes, exchange them with every rank and link the gathered nodes.
 *
 * Steps performed:
 *  1. generate topo nodes from the PCI path of each GPU index r = localRank, localRank + nLocalRanks, ...
 *     below bootstrap_comm->deviceCnt;
 *  2. do the same for each NIC index below the net.count of this rank's rank info;
 *  3. all-gather node_info_total_bytes bytes of local nodes into node_info_vec;
 *  4. call bootstrapNodesLink() on the gathered buffer.
 *
 * @param bootstrap_comm Communicator data; deviceCnt, rank_phys_set->rank_info_vec and scclNet are read.
 *                       It is dereferenced without a check, so it must not be null.
 * @return scclSuccess when every step succeeds; failures of the all-gather or of bootstrapNodesLink()
 *         go through SCCLCHECK (presumably returning their error code — confirm against the macro).
 */
scclResult_t Graph::establishGraph(const BootstrapComm_t* bootstrap_comm) {
    //////// Initialise the topo nodes ////////
    scclNodeInfo_t local_topo_nodes(nLocalRanks);
    // Use a ByteSpanVector over the node storage instead of a std::vector; its capacity is totalByteSize
    ByteSpanVector<scclTopoNode_t> nodes_span((void*)local_topo_nodes.nodes, local_topo_nodes.totalByteSize);

    // Walk the PCI path of every GPU assigned to this local rank and add the topo nodes
    for(int r = localRank; r < bootstrap_comm->deviceCnt; r += nLocalRanks) {
        auto gpu_path = physical_links::getGpuPciPath(r);
        physical_links::generate_topo_nodes(gpu_path, this->interRank, r, nodes_span);
        // NOTE(review): scalar delete assumes getGpuPciPath() allocates with plain `new` — confirm
        delete(gpu_path);
    }
    // Walk the PCI path of every NIC assigned to this local rank and add the topo nodes
    bootstrap::scclRankInfo_t local_rank_info = bootstrap_comm->rank_phys_set->rank_info_vec[this->rank];
    for(int r = localRank; r < local_rank_info.net.count; r += nLocalRanks) {
        auto net_path = physical_links::getNetPciPath(bootstrap_comm->scclNet, r);
        physical_links::generate_topo_nodes(net_path, this->interRank, r, nodes_span);
        // NOTE(review): same allocation assumption as for gpu_path — confirm
        delete(net_path);
    }

#if 0
    if(interRank == 0) {
        char line[30];
        sprintf(line, "print rank=%d: ", rank);
        bootstrap::printRankInfo(std::string(line), &local_rank_info);
    }
#endif
#if 0
    if(interRank == 0) {
        ByteSpanArray<scclTopoNode_t> nodes_span_array(nodes_span.data(), local_topo_nodes.totalByteSize);
        printf("print rank=%d, nodes_span size=%zu\n", rank, nodes_span.size());
        char line[30];
        sprintf(line, "print rank=%d: ", rank);
        for(int i = 0; i < nodes_span.size(); i++) {
            printf("============================**============================\n");
            physical_links::printTopoNode(nodes_span_array, i, line);
            printf("============================**============================\n");
        }
    }
#endif
#if 0
            // Try to detect the GPU-to-GPU interconnect in software
            for(int i = 0; i < bootstrap_comm->deviceCnt; i++) {
                // if(i != bootstrap_comm->hipDev) {
                RSMI_IO_LINK_TYPE rsmi_type;
                int hops, count;
                if(rocm_smi_getLinkInfo(bootstrap_comm->hipDev, i, &rsmi_type, &hops, &count) == scclSuccess) {
                    printf("rank=%d, i=%d, dev=%d, rsmi_type=%d, hops=%d, count=%d\n", rank, i, bootstrap_comm->hipDev, rsmi_type, hops, count);
                    // if(rsmi_type == RSMI_IOLINK_TYPE_XGMI && hops <= 2) {
                    // if(1) {
                    //     char busIdStr[] = "00000000:00:00.0";
                    //     SCCLCHECK(rocm_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
                    //     char lowerId[16];
                    //     for(int c = 0; c < 16; c++) {
                    //         lowerId[c] = tolower(busIdStr[c]);
                    //         if(busIdStr[c] == 0)
                    //             break;
                    //     }
                    // }
                } else {
                    printf("rsmi get type fail\n");
                }
                // }
            }
#endif

    // -------------------------- 4. All-gather of the node information ----------------------------------- //
    // Each rank contributes node_info_total_bytes bytes; the result is written into node_info_vec
    SCCLCHECK(sccl_bootstrap->bootstrapAllGather(local_topo_nodes.nodes, this->node_info_vec.data(), this->node_info_total_bytes));

    // TODO: GPUs inside a node are currently mesh-connected by hand, because the NIC topology can be obtained
    //       neither from /sys/device nor through the rsmi functions. To be improved later.
    SCCLCHECK(bootstrapNodesLink(this->node_info_vec.data(), this->node_info_total_bytes));
#if 0
    if(rank == 1) {
        size_t dataLen = this->node_info_total_bytes;
        printf("nRanks * this->node_info_total_bytes=%zu, %lu\n", dataLen, nRanks * dataLen);
        auto node_info_data = reinterpret_cast<char*>(this->node_info_vec.data());
        ByteSpanArray<scclTopoNode_t> nodes_span_all(node_info_data, nRanks * dataLen);
        printf("print rank=%d, nodes_span_all size=%zu, scclTopoNode_t size=%zu\n", rank, nodes_span_all.size(), sizeof(scclTopoNode_t));

        char line[30];
        sprintf(line, "print rank=%d: ", rank);
        int node_cnt = 0;
        for(int i = 0; i < nodes_span_all.size(); i++) {
            if(nodes_span_all[i] && nodes_span_all[i]->type > 0) {

                if(i < 64) {
                    printf("============================&&============================\n");
                    physical_links::printTopoNode(nodes_span_all, i, line);
                    printf("============================&&============================\n");
                } else if(i < 128) {
                    printf("============================((============================\n");
                    physical_links::printTopoNode(nodes_span_all, i, line);
                    printf("============================))============================\n");
                } else {
                    printf("============================@@============================\n");
                    physical_links::printTopoNode(nodes_span_all, i, line);
                    printf("============================@@============================\n");
                }
                node_cnt += 1;
            }
        }
        printf("print rank=%d, node_cnt=%d\n", rank, node_cnt);
    }
#endif

    return scclSuccess;
}

/**
 * @brief Compute the GPU peer-to-peer paths and share the transport map across ranks.
 *
 * Builds a PathFinder over the gathered node buffer, lets it fill @p topo_graph through
 * computeTopoGpuP2pMap(), then all-gathers nRanks bytes per rank (starting at this rank's
 * transport-map row) into topo_graph->transport_map. Rank 0 prints the resulting map.
 *
 * @param bootstrap_comm Communicator data handed to PathFinder; its rank selects who prints the map.
 * @param topo_graph Output graph; dereferenced without a check, so it must not be null.
 * @return scclSuccess when every step succeeds; failures go through SCCLCHECK
 *         (presumably returning their error code — confirm against the macro).
 */
scclResult_t Graph::calculateCommunicationPaths(const BootstrapComm_t* bootstrap_comm, scclTopoGraph_t* topo_graph) {
    // Communication path computation
    std::cout << "Calculating communication paths..." << std::endl;

    // Hardware path search is delegated to the PathFinder class
    PathFinder path_finder(bootstrap_comm, this->node_info_vec, this->node_info_total_bytes);

    // Write the search result into topo_graph and record the valid nodes
    SCCLCHECK(path_finder.computeTopoGpuP2pMap(topo_graph));
    // topo_graph->printGPUPaths();

    // Use the bootstrap object to all-gather the transport_map rows of all ranks
    uint8_t* local_transport_map = topo_graph->getTransportMapRowStart(rank);
    SCCLCHECK(sccl_bootstrap->bootstrapAllGather(local_transport_map, topo_graph->transport_map.data(), nRanks * sizeof(uint8_t)));
#if 1
    // Print the transport_map
    if(bootstrap_comm->rank == 0) {
        SCCLCHECK(topo_graph->printTransportMap());
    }
#endif

    return scclSuccess;
}

/**
 * @brief Placeholder for building the logical topology.
 *
 * Currently only prints a progress message; the actual construction is not implemented yet.
 *
 * @return Always scclSuccess.
 */
scclResult_t Graph::searchLogicalTopology() {
    // Logical topology construction
    std::cout << "Building logical topology..." << std::endl;
    // Implementation details to be added
    return scclSuccess;
}

/**
 * @brief Placeholder for computing topo paths from the undirected graph.
 *
 * Only emits a progress message for now; the path computation itself is still to be written.
 *
 * @return Always scclSuccess.
 */
scclResult_t Graph::calculateTopoChannels() {
    const char* const progress_message = "Calculating topo paths based on undirected graph...";
    std::cout << progress_message << std::endl;
    // Implementation details to be added
    return scclSuccess;
}

////////////////////////////////////////////////// private //////////////////////////////////////////////////

// TODO: the current implementation relies on many nested for-loops and gets slow when the number of nodes is
//       large; a CUDA kernel could be considered
/**
 * @brief Initialise and connect the links between the gathered topology nodes.
 *
 * The byte buffer is viewed as an array of scclTopoNode_t, then:
 *  1. every node with type > 0 is indexed by its id; GPU nodes are additionally grouped by interRank and
 *     NIC nodes by a device signature (the low 40 bits of the id);
 *  2. nodes sharing the same id, type and busId are merged through combineNode();
 *  3. the neighbours of every CPU (NUMA) node are connected pairwise;
 *  4. GPU nodes sharing an interRank are connected pairwise;
 *  5. NIC nodes sharing a device signature and a busId are connected pairwise.
 *
 * @param node_info_vec Pointer to the byte buffer holding the node information of all ranks
 * @param node_info_total_bytes Number of node-information bytes contributed by one rank
 *                              (the buffer is treated as nRanks times this size)
 * @return scclSuccess on success; a failing combineNode() goes through SCCLCHECK
 *         (presumably returning its error code — confirm against the macro)
 */
scclResult_t Graph::bootstrapNodesLink(void* node_info_vec, int node_info_total_bytes) {
    // Widen before multiplying so that nRanks * bytes is not computed in int
    const size_t all_ranks_bytes = static_cast<size_t>(nRanks) * static_cast<size_t>(node_info_total_bytes);
    // ByteSpanArray managing the memory that holds the node information
    ByteSpanArray<scclTopoNode_t> node_info_span(node_info_vec, all_ranks_bytes);
    // Maps a node id to every node carrying that id
    std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nodes_map_by_deviceId;
    // Maps a device signature to the NIC nodes carrying it, used to connect NIC nodes
    std::unordered_map<uint64_t, std::vector<scclTopoNode_t*>> nic_nodes_by_deviceId;
    // GPU nodes grouped by interRank, used to connect the GPUs inside one node
    std::vector<std::vector<scclTopoNode_t*>> gpu_nodes_by_interRank(nInterRanks);

    // -------------------------- 1. Walk all node entries and record the nodes -------------------------- //
    for(size_t i = 0; i < node_info_span.size(); ++i) {
        scclTopoNode_t* node = node_info_span[i];
        // Skip missing entries and empty nodes (type <= 0)
        if(node == nullptr || node->type <= 0) {
            continue;
        }
        const uint64_t id = node->id;
        int node_inter_rank = 0; // named differently from the interRank member to avoid shadowing it
        physical_links::getIdComponents(id, &node_inter_rank);
        // Low 40 bits of the id.
        // NOTE(review): the original comment lists interRank(24bit) + hipDev(8bit) + deviceId(16bit) +
        // terminalType(8bit) + numaId(8bit); which of these fields the 40-bit mask keeps depends on the id
        // layout defined elsewhere — confirm.
        const uint64_t deviceSig = id & 0xFFFFFFFFFF;

        if(node->type == GPU) {
            // Grow the table when an interRank beyond nInterRanks shows up
            const size_t slot = static_cast<size_t>(node_inter_rank);
            if(slot >= gpu_nodes_by_interRank.size()) {
                gpu_nodes_by_interRank.resize(slot + 1);
            }
            gpu_nodes_by_interRank[slot].push_back(node);
        } else if(node->type == NIC) {
            nic_nodes_by_deviceId[deviceSig].push_back(node);
        }

        nodes_map_by_deviceId[id].push_back(node);
    }

    // Merge nodes that have the same id and the same busId
    for(auto& pair : nodes_map_by_deviceId) {
        auto& nodes = pair.second;
        for(size_t i = 0; i < nodes.size(); ++i) {
            for(size_t j = i + 1; j < nodes.size(); ++j) {
                // The ids are re-checked on purpose: combineNode() may alter a merged node
                // (NOTE(review): presumably it clears it — confirm), so bucket membership alone is not trusted.
                if(nodes[i]->id == nodes[j]->id) {
                    if(nodes[i]->type == nodes[j]->type && nodes[i]->busId == nodes[j]->busId) {
                        SCCLCHECK(nodes[i]->combineNode(nodes[j]));
                    } else {
#if 0
                        int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                        physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                        int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                        physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                        char busIdStr_i[17], busIdStr_j[17];
                        int64ToBusId(nodes[i]->busId, busIdStr_i);
                        int64ToBusId(nodes[j]->busId, busIdStr_j);

                        printf("same Id but different type or busId: %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, "
                               "H:%d, N:%d, busIdStr:%s)\n",
                               nodes[i]->id,
                               tmpi_interRank,
                               tmpi_deviceValue,
                               tmpi_terminalType,
                               tmpi_hipDev,
                               tmpi_numaId,
                               busIdStr_i,
                               nodes[j]->id,
                               tmpj_interRank,
                               tmpj_deviceValue,
                               tmpj_terminalType,
                               tmpj_hipDev,
                               tmpj_numaId,
                               busIdStr_j);
#endif
                    }
                }
            }
        }
    }

    // Returns the first node with type > 0 registered under node_id, or nullptr when there is none
    const auto first_live_node = [&nodes_map_by_deviceId](uint64_t node_id) -> scclTopoNode_t* {
        const auto it = nodes_map_by_deviceId.find(node_id);
        if(it == nodes_map_by_deviceId.end()) {
            return nullptr;
        }
        for(scclTopoNode_t* candidate : it->second) {
            if(candidate->type > 0) {
                return candidate;
            }
        }
        return nullptr;
    };

    // Walk all node entries and connect every pair of neighbours of each CPU node
    for(size_t idx = 0; idx < node_info_span.size(); ++idx) {
        scclTopoNode_t* node = node_info_span[idx];
        // Only CPU nodes are handled here; missing entries are skipped
        if(node == nullptr || node->type != CPU) {
            continue;
        }
        for(size_t i = 0; i < node->neighborCount; ++i) {
            // Resolved once per i through the hash map instead of once per (i, j) pair
            scclTopoNode_t* neighbor_i = first_live_node(node->neighbors[i]);
            if(neighbor_i == nullptr) {
                continue;
            }
            for(size_t j = i + 1; j < node->neighborCount; ++j) {
                scclTopoNode_t* neighbor_j = first_live_node(node->neighbors[j]);
                if(neighbor_j == nullptr) {
                    continue;
                }
                neighbor_i->addNeighbor(neighbor_j->id);
                neighbor_j->addNeighbor(neighbor_i->id);
#if 0
                            {
                                int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                                physical_links::getIdComponents(
                                    neighbor_i->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                                int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                                physical_links::getIdComponents(
                                    neighbor_j->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                                char busIdStr_i[17], busIdStr_j[17];
                                int64ToBusId(neighbor_i->busId, busIdStr_i);
                                int64ToBusId(neighbor_j->busId, busIdStr_j);

                                printf("connect CPU neighbors %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, "
                                       "N:%d, busIdStr:%s)\n",
                                       neighbor_i->id,
                                       tmpi_interRank,
                                       tmpi_deviceValue,
                                       tmpi_terminalType,
                                       tmpi_hipDev,
                                       tmpi_numaId,
                                       busIdStr_i,
                                       neighbor_j->id,
                                       tmpj_interRank,
                                       tmpj_deviceValue,
                                       tmpj_terminalType,
                                       tmpj_hipDev,
                                       tmpj_numaId,
                                       busIdStr_j);
                            }
#endif
            }
        }
    }

    // Connect every pair of GPU nodes that share an interRank
    for(const auto& nodes : gpu_nodes_by_interRank) {
        for(size_t i = 0; i < nodes.size(); ++i) {
            for(size_t j = i + 1; j < nodes.size(); ++j) {
                nodes[i]->addNeighbor(nodes[j]->id);
                nodes[j]->addNeighbor(nodes[i]->id);
#if 0
                {
                    int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                    physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                    int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                    physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                    char busIdStr_i[17], busIdStr_j[17];
                    int64ToBusId(nodes[i]->busId, busIdStr_i);
                    int64ToBusId(nodes[j]->busId, busIdStr_j);

                    printf("connect GPU %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s)\n",
                           nodes[i]->id,
                           tmpi_interRank,
                           tmpi_deviceValue,
                           tmpi_terminalType,
                           tmpi_hipDev,
                           tmpi_numaId,
                           busIdStr_i,
                           nodes[j]->id,
                           tmpj_interRank,
                           tmpj_deviceValue,
                           tmpj_terminalType,
                           tmpj_hipDev,
                           tmpj_numaId,
                           busIdStr_j);
                }
#endif
            }
        }
    }

    // Connect every pair of NIC nodes that share a device signature
    for(const auto& pair : nic_nodes_by_deviceId) {
        const auto& nodes = pair.second;
        for(size_t i = 0; i < nodes.size(); ++i) {
            for(size_t j = i + 1; j < nodes.size(); ++j) {
                // Within one device signature, only NICs with an identical busId are linked
                if(nodes[i]->busId == nodes[j]->busId) {
                    nodes[i]->addNeighbor(nodes[j]->id);
                    nodes[j]->addNeighbor(nodes[i]->id);
#if 0
                    {
                        int tmpi_interRank, tmpi_deviceValue, tmpi_terminalType, tmpi_hipDev, tmpi_numaId;
                        physical_links::getIdComponents(nodes[i]->id, &tmpi_interRank, &tmpi_deviceValue, &tmpi_terminalType, &tmpi_hipDev, &tmpi_numaId);
                        int tmpj_interRank, tmpj_deviceValue, tmpj_terminalType, tmpj_hipDev, tmpj_numaId;
                        physical_links::getIdComponents(nodes[j]->id, &tmpj_interRank, &tmpj_deviceValue, &tmpj_terminalType, &tmpj_hipDev, &tmpj_numaId);
                        char busIdStr_i[17], busIdStr_j[17];
                        int64ToBusId(nodes[i]->busId, busIdStr_i);
                        int64ToBusId(nodes[j]->busId, busIdStr_j);

                        printf("connect NIC interRank %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, busIdStr:%s) and %lu(InterRank:%d, V:%d, T:%d, H:%d, N:%d, "
                               "busIdStr:%s)\n",
                               nodes[i]->id,
                               tmpi_interRank,
                               tmpi_deviceValue,
                               tmpi_terminalType,
                               tmpi_hipDev,
                               tmpi_numaId,
                               busIdStr_i,
                               nodes[j]->id,
                               tmpj_interRank,
                               tmpj_deviceValue,
                               tmpj_terminalType,
                               tmpj_hipDev,
                               tmpj_numaId,
                               busIdStr_j);
                    }
#endif
                }
            }
        }
    }

    return scclSuccess;
}

} // namespace graph
} // namespace topology
} // namespace hardware
} // namespace sccl