bootstrap.h 1.55 KB
Newer Older
lishen's avatar
lishen committed
1
2
3
4
#pragma once

#include <string.h>
#include "base.h"
5
6
7
8
#include "socket.h"
#include "bootstrap_utils.h"
#include "bootstrap_net.h"
#include "thread_pool.h"
lishen's avatar
lishen committed
9
10
11
12
13
14

namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/**
 * @brief Drives the bootstrap phase of communication setup.
 *
 * The object holds a pthread mutex and raw pointers to a bootstrap handle,
 * a network helper and a thread pool. Because a pthread mutex must not be
 * duplicated by value and the pointers would be aliased by a member-wise
 * copy, the class is explicitly non-copyable.
 *
 * NOTE(review): the member function definitions are not visible in this
 * header; the descriptions below follow the declarations only.
 */
class scclBootstrap {
public:
    /**
     * @brief Construct a bootstrap object.
     * @param rank_info Rank description supplied by the caller.
     * @param comm      Bootstrap communicator supplied by the caller.
     */
    scclBootstrap(struct scclRankInfo* rank_info, struct scclBootstrapComm* comm);
    ~scclBootstrap();

    // Non-copyable: copying would duplicate initLock (undefined for pthread
    // mutexes) and alias handle / bootstrap_net / pthread_pool.
    scclBootstrap(const scclBootstrap&)            = delete;
    scclBootstrap& operator=(const scclBootstrap&) = delete;

    // Initialize the bootstrap communication environment.
    scclResult_t bootstrapInit(const struct scclRankInfo* rank_info, struct scclBootstrapComm* comm);
    // Check whether bootstrap has been initialized successfully.
    scclResult_t bootstrapInitCheck();

    // Broadcast node information (all-gather of the per-node unique info).
    scclResult_t bootstrapAllGather(struct scclUniqueInfo* unique_info);

private:
    // Perform the basic bootstrap initialization.
    scclResult_t bootstrapBasicInit();
    // Initialize the unique-ID information structure.
    scclResult_t bootstrapUniqueInfoInit(const struct scclRankInfo* rank_info, scclNet_t* scclNet, struct scclUniqueInfo* unique_info);

    // scclResult_t bootstrapGetAllNodes(const struct scclUniqueInfo* unique_info, struct scclBootstrapComm* comm);

private:
    // Statically initialized lock; presumably guards the initialization state below - TODO confirm in the .cpp.
    pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
    // Starts false; presumably set once initialization has succeeded - TODO confirm in the .cpp.
    bool initialized         = false;
    // Defaults to true; NOTE(review): meaning of the HSA fine-grain flag is not visible in this header.
    bool hsaFineGrainFlag    = true;

    // Bootstrap handle (allocated and initialized by this class).
    struct scclBootstrapHandle* handle = nullptr;
    // Network structure (allocated and initialized by this class).
    class bootstrapNet* bootstrap_net = nullptr;

    // Thread count and thread pool; both start empty (0 / nullptr).
    int max_pthreads               = 0;
    class ThreadPool* pthread_pool = nullptr;
};

lishen's avatar
lishen committed
50
51
52
53
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl