bootstrap_utils.cpp 6.31 KB
Newer Older
lishen's avatar
lishen committed
1
2
#include <string.h>
#include "base.h"
3
#include "bootstrap_utils.h"
lishen's avatar
lishen committed
4
5
6
7
8

namespace sccl {
namespace hardware {
namespace topology {
namespace bootstrap {
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

/**
 * Hash a byte sequence with the DJB2a scheme.
 *
 * @param string Pointer to the bytes to hash
 * @param n Number of bytes to consume from @p string
 * @return 64-bit unsigned hash value
 *
 * @note Each step computes hash = (hash * 33) ^ byte, starting from the
 *       seed 5381. When n <= 0 no byte is read and the seed is returned.
 *       Each element is a plain char, so on platforms where char is signed
 *       a byte >= 0x80 is sign-extended before the XOR.
 */
static uint64_t getHash(const char* string, int n) {
    uint64_t hash = 5381;
    int idx       = 0;
    while(idx < n) {
        // hash * 33 is the multiplicative form of (hash << 5) + hash
        hash = (hash * 33) ^ string[idx];
        ++idx;
    }
    return hash;
}

/**
 * @brief Fetch the host name and cut it at a delimiter.
 *
 * Calls gethostname() and truncates the result at the first occurrence of
 * @p delim (or at the existing terminator, whichever comes first). If
 * gethostname() fails, "unknown" is written into the buffer instead.
 * Whenever the buffer is usable (non-NULL, maxlen > 0) it is left
 * NUL-terminated, on both the success and the failure path.
 *
 * @param hostname Buffer that receives the host name
 * @param maxlen Size of @p hostname in bytes
 * @param delim Character at which the name is truncated
 * @return scclSuccess on success; scclSystemError if the buffer is unusable
 *         or gethostname() fails
 */
static scclResult_t getHostName(char* hostname, int maxlen, const char delim) {
    // Nothing can be stored safely in a missing or zero-sized buffer.
    if(hostname == NULL || maxlen <= 0) {
        return scclSystemError;
    }
    if(gethostname(hostname, maxlen) != 0) {
        // strncpy does not terminate when the source fills the buffer,
        // so terminate explicitly for small maxlen values.
        strncpy(hostname, "unknown", maxlen);
        hostname[maxlen - 1] = '\0';
        return scclSystemError;
    }
    // Check the bound first so no byte past the usable range is inspected.
    int i = 0;
    while((i < maxlen - 1) && (hostname[i] != delim) && (hostname[i] != '\0'))
        i++;
    hostname[i] = '\0';
    return scclSuccess;
}

#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
/**
 * Compute a hash of a string that identifies this host.
 *
 * The identifying string is built as follows:
 * 1. The buffer is first filled with the full host name (or "unknown" if the
 *    host name cannot be obtained).
 * 2. If the SCCL_HOSTID environment variable is set, its value replaces the
 *    whole string.
 * 3. Otherwise, if HOSTID_FILE can be opened and a token can be read from it,
 *    that token is appended to the host name, i.e. the string is equivalent to
 *
 *      $(hostname)$(cat /proc/sys/kernel/random/boot_id)
 *
 * 4. If neither is available, the host name alone is used.
 *
 * @return 64-bit hash of the host identification string
 */
uint64_t getHostHash(void) {
    char hostHash[1024];
    char* hostId;

    // Fall back is the full hostname if something fails
    (void)getHostName(hostHash, sizeof(hostHash), '\0');
    int offset = strlen(hostHash);

    if((hostId = getenv("SCCL_HOSTID")) != NULL) {
        INFO(SCCL_LOG_BOOTSTRAP, "SCCL_HOSTID set by environment to %s", hostId);
        strncpy(hostHash, hostId, sizeof(hostHash));
    } else {
        FILE* file = fopen(HOSTID_FILE, "r");
        if(file != NULL) {
            char* p;
            // %ms lets fscanf allocate the token; it must be released with free()
            if(fscanf(file, "%ms", &p) == 1) {
                strncpy(hostHash + offset, p, sizeof(hostHash) - offset - 1);
                free(p);
            }
            // Close only a stream that was actually opened: fclose(NULL) is undefined.
            fclose(file);
        }
    }

    // Make sure the string is terminated (strncpy may have filled the buffer)
    hostHash[sizeof(hostHash) - 1] = '\0';

    INFO(SCCL_LOG_BOOTSTRAP, "unique hostname '%s'", hostHash);

    return getHash(hostHash, strlen(hostHash));
}

/**
 * Compute a hash of a string that identifies the current process.
 *
 * The string is the decimal process ID followed by the target of the
 * /proc/self/ns/pid symbolic link, i.e. the equivalent of
 *
 *   $$ $(readlink /proc/self/ns/pid)
 *
 * @return uint64_t Hash of the process identification string
 * @note If the symbolic link cannot be read, only the process ID is hashed.
 */
uint64_t getPidHash(void) {
    char pname[1024];

    // The string begins with our pid ($$)
    sprintf(pname, "%ld", (long)getpid());
    const int pidLen = strlen(pname);

    // Append the link target right after the pid, keeping one byte for the terminator
    int linkLen = readlink("/proc/self/ns/pid", pname + pidLen, sizeof(pname) - 1 - pidLen);
    if(linkLen < 0) {
        // readlink failed: nothing was appended
        linkLen = 0;
    }

    // readlink does not NUL-terminate its output
    pname[pidLen + linkLen] = '\0';
    INFO(SCCL_LOG_BOOTSTRAP, "unique PID '%s' ", pname);

    return getHash(pname, strlen(pname));
}

/**
 * @brief Fill a buffer with bytes read from /dev/urandom.
 *
 * @param buffer Destination buffer for the random bytes
 * @param bytes Number of bytes to read
 * @return scclResult_t scclSuccess on success; scclSystemError if @p buffer is
 *         NULL, the device cannot be opened, or the full amount cannot be read
 *
 * @note When @p bytes is 0 the function returns scclSuccess without touching
 *       the device or the buffer.
 */
scclResult_t getRandomData(void* buffer, size_t bytes) {
    if(bytes == 0) {
        return scclSuccess;
    }

    FILE* urandom = fopen("/dev/urandom", "r");

    // The whole request is read as a single item, so anything other than
    // exactly one item read counts as a failure.
    scclResult_t status = scclSystemError;
    if(buffer != NULL && urandom != NULL && fread(buffer, bytes, (size_t)1, urandom) == (size_t)1) {
        status = scclSuccess;
    }

    if(urandom != NULL) {
        fclose(urandom);
    }
    return status;
}

// Convert a logical hipDev index to its PCI bus ID

/**
 * Look up the PCI bus ID of a HIP device and convert it to a 64-bit integer.
 *
 * @param hipDev Logical HIP device index
 * @param busId Output parameter that receives the converted bus ID
 * @return scclSuccess on success; a failure in either checked call is
 *         handled by the HIPCHECK / SCCLCHECK macros
 *
 * @note The bus ID string usually looks like "0000:00:00.0". The local buffer
 *       is sized for an 8-digit domain in case the PCI domain goes higher.
 */
scclResult_t getBusId(int hipDev, int64_t* busId) {
    // Template string only provides the storage; the HIP call overwrites it.
    char pciBusId[]       = "00000000:00:00.0";
    const int pciBusIdLen = (int)sizeof(pciBusId);

    HIPCHECK(hipDeviceGetPCIBusId(pciBusId, pciBusIdLen, hipDev));
    SCCLCHECK(busIdToInt64(pciBusId, busId));

    return scclSuccess;
}

/**
 * Print the contents of a scclUniqueInfo structure to stdout.
 *
 * Output is produced only when info->localRank is 0; for any other local
 * rank the function returns without printing. After the summary, the network
 * properties of the local node are printed through net::printNetProps.
 *
 * @param info Structure to print; it is dereferenced without a NULL check
 */
void printUniqueInfo(struct scclUniqueInfo* info) {
    // Only local rank 0 reports.
    if(info->localRank != 0) {
        return;
    }

    printf("\n==========================================\nTotal Rank: %d/%d, Local Rank: %d/%d, CUDA Device ID/Cnt: %d/%d, \n"
           "Host Hash: %lu, PID Hash: %lu, gpu.name=%s, gcn=%s\n"
           "\n==========================================\n",
           info->rank,
           info->nRanks,
           info->localRank,
           info->localRanks,
           info->hipDev,
           info->deviceCnt,
           info->hostHash,
           info->pidHash,
           info->localNode.gpu.name,
           info->localNode.gpu.gcn);

    // NOTE(review): SCCLCHECK is used inside a void function - confirm the
    // macro does not try to return a result value on failure.
    SCCLCHECK(net::printNetProps(&info->localNode.net.props, info->rank, info->localRank));
}

lishen's avatar
lishen committed
189
190
191
192
} // namespace bootstrap
} // namespace topology
} // namespace hardware
} // namespace sccl