debug.h 9.58 KB
Newer Older
lishen's avatar
lishen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#pragma once

#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <stdarg.h>

#include <sys/syscall.h>

namespace sccl {

#define SCCL_NET_MAX_REQUESTS 8

// Log severity levels, stored in a single byte.
// A message is emitted only when the configured level is >= the message level.
typedef enum : uint8_t {
    SCCL_LOG_NONE = 0, // 0
    SCCL_LOG_VERSION,  // 1
    SCCL_LOG_WARN,     // 2
    SCCL_LOG_INFO,     // 3
    SCCL_LOG_ABORT     // 4
} scclDebugLogLevel_t;

29
30
31
32
33
34
35
36
37
38
39
40
// Sub-system selectors used to filter log output.
// Each sub-system owns one bit; SCCL_LOG_CODEALL has every bit set.
typedef enum : int64_t {
    SCCL_LOG_NET        = 1LL << 0,
    SCCL_LOG_TOPO       = 1LL << 1,
    SCCL_LOG_BOOTSTRAP  = 1LL << 2,
    SCCL_LOG_TRANSPORT  = 1LL << 3,
    SCCL_LOG_GRAPH      = 1LL << 4,
    SCCL_LOG_CONNECT    = 1LL << 5,
    SCCL_LOG_P2P        = 1LL << 6,
    SCCL_LOG_COLLECTIVE = 1LL << 7,
    SCCL_LOG_ALLOC      = 1LL << 8,
    SCCL_LOG_CODEALL    = ~0LL
} scclDebugLogSubSys_t;
lishen's avatar
lishen committed
41
42
43
44
45
46
47
48
49
50

namespace debug {

// NOTE(review): these variables are `static` inside a header, so every translation unit that
// includes this file gets its own copy — confirm that per-file state is intended.
static char scclLastError[1024] = "";                             // Human-readable text of the most recent WARN message
static char hostname[1024];                                       // Host name cached by scclDebugInit()
static pthread_mutex_t scclDebugLock = PTHREAD_MUTEX_INITIALIZER; // Mutex taken during initialization and while updating scclLastError
static __thread int tid              = -1;                        // Thread-local cache of the calling thread's id; -1 means not fetched yet
static int pid                       = -1;                        // Cached process id; -1 until scclDebugInit() stores getpid()
static FILE* scclDebugFile           = stdout;                    // Stream that receives log output; defaults to stdout

51
52
static uint64_t scclDebugMask = SCCL_LOG_TOPO | SCCL_LOG_BOOTSTRAP; // Default sub-system mask: TOPO and BOOTSTRAP
static int scclDebugLevel     = -1;                                 // Active log level; -1 means not initialized yet
lishen's avatar
lishen committed
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

// Debug position setting.
// NOTE(review): in this header it is only written by the WARN path of scclDebugLog; no reader is visible here.
static int scclDebugPos = -1; // Starts at -1, meaning "not set"

/**
 * @brief Fetch the host name and cut it at the first occurrence of a delimiter.
 *
 * Calls gethostname() into the caller's buffer and terminates the string at the
 * first `delim` character (for example '.' to drop the domain part). If
 * gethostname() fails, the buffer receives "unknown" instead, truncated to fit
 * when the buffer is smaller than that. On return the buffer is always
 * NUL-terminated, provided `hostname` is non-NULL and `maxlen` is positive.
 *
 * @param hostname Destination buffer; nothing is done when it is NULL.
 * @param maxlen   Size of the destination buffer in bytes; nothing is written when it is <= 0.
 * @param delim    Character at which the host name is cut.
 */
static void getHostName(char* hostname, int maxlen, const char delim) {
    if(hostname == NULL || maxlen <= 0) {
        return;
    }
    if(gethostname(hostname, (size_t)maxlen) != 0) {
        // snprintf always NUL-terminates; strncpy would not when maxlen is smaller than the text.
        snprintf(hostname, (size_t)maxlen, "%s", "unknown");
        return;
    }
    // Stop at the delimiter, at the existing terminator, or at the last byte of the buffer.
    int i = 0;
    while((i < maxlen - 1) && (hostname[i] != delim) && (hostname[i] != '\0'))
        i++;
    hostname[i] = '\0';
}

////////////////////////////// 初始化debug //////////////////////////////
/**
 * @brief 初始化SCCL调试系统
 *
 * 该函数负责初始化SCCL的调试功能,包括:
 * 1. 从环境变量SCCL_DEBUG_LEVEL读取并设置调试等级
 * 2. 从环境变量SCCL_DEBUG_POS读取并设置调试位置
 * 3. 缓存当前进程的PID和主机名
 * 4. 根据SCCL_DEBUG_FILE环境变量创建调试日志文件
 *
 * 函数使用互斥锁保证线程安全,并通过原子操作设置最终的调试等级和位置。
 * 调试等级和位置的默认值分别为SCCL_LOG_INFO和SCCL_LOG_CODEALL。
 *
 * @note 该函数是线程安全的,但应在程序早期调用以避免竞态条件
 */
static void scclDebugInit() {
    pthread_mutex_lock(&scclDebugLock);

    if(scclDebugLevel != -1) {
        pthread_mutex_unlock(&scclDebugLock);
        return;
    }

    //// 按照debug等级划分
    int tempScclDebugLevel = -1;
    {
        const char* sccl_debug = getenv("SCCL_DEBUG_LEVEL");

        if(sccl_debug == NULL) {
            tempScclDebugLevel = SCCL_LOG_INFO;
        } else if(strcasecmp(sccl_debug, "VERSION") == 0) {
            tempScclDebugLevel = SCCL_LOG_VERSION;
        } else if(strcasecmp(sccl_debug, "WARN") == 0) {
            tempScclDebugLevel = SCCL_LOG_WARN;
        } else if(strcasecmp(sccl_debug, "INFO") == 0) {
            tempScclDebugLevel = SCCL_LOG_INFO;
        } else if(strcasecmp(sccl_debug, "ABORT") == 0) {
            tempScclDebugLevel = SCCL_LOG_ABORT;
        }
    }

    //// 按照代码位置划分
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
    char* scclDebugSubsysEnv = getenv("SCCL_DEBUG_SUBSYS");
    if(scclDebugSubsysEnv != NULL) {
        int invert = 0;
        if(scclDebugSubsysEnv[0] == '^') {
            invert = 1;
            scclDebugSubsysEnv++;
        }
        scclDebugMask         = invert ? ~0ULL : 0ULL;
        char* scclDebugSubsys = strdup(scclDebugSubsysEnv);
        char* subsys          = strtok(scclDebugSubsys, ",");
        while(subsys != NULL) {
            uint64_t mask = 0;
            if(strcasecmp(subsys, "NET") == 0) {
                mask = SCCL_LOG_NET;
            } else if(strcasecmp(subsys, "TOPO") == 0) {
                mask = SCCL_LOG_TOPO;
            } else if(strcasecmp(subsys, "BOOTSTRAP") == 0) {
                mask = SCCL_LOG_BOOTSTRAP;
            } else if(strcasecmp(subsys, "TRANSPORT") == 0) {
                mask = SCCL_LOG_TRANSPORT;
            } else if(strcasecmp(subsys, "GRAPH") == 0) {
                mask = SCCL_LOG_GRAPH;
            } else if(strcasecmp(subsys, "CONNECT") == 0) {
                mask = SCCL_LOG_CONNECT;
            } else if(strcasecmp(subsys, "P2P") == 0) {
                mask = SCCL_LOG_P2P;
            } else if(strcasecmp(subsys, "COLLECTIVE") == 0) {
                mask = SCCL_LOG_COLLECTIVE;
            } else if(strcasecmp(subsys, "ALLOC") == 0) {
                mask = SCCL_LOG_ALLOC;
            } else if(strcasecmp(subsys, "ALL") == 0) {
                mask = SCCL_LOG_CODEALL;
            }
            if(mask) {
                if(invert)
                    scclDebugMask &= ~mask;
                else
                    scclDebugMask |= mask;
            }
            subsys = strtok(NULL, ",");
lishen's avatar
lishen committed
160
        }
161
        free(scclDebugSubsys);
lishen's avatar
lishen committed
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
    }

    // Cache pid and hostname
    getHostName(hostname, 1024, '.');
    pid = getpid();

    /* Parse and expand the SCCL_DEBUG_FILE path and
     * then create the debug file. But don't bother unless the
     * SCCL_DEBUG level is > VERSION
     */
    const char* scclDebugFileEnv = getenv("SCCL_DEBUG_FILE");
    if(tempScclDebugLevel > SCCL_LOG_VERSION && scclDebugFileEnv != NULL) {
        int c                      = 0;
        char debugFn[PATH_MAX + 1] = "";
        char* dfn                  = debugFn;
        while(scclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
            if(scclDebugFileEnv[c++] != '%') {
                *dfn++ = scclDebugFileEnv[c - 1];
                continue;
            }
            switch(scclDebugFileEnv[c++]) {
                case '%': // Double %
                    *dfn++ = '%';
                    break;
                case 'h': // %h = hostname
                    dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
                    break;
                case 'p': // %p = pid
                    dfn += snprintf(dfn, PATH_MAX, "%d", pid);
                    break;
                default: // Echo everything we don't understand
                    *dfn++ = '%';
                    *dfn++ = scclDebugFileEnv[c - 1];
                    break;
            }
        }
        *dfn = '\0';
        if(debugFn[0] != '\0') {
            FILE* file = fopen(debugFn, "w");
            if(file != nullptr) {
                setbuf(file, nullptr); // disable buffering
                scclDebugFile = file;
            }
        }
    }

    __atomic_store_n(&scclDebugLevel, tempScclDebugLevel, __ATOMIC_RELEASE);

    pthread_mutex_unlock(&scclDebugLock);
}

////////////////////////////// 打印DEBUG信息 //////////////////////////////

template <scclDebugLogLevel_t level>
216
void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, const char* filefunc, int line, const char* fmt, ...) {
lishen's avatar
lishen committed
217
218
219
220
221
222
223
224
    if(__atomic_load_n(&scclDebugLevel, __ATOMIC_ACQUIRE) == -1)
        scclDebugInit();

    if constexpr(level == SCCL_LOG_WARN)
        scclDebugPos = SCCL_LOG_CODEALL;

    // 检查调试级别和位置标志
    bool isDebugLevelSufficient = (scclDebugLevel >= level);
225
    bool isDebugPositionMatch   = (pos_flags & scclDebugMask) != 0;
lishen's avatar
lishen committed
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
    // 如果调试级别不足或位置标志不匹配,则不执行后续操作
    if(!isDebugLevelSufficient || !isDebugPositionMatch) {
        return;
    }

    // Save the last error (WARN) as a human readable string
    if constexpr(level == SCCL_LOG_WARN) {
        pthread_mutex_lock(&scclDebugLock);
        va_list vargs;
        va_start(vargs, fmt);
        (void)vsnprintf(scclLastError, sizeof(scclLastError), fmt, vargs);
        va_end(vargs);
        pthread_mutex_unlock(&scclDebugLock);
    }

    if(tid == -1) {
        tid = syscall(SYS_gettid);
    }

    char buffer[1024];
    size_t len = 0;
    if constexpr(level == SCCL_LOG_WARN) {
248
        len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%s:%d SCCL WARN ", hostname, pid, tid, filepath, filefunc, line);
lishen's avatar
lishen committed
249
    } else if constexpr(level == SCCL_LOG_INFO) {
250
        len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%s:%d SCCL INFO ", hostname, pid, tid, filepath, filefunc, line);
lishen's avatar
lishen committed
251
252
253
254
255
256
257
258
259
260
261
262
263
264
    }

    if(len) {
        va_list vargs;
        va_start(vargs, fmt);
        len += vsnprintf(buffer + len, sizeof(buffer) - len, fmt, vargs);
        va_end(vargs);
        buffer[len++] = '\n';
        fwrite(buffer, 1, len, scclDebugFile);
    }
}

} // namespace debug

265
266
// Logging entry points. Both record the call site (__FILE__, __func__, __LINE__) and take a
// printf-style format string followed by its arguments.
// The names are fully qualified so the macros also expand correctly outside namespace sccl.
// WARN(fmt, ...)        - warning, never filtered by sub-system.
// INFO(FLAGS, fmt, ...) - informational message for the sub-system(s) in FLAGS; FLAGS may be a
//                         single SCCL_LOG_* value or several OR-ed together (hence the cast).
#define WARN(...) ::sccl::debug::scclDebugLog< ::sccl::SCCL_LOG_WARN>(::sccl::SCCL_LOG_CODEALL, __FILE__, __func__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) \
    ::sccl::debug::scclDebugLog< ::sccl::SCCL_LOG_INFO>(static_cast< ::sccl::scclDebugLogSubSys_t>(FLAGS), __FILE__, __func__, __LINE__, __VA_ARGS__)
lishen's avatar
lishen committed
267
268

} // namespace sccl