check.h 12.7 KB
Newer Older
lishen's avatar
lishen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#pragma once

#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>
#include <pwd.h>
#include <errno.h>

#include "debug.h"

// Version components of the library (major.minor.patch).
#define SCCL_MAJOR 1
#define SCCL_MINOR 0
#define SCCL_PATCH 0
// Version suffix string; empty in this header.
#define SCCL_SUFFIX ""

// Pack a (major, minor, patch) triple into a single integer: X * 1000 + Y * 100 + Z.
// Note: with these weights a minor value of 10 or more spills into the major digit.
#define SCCL_VERSION(X, Y, Z) ((X) * 1000 + (Y) * 100 + (Z))

namespace sccl {
/**
 * @brief Result codes returned by SCCL operations.
 *
 * The numeric values are fixed explicitly. scclGetErrorString() maps every code
 * except scclNumResults to a message. The SCCLCHECK-style macros in this header
 * treat both scclSuccess and scclInProgress as non-failure results.
 */
typedef enum {
    scclSuccess           = 0, /*!< No error */
    scclUnhandledHipError = 1, /*!< Unhandled HIP error */
    scclSystemError       = 2, /*!< Unhandled system error */
    scclInternalError     = 3, /*!< Internal error - please report it to the SCCL developers */
    scclInvalidArgument   = 4, /*!< Invalid argument */
    scclInvalidUsage      = 5, /*!< Invalid usage */
    scclRemoteError       = 6, /*!< A remote process exited or a network error occurred */
    scclInProgress        = 7, /*!< An SCCL operation is still in progress */
    scclNumResults        = 8  /*!< Number of result codes (not a result itself) */
} scclResult_t;

/**
 * Status codes carrying a `test` prefix, kept separate from scclResult_t.
 * NOTE(review): presumably consumed by test programs; nothing in the visible
 * part of this header references them - confirm against callers.
 * Enumerators count up from 0 in declaration order.
 */
typedef enum {
    testSuccess = 0,   /*!< 0 */
    testInternalError, /*!< 1 */
    testHipError,      /*!< 2 */
    testScclError,     /*!< 3 */
    testTimeout,       /*!< 4 */
    testNumResults     /*!< 5: number of codes listed above */
} testResult_t;

/**
 * Translate an SCCL result code into a human-readable message.
 *
 * @param code  Result code to describe.
 * @return Pointer to a string literal. Any code without a dedicated message
 *         (scclNumResults included) yields "unknown result code".
 */
static const char* scclGetErrorString(scclResult_t code) {
    /* Start from the fallback text and overwrite it for every known code. */
    const char* message = "unknown result code";

    switch(code) {
        case scclSuccess:
            message = "success";
            break;
        case scclUnhandledHipError:
            message = "unhandled hip error (run with SCCL_DEBUG=INFO for details)";
            break;
        case scclSystemError:
            message = "unhandled system error (run with SCCL_DEBUG=INFO for details)";
            break;
        case scclInternalError:
            message = "internal error - please report this issue to the SCCL developers";
            break;
        case scclInvalidArgument:
            message = "invalid argument (run with SCCL_DEBUG=WARN for details)";
            break;
        case scclInvalidUsage:
            message = "invalid usage (run with SCCL_DEBUG=WARN for details)";
            break;
        case scclRemoteError:
            message = "remote process exited or there was a network error";
            break;
        case scclInProgress:
            message = "SCCL operation in progress";
            break;
        default:
            break;
    }

    return message;
}

////////////////////////////// SCCL and HIP //////////////////////////////

/**
 * Propagate errors up: evaluate an SCCL call once and return its result from
 * the enclosing function when it failed.
 *
 * A result is a failure when it is neither scclSuccess nor scclInProgress.
 * On failure the text from scclGetErrorString() is logged through INFO and
 * the enclosing function returns that result, so the enclosing function must
 * return scclResult_t (or a type it converts to).
 *
 * The expansion is one statement WITHOUT a trailing semicolon; the call site
 * supplies it (`SCCLCHECK(f());`). This keeps `if(c) SCCLCHECK(f()); else ...`
 * well-formed and matches HIPCHECK / SYSCHECK in this header.
 */
#define SCCLCHECK(call)                                                        \
    do {                                                                       \
        scclResult_t RES = (call);                                             \
        if(RES != scclSuccess && RES != scclInProgress) {                      \
            /* Log the reason before handing the error to the caller */        \
            INFO(SCCL_LOG_CODEALL, "check fail: %s", scclGetErrorString(RES)); \
            return RES;                                                        \
        }                                                                      \
    } while(0)

77
78
79
80
81
82
83
/**
 * Evaluate an SCCL call, store its result in RES and jump to `label` on failure.
 *
 * @param call   Expression yielding a scclResult_t; evaluated once.
 * @param RES    Caller-owned lvalue that receives the result (also on success).
 * @param label  Label to jump to when the result is neither scclSuccess nor
 *               scclInProgress.
 *
 * On failure the location is logged as "<function>:<file>:<line> -> <code>".
 * The format string carries one conversion per argument; the earlier
 * "%s:%d -> %d" consumed __FILE__ with %d.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define SCCLCHECKGOTO(call, RES, label)                                                         \
    do {                                                                                        \
        RES = (call);                                                                           \
        if(RES != scclSuccess && RES != scclInProgress) {                                       \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d", __func__, __FILE__, __LINE__, (int)(RES)); \
            goto label;                                                                         \
        }                                                                                       \
    } while(0)

86
87
88
89
90
91
92
93
94
/**
 * Run a HIP call once and return scclUnhandledHipError from the enclosing
 * function when it does not yield hipSuccess.
 *
 * On failure the host name, source file, line and hipGetErrorString() text are
 * logged through INFO. The format string has four conversions, so all four
 * arguments are passed; the earlier version supplied only the host name and
 * the error text, leaving "%s:%d" without arguments.
 *
 * If gethostname() fails the host is reported as "unknown"; the buffer is
 * always NUL-terminated because a truncated name may otherwise be left
 * unterminated.
 */
#define HIPCHECK(cmd)                                                                              \
    do {                                                                                           \
        hipError_t err = (cmd);                                                                    \
        if(err != hipSuccess) {                                                                    \
            char hostname[1024];                                                                   \
            if(gethostname(hostname, sizeof(hostname)) != 0) {                                     \
                snprintf(hostname, sizeof(hostname), "unknown");                                   \
            }                                                                                      \
            hostname[sizeof(hostname) - 1] = '\0';                                                 \
            INFO(SCCL_LOG_CODEALL, "%s: Test HIP failure %s:%d '%s'\n", hostname, __FILE__,        \
                 __LINE__, hipGetErrorString(err));                                                \
            return scclUnhandledHipError;                                                          \
        }                                                                                          \
    } while(0)

/**
 * Run a HIP call once; if it does not yield hipSuccess, log the
 * hipGetErrorString() text with WARN, assign scclUnhandledHipError to RES and
 * jump to `label`. RES is left untouched on success.
 */
#define HIPCHECKGOTO(cmd, RES, label)                           \
    do {                                                        \
        hipError_t err = cmd;                                   \
        if(err == hipSuccess) {                                 \
            break; /* success: leave this do/while wrapper */   \
        }                                                       \
        WARN("HIP failure '%s'", hipGetErrorString(err));       \
        RES = scclUnhandledHipError;                            \
        goto label;                                             \
    } while(0)

////////////////////////////// Value checks //////////////////////////////

109
110
111
112
113
114
115
/**
 * Return scclSystemError from the enclosing function when `statement`
 * compares EQUAL to `value` (i.e. `value` is the failure sentinel).
 *
 * `statement` is evaluated once. On failure the log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)". The format string
 * has one conversion per argument; the earlier "%s:%d -> %d (%s)" was one
 * conversion short, so an int was consumed by %s.
 * `value` is parenthesized so expressions such as `A | B` compare as a whole.
 * NOTE(review): errno is reported as-is; it is only meaningful if `statement`
 * sets it on failure.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define EQCHECK(statement, value)                                                               \
    do {                                                                                        \
        if((statement) == (value)) {                                                            \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)scclSystemError, strerror(errno));                                        \
            return scclSystemError;                                                             \
        }                                                                                       \
    } while(0)

118
119
120
121
122
123
124
125
/**
 * When `statement` compares EQUAL to `value`, set RES to scclSystemError, log
 * the location and jump to `label`. RES is left untouched otherwise.
 *
 * `statement` is evaluated once. The log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)"; the format string
 * has one conversion per argument (the earlier one was a conversion short, so
 * an int was consumed by %s). `value` is parenthesized so expressions such as
 * `A | B` compare as a whole.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define EQCHECKGOTO(statement, value, RES, label)                                               \
    do {                                                                                        \
        if((statement) == (value)) {                                                            \
            RES = scclSystemError;                                                              \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)(RES), strerror(errno));                                                  \
            goto label;                                                                         \
        }                                                                                       \
    } while(0)

128
129
130
131
132
133
134
/**
 * Return scclSystemError from the enclosing function when `statement`
 * compares NOT EQUAL to `value` (i.e. `value` is the only accepted result).
 *
 * `statement` is evaluated once. On failure the log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)"; the format string
 * has one conversion per argument (the earlier one was a conversion short, so
 * an int was consumed by %s). `value` is parenthesized so expressions such as
 * `A | B` compare as a whole.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define NEQCHECK(statement, value)                                                              \
    do {                                                                                        \
        if((statement) != (value)) {                                                            \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)scclSystemError, strerror(errno));                                        \
            return scclSystemError;                                                             \
        }                                                                                       \
    } while(0)

137
138
139
140
141
142
143
144
/**
 * When `statement` compares NOT EQUAL to `value`, set RES to scclSystemError,
 * log the location and jump to `label`. RES is left untouched otherwise.
 *
 * `statement` is evaluated once. The log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)"; the format string
 * has one conversion per argument (the earlier one was a conversion short, so
 * an int was consumed by %s). `value` is parenthesized so expressions such as
 * `A | B` compare as a whole.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define NEQCHECKGOTO(statement, value, RES, label)                                              \
    do {                                                                                        \
        if((statement) != (value)) {                                                            \
            RES = scclSystemError;                                                              \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)(RES), strerror(errno));                                                  \
            goto label;                                                                         \
        }                                                                                       \
    } while(0)

147
148
149
150
151
152
153
/**
 * Return scclSystemError from the enclosing function when `statement` is
 * LESS THAN OR EQUAL to `value`.
 *
 * `statement` is evaluated once. On failure the log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)"; the format string
 * has one conversion per argument (the earlier one was a conversion short, so
 * an int was consumed by %s). `value` is parenthesized so expressions such as
 * `A | B` compare as a whole.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define LECHECK(statement, value)                                                               \
    do {                                                                                        \
        if((statement) <= (value)) {                                                            \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)scclSystemError, strerror(errno));                                        \
            return scclSystemError;                                                             \
        }                                                                                       \
    } while(0)

156
157
158
159
160
161
162
/**
 * Return scclSystemError from the enclosing function when `statement` is
 * strictly LESS THAN `value`.
 *
 * `statement` is evaluated once. On failure the log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)"; the format string
 * has one conversion per argument (the earlier one was a conversion short, so
 * an int was consumed by %s). `value` is parenthesized so expressions such as
 * `A | B` compare as a whole.
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define LTCHECK(statement, value)                                                               \
    do {                                                                                        \
        if((statement) < (value)) {                                                             \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)scclSystemError, strerror(errno));                                        \
            return scclSystemError;                                                             \
        }                                                                                       \
    } while(0)

lishen's avatar
lishen committed
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
////////////////////////////// SYS //////////////////////////////

// Check system calls
// SYSCHECK(call, name): run `call` through SYSCHECKVAL with a scratch `int retval`
// that is discarded afterwards. Per SYSCHECKVAL/SYSCHECKSYNC, the call is retried
// while it yields -1 with errno EINTR/EWOULDBLOCK/EAGAIN, and any other -1 makes
// the enclosing function return scclSystemError.
// `name` is pasted between string literals further down, so it must be a string literal.
// Note: `retval` is declared inside the expansion, so a `call` expression that
// mentions a caller variable named `retval` would bind to this local instead.
#define SYSCHECK(call, name)             \
    do {                                 \
        int retval;                      \
        SYSCHECKVAL(call, name, retval); \
    } while(false)

/**
 * Run `call` via SYSCHECKSYNC, leaving its result in the caller-supplied
 * lvalue `retval`. When that result is -1, log "Call to <name> failed" with
 * strerror(errno) through WARN and return scclSystemError from the enclosing
 * function. `name` must be a string literal (it is concatenated into the
 * format string).
 */
#define SYSCHECKVAL(call, name, retval)                            \
    do {                                                           \
        SYSCHECKSYNC(call, name, retval);                          \
        if(retval != -1) {                                         \
            break; /* call succeeded: leave this wrapper */        \
        }                                                          \
        WARN("Call to " name " failed : %s", strerror(errno));     \
        return scclSystemError;                                    \
    } while(0)

/**
 * Evaluate `call` into `retval`, repeating it for as long as it yields -1 with
 * errno set to EINTR, EWOULDBLOCK or EAGAIN. Each repeat is logged through
 * INFO. Any other outcome (success or a different error) ends the loop with
 * `retval` holding the last result; no error is returned from here.
 * `name` must be a string literal (it is concatenated into the format string).
 */
#define SYSCHECKSYNC(call, name, retval)                                                     \
    do {                                                                                     \
        retval = call;                                                                       \
        if(retval != -1 || (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN)) {    \
            break; /* done: success, or an error that is not worth retrying */               \
        }                                                                                    \
        INFO(SCCL_LOG_CODEALL, "Call to " name " returned %s, retrying", strerror(errno));   \
    } while(1)

193
194
195
196
197
198
199
200
/**
 * When `statement` evaluates to -1, set RES to scclSystemError, log the
 * location and jump to `label`. RES is left untouched otherwise.
 *
 * `statement` is evaluated exactly once; unlike SYSCHECK there is no retry on
 * EINTR/EWOULDBLOCK/EAGAIN. The log line is
 * "<function>:<file>:<line> -> <code> (<strerror(errno)>)"; the format string
 * has one conversion per argument (the earlier one was a conversion short, so
 * an int was consumed by %s).
 * The expansion has no trailing semicolon; the call site supplies it.
 */
#define SYSCHECKGOTO(statement, RES, label)                                                     \
    do {                                                                                        \
        if((statement) == -1) {                                                                 \
            RES = scclSystemError;                                                              \
            INFO(SCCL_LOG_CODEALL, "%s:%s:%d -> %d (%s)", __func__, __FILE__, __LINE__,         \
                 (int)(RES), strerror(errno));                                                  \
            goto label;                                                                         \
        }                                                                                       \
    } while(0)

} // namespace sccl