logging.h 7.71 KB
Newer Older
Tim Moon's avatar
Tim Moon committed
1
/*************************************************************************
2
 * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Tim Moon's avatar
Tim Moon committed
3
4
5
6
7
8
9
10
 *
 * See LICENSE for license information.
 ************************************************************************/

#ifndef TRANSFORMER_ENGINE_COMMON_UTIL_LOGGING_H_
#define TRANSFORMER_ENGINE_COMMON_UTIL_LOGGING_H_

#include <cuda_runtime_api.h>
yuguo's avatar
yuguo committed
11
#ifdef __HIP_PLATFORM_AMD__
yuguo's avatar
yuguo committed
12
#include <rccl.h>
yuguo's avatar
yuguo committed
13
14
15
16
17
18
19
20
21
#ifdef USE_HIPBLASLT
#include <hipblaslt/hipblaslt.h>
#endif
#ifdef USE_ROCBLAS
#define ROCBLAS_BETA_FEATURES_API
#include <rocblas/rocblas.h>
#endif
#else
#include <cublas_v2.h>
Tim Moon's avatar
Tim Moon committed
22
#include <cudnn.h>
yuguo's avatar
yuguo committed
23
#endif // __HIP_PLATFORM_AMD__
Tim Moon's avatar
Tim Moon committed
24
25
#include <nvrtc.h>

wenjh's avatar
wenjh committed
26
#ifndef __HIP_PLATFORM_AMD__
Phuong Nguyen's avatar
Phuong Nguyen committed
27
#include "nccl.h"
wenjh's avatar
wenjh committed
28
#endif
Phuong Nguyen's avatar
Phuong Nguyen committed
29

30
31
32
33
#ifdef NVTE_WITH_CUBLASMP
#include <cublasmp.h>
#endif  // NVTE_WITH_CUBLASMP

34
#include <iostream>
35
#include <stdexcept>
36
#include <string>
37

Tim Moon's avatar
Tim Moon committed
38
39
#include "../util/string.h"

40
41
42
43
44
45
46
/*! \brief Print a warning to stderr, prefixed with the call site
 *
 * The arguments are joined with ::transformer_engine::concat_strings and
 * written to std::cerr as "<file>:<line> in function <func>: <message>"
 * followed by a newline. Nothing is thrown; execution continues after
 * the macro.
 */
#define NVTE_WARN(...)                                                   \
  do {                                                                   \
    /* Build the whole line first so it is streamed in one piece. */     \
    const auto message_NVTE_WARN = ::transformer_engine::concat_strings( \
        __FILE__ ":", __LINE__, " in function ", __func__, ": ",         \
        ::transformer_engine::concat_strings(__VA_ARGS__), "\n");        \
    std::cerr << message_NVTE_WARN;                                      \
  } while (false)

47
48
49
50
51
/*! \brief Throw a std::runtime_error describing the call site
 *
 * The arguments are joined with ::transformer_engine::concat_strings and
 * the exception message has the form
 * "<file>:<line> in function <func>: <message>".
 *
 * The do/while wrapper makes the macro behave as a single statement, so
 * it can be used in an unbraced if/else.
 */
#define NVTE_ERROR(...)                                              \
  do {                                                               \
    throw ::std::runtime_error(::transformer_engine::concat_strings( \
        __FILE__ ":", __LINE__, " in function ", __func__, ": ",     \
        ::transformer_engine::concat_strings(__VA_ARGS__)));         \
  } while (false)

54
55
56
57
58
59
/*! \brief Assert that a condition holds, throwing via NVTE_ERROR otherwise
 *
 * \param expr Condition to test. It is evaluated exactly once.
 * \param ...  Optional message pieces, joined with
 *             ::transformer_engine::concat_strings and appended after the
 *             stringified expression.
 *
 * On failure the error message starts with
 * "Assertion failed: <expr text>. " followed by the optional message.
 */
#define NVTE_CHECK(expr, ...)                                        \
  do {                                                               \
    if (!(expr)) {                                                   \
      NVTE_ERROR("Assertion failed: " #expr ". ",                    \
                 ::transformer_engine::concat_strings(__VA_ARGS__)); \
    }                                                                \
  } while (false)

yuguo's avatar
yuguo committed
62
63
64
65
66
67
68
69
70
/*! \brief Terminate the process if an NCCL call fails
 *
 * \param cmd Expression yielding an ncclResult_t. It is evaluated exactly
 *            once.
 *
 * On any result other than ncclSuccess, the source location and the text
 * from ncclGetErrorString() are written to stderr and the process exits
 * with EXIT_FAILURE. Unlike NVTE_CHECK_NCCL this does not throw, so no
 * stack unwinding takes place.
 *
 * The result is stored under a macro-specific name so that a caller
 * expression mentioning its own variable (e.g. `r`) is not captured by
 * the macro's local.
 */
#define NCCLCHECK(cmd)                                                   \
  do {                                                                   \
    const ncclResult_t status_NCCLCHECK = (cmd);                         \
    if (status_NCCLCHECK != ncclSuccess) {                               \
      /* Diagnostics belong on stderr, not stdout. */                    \
      std::cerr << "NCCL error " << __FILE__ << ":" << __LINE__ << ": '" \
                << ncclGetErrorString(status_NCCLCHECK) << "'\n";        \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (false)

71
72
73
74
75
76
/*! \brief Check the result of a CUDA runtime call
 *
 * \param expr Expression yielding a cudaError_t. It is evaluated exactly
 *             once.
 *
 * If the result is not cudaSuccess, NVTE_ERROR is invoked with
 * "CUDA Error: " followed by the text from cudaGetErrorString().
 */
#define NVTE_CHECK_CUDA(expr)                                                 \
  do {                                                                        \
    const cudaError_t status_NVTE_CHECK_CUDA = (expr);                        \
    if (status_NVTE_CHECK_CUDA != cudaSuccess) {                              \
      NVTE_ERROR("CUDA Error: ", cudaGetErrorString(status_NVTE_CHECK_CUDA)); \
    }                                                                         \
  } while (false)

yuguo's avatar
yuguo committed
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#ifdef __HIP_PLATFORM_AMD__
#ifdef USE_HIPBLASLT //hipblaslt
/*! \brief Check the result of a hipBLASLt call
 *
 * \param expr Expression yielding a hipblasStatus_t. It is evaluated
 *             exactly once.
 *
 * If the result is not HIPBLAS_STATUS_SUCCESS, NVTE_ERROR is invoked with
 * "HIPBLASLT Error: " followed by the numeric status code.
 */
#define NVTE_CHECK_HIPBLASLT(expr)                                            \
  do {                                                                        \
    const hipblasStatus_t status_NVTE_CHECK_HIPBLASLT = (expr);               \
    /* Compare against the hipBLAS enumerator: the cuBLAS header is not */    \
    /* included on the HIP code path.                                   */    \
    if (status_NVTE_CHECK_HIPBLASLT != HIPBLAS_STATUS_SUCCESS) {              \
      NVTE_ERROR("HIPBLASLT Error: ",                                         \
                 std::to_string(static_cast<int>(status_NVTE_CHECK_HIPBLASLT))); \
    }                                                                         \
  } while (false)
#endif
#ifdef USE_ROCBLAS //rocblas
/*! \brief Check the result of a rocBLAS call
 *
 * \param expr Expression yielding a rocblas_status. It is evaluated
 *             exactly once.
 *
 * If the result is not rocblas_status_success, NVTE_ERROR is invoked with
 * "ROCBLAS Error: " followed by the text from rocblas_status_to_string().
 */
#define NVTE_CHECK_ROCBLAS(expr)                                         \
  do {                                                                   \
    const rocblas_status status_NVTE_CHECK_ROCBLAS = (expr);             \
    if (rocblas_status_success != status_NVTE_CHECK_ROCBLAS) {           \
      /* NVTE_ERROR concatenates its arguments itself. */                \
      NVTE_ERROR("ROCBLAS Error: ",                                      \
                 rocblas_status_to_string(status_NVTE_CHECK_ROCBLAS));   \
    }                                                                    \
  } while (false)
#endif
#else //cublas
101
102
103
104
105
106
/*! \brief Check the result of a cuBLAS call
 *
 * \param expr Expression yielding a cublasStatus_t. It is evaluated
 *             exactly once.
 *
 * If the result is not CUBLAS_STATUS_SUCCESS, NVTE_ERROR is invoked with
 * "cuBLAS Error: " followed by the text from cublasGetStatusString().
 */
#define NVTE_CHECK_CUBLAS(expr)                                                      \
  do {                                                                               \
    const cublasStatus_t status_NVTE_CHECK_CUBLAS = (expr);                          \
    if (status_NVTE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) {                         \
      NVTE_ERROR("cuBLAS Error: ", cublasGetStatusString(status_NVTE_CHECK_CUBLAS)); \
    }                                                                                \
  } while (false)
yuguo's avatar
yuguo committed
108
#endif
Tim Moon's avatar
Tim Moon committed
109

110
111
112
113
114
115
116
117
118
119
/*! \brief Check the result of a cuDNN call
 *
 * \param expr Expression yielding a cudnnStatus_t. It is evaluated
 *             exactly once.
 *
 * If the result is not CUDNN_STATUS_SUCCESS, NVTE_ERROR is invoked with
 * "cuDNN Error: ", the text from cudnnGetErrorString(), and a hint on how
 * to enable cuDNN's own error logging through environment variables.
 */
#define NVTE_CHECK_CUDNN(expr)                                                  \
  do {                                                                          \
    const cudnnStatus_t status_NVTE_CHECK_CUDNN = (expr);                       \
    if (status_NVTE_CHECK_CUDNN != CUDNN_STATUS_SUCCESS) {                      \
      NVTE_ERROR("cuDNN Error: ", cudnnGetErrorString(status_NVTE_CHECK_CUDNN), \
                 ". "                                                           \
                 "For more information, enable cuDNN error logging "            \
                 "by setting CUDNN_LOGERR_DBG=1 and "                           \
                 "CUDNN_LOGDEST_DBG=stderr in the environment.");               \
    }                                                                           \
  } while (false)

122
123
124
125
126
127
128
129
130
131
/*! \brief Check the result of a cuDNN frontend call
 *
 * \param expr Expression yielding an error object that provides an
 *             is_bad() member function and an err_msg member. It is
 *             evaluated exactly once.
 *
 * If is_bad() returns true, NVTE_ERROR is invoked with "cuDNN Error: ",
 * the object's err_msg, and a hint on how to enable cuDNN's own error
 * logging through environment variables.
 *
 * The result is stored under a macro-specific name so that a caller
 * expression that itself mentions a variable called `error` still
 * compiles and refers to the caller's variable.
 */
#define NVTE_CHECK_CUDNN_FE(expr)                                      \
  do {                                                                 \
    const auto status_NVTE_CHECK_CUDNN_FE = (expr);                    \
    if (status_NVTE_CHECK_CUDNN_FE.is_bad()) {                         \
      NVTE_ERROR("cuDNN Error: ", status_NVTE_CHECK_CUDNN_FE.err_msg,  \
                 ". "                                                  \
                 "For more information, enable cuDNN error logging "   \
                 "by setting CUDNN_LOGERR_DBG=1 and "                  \
                 "CUDNN_LOGDEST_DBG=stderr in the environment.");      \
    }                                                                  \
  } while (false)

134
135
136
137
138
139
/*! \brief Check the result of an NVRTC call
 *
 * \param expr Expression yielding an nvrtcResult. It is evaluated exactly
 *             once.
 *
 * If the result is not NVRTC_SUCCESS, NVTE_ERROR is invoked with
 * "NVRTC Error: " followed by the text from nvrtcGetErrorString().
 */
#define NVTE_CHECK_NVRTC(expr)                                                   \
  do {                                                                           \
    const nvrtcResult status_NVTE_CHECK_NVRTC = (expr);                          \
    if (status_NVTE_CHECK_NVRTC != NVRTC_SUCCESS) {                              \
      NVTE_ERROR("NVRTC Error: ", nvrtcGetErrorString(status_NVTE_CHECK_NVRTC)); \
    }                                                                            \
  } while (false)

142
143
144
145
146
147
148
149
150
151
152
153
#ifdef NVTE_WITH_CUBLASMP

/*! \brief Check the result of a cuBLASMp call
 *
 * \param expr Expression yielding a cublasMpStatus_t. It is evaluated
 *             exactly once.
 *
 * If the result is not CUBLASMP_STATUS_SUCCESS, NVTE_ERROR is invoked
 * with "cuBLASMp Error: " followed by the numeric status code.
 *
 * The result is stored under a macro-specific name, like the other
 * NVTE_CHECK_* macros, so that a caller expression mentioning its own
 * `status` variable is not shadowed by the macro's local.
 */
#define NVTE_CHECK_CUBLASMP(expr)                                     \
  do {                                                                \
    const cublasMpStatus_t status_NVTE_CHECK_CUBLASMP = (expr);       \
    if (status_NVTE_CHECK_CUBLASMP != CUBLASMP_STATUS_SUCCESS) {      \
      NVTE_ERROR("cuBLASMp Error: ",                                  \
                 std::to_string(status_NVTE_CHECK_CUBLASMP));         \
    }                                                                 \
  } while (false)

#endif  // NVTE_WITH_CUBLASMP

Phuong Nguyen's avatar
Phuong Nguyen committed
154
155
156
157
158
159
160
161
/*! \brief Check the result of an NCCL call
 *
 * \param expr Expression yielding an ncclResult_t. It is evaluated
 *             exactly once.
 *
 * If the result is not ncclSuccess, NVTE_ERROR is invoked with
 * "NCCL Error: " followed by the text from ncclGetErrorString().
 */
#define NVTE_CHECK_NCCL(expr)                                       \
  do {                                                              \
    const ncclResult_t result_NVTE_CHECK_NCCL = (expr);             \
    if (ncclSuccess != result_NVTE_CHECK_NCCL) {                    \
      NVTE_ERROR("NCCL Error: ",                                    \
                 ncclGetErrorString(result_NVTE_CHECK_NCCL));       \
    }                                                               \
  } while (false)

Tim Moon's avatar
Tim Moon committed
162
#endif  // TRANSFORMER_ENGINE_COMMON_UTIL_LOGGING_H_