Unverified commit 6b311da2, authored by Tim Moon, committed by GitHub

Refactor logging macros (#382)



* Do not include logging macros in installed C headers
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug logging macros
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Debug C++ tests

Use Google style for header includes.
Signed-off-by: Tim Moon <tmoon@nvidia.com>
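For context, Google-style include ordering groups headers as: the file's own header first, then C system headers, then C++ standard library headers, then other libraries' headers, then project headers, alphabetized within each group. A minimal sketch of the convention (the file names are illustrative, not taken from this PR):

    // example.cpp -- illustrative Google-style include ordering
    #include "example.h"              // 1. the file's own header

    #include <cuda_runtime_api.h>     // 2. C / system headers

    #include <cstdint>                // 3. C++ standard library headers
    #include <vector>

    #include <pybind11/pybind11.h>    // 4. other libraries' headers

    #include "common/util/logging.h"  // 5. project headers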

* Update CUDA driver macros

Incorporating changes from #389.
Co-authored-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: Jan Bielak <jbielak@nvidia.com>
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Use core error checking macros in PyTorch extensions

Hack to get around macro redefinition warning.
Signed-off-by: Tim Moon <tmoon@nvidia.com>
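The hunks below show only the resulting includes, not the hack itself. One common pattern for avoiding a macro redefinition warning, offered here as an assumption rather than as what this PR necessarily does, is to drop any pre-existing definition before the authoritative header redefines it:

    // Hypothetical sketch: remove a colliding local definition so that
    // including the core header does not trigger a redefinition warning.
    #ifdef NVTE_CHECK_CUDA
    #undef NVTE_CHECK_CUDA
    #endif
    #include "common/util/logging.h"  // core definition now applies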

* Fix missing arg when getting CUDA driver error string
Signed-off-by: Tim Moon <tmoon@nvidia.com>
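The driver API's cuGetErrorString takes two arguments, a CUresult error code and a const char ** output pointer; the old helper (visible in the cuda_driver.h hunk below) passed only the output pointer. A minimal sketch of the corrected lookup using the repo's cuda_driver::call wrapper; the helper function name here is hypothetical:

    #include <cuda.h>

    // Fetch the human-readable description for a CUDA driver error code.
    inline const char *driver_error_string(CUresult status) {
      const char *desc = nullptr;
      ::transformer_engine::cuda_driver::call("cuGetErrorString", status, &desc);
      return desc;
    }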

* Reuse logging header in frameworks
Signed-off-by: Tim Moon <tmoon@nvidia.com>

---------
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: Jan Bielak <jbielak@nvidia.com>
parent 91b754e0
@@ -146,9 +146,6 @@ void nvte_multi_cast_transpose(size_t num_tensors,
  *  - `cast_output` is the result of the cast
  *  - `transposed_output` is the transposed result of the cast.
  *
- * Calling this function with workspace being an empty tensor will not perform the operation,
- * but instead set the shape and type of the workspace tensor to the required values.
- *
  * \param[in]  input        Input tensor of shape [N, H].
  * \param[in]  geglu_input  Tensor used as input to the forward of GeGLU operation.
  *                          Shape [N, H * 2].
@@ -43,30 +43,23 @@ inline CUresult call(const char *symbol, ArgTs... args) {
 }  // namespace transformer_engine
-namespace {
-/*! \brief Throw exception if CUDA driver call has failed */
-inline void check_cuda_driver_(CUresult status) {
-  if (status != CUDA_SUCCESS) {
-    const char *description;
-    transformer_engine::cuda_driver::call("cuGetErrorString", &description);
-    NVTE_ERROR(transformer_engine::concat_strings("CUDA Error: ", description));
-  }
-}
-/*! \brief Call CUDA driver function and throw exception if it fails */
-template <typename... ArgTs>
-inline void call_and_check_cuda_driver_(const char *symbol,
-                                        ArgTs &&... args) {
-  check_cuda_driver_(transformer_engine::cuda_driver::call(symbol,
-                                                           std::forward<ArgTs>(args)...));
-}
-}  // namespace
-#define NVTE_CHECK_CUDA_DRIVER(ans) { check_cuda_driver_(ans); }
-#define NVTE_CALL_CHECK_CUDA_DRIVER(func, ...) \
-  { call_and_check_cuda_driver_(#func, __VA_ARGS__); }
+#define NVTE_CHECK_CUDA_DRIVER(expr)                                    \
+  do {                                                                  \
+    const CUresult status_NVTE_CHECK_CUDA_DRIVER = (expr);              \
+    if (status_NVTE_CHECK_CUDA_DRIVER != CUDA_SUCCESS) {                \
+      const char *desc_NVTE_CHECK_CUDA_DRIVER;                          \
+      ::transformer_engine::cuda_driver::call(                          \
+          "cuGetErrorString", status_NVTE_CHECK_CUDA_DRIVER,            \
+          &desc_NVTE_CHECK_CUDA_DRIVER);                                \
+      NVTE_ERROR("CUDA Error: ", desc_NVTE_CHECK_CUDA_DRIVER);          \
+    }                                                                   \
+  } while (false)
+#define NVTE_CALL_CHECK_CUDA_DRIVER(symbol, ...)                        \
+  do {                                                                  \
+    NVTE_CHECK_CUDA_DRIVER(                                             \
+        ::transformer_engine::cuda_driver::call(#symbol, __VA_ARGS__)); \
+  } while (false)
 #endif  // TRANSFORMER_ENGINE_COMMON_UTIL_CUDA_DRIVER_H_
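Note the switch from bare-brace macro bodies to the do { ... } while (false) idiom: an invocation plus its trailing semicolon now forms exactly one statement, so the macros compose safely with unbraced if/else. A usage sketch; the surrounding function and its arguments are illustrative, not from this diff:

    #include <cuda.h>

    // Allocate device memory through the dynamically loaded driver API.
    // Any status other than CUDA_SUCCESS throws via NVTE_ERROR.
    void alloc_device_buffer(CUdeviceptr *ptr, size_t bytes) {
      if (bytes > 0)
        NVTE_CALL_CHECK_CUDA_DRIVER(cuMemAlloc, ptr, bytes);  // expands to a single statement
    }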
/*************************************************************************
 * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/

#ifndef TRANSFORMER_ENGINE_COMMON_UTIL_LOGGING_H_
#define TRANSFORMER_ENGINE_COMMON_UTIL_LOGGING_H_

#include <stdexcept>

#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include <cudnn.h>
#include <nvrtc.h>

#include "../util/string.h"

#define NVTE_ERROR(...)                                                 \
  do {                                                                  \
    throw ::std::runtime_error(                                         \
        ::transformer_engine::concat_strings(                           \
            __FILE__ ":", __LINE__, " in function ", __func__, ": ",    \
            ::transformer_engine::concat_strings(__VA_ARGS__)));        \
  } while (false)

#define NVTE_CHECK(expr, ...)                                           \
  do {                                                                  \
    if (!(expr)) {                                                      \
      NVTE_ERROR("Assertion failed: " #expr ". ",                       \
                 ::transformer_engine::concat_strings(__VA_ARGS__));    \
    }                                                                   \
  } while (false)

#define NVTE_CHECK_CUDA(expr)                                           \
  do {                                                                  \
    const cudaError_t status_NVTE_CHECK_CUDA = (expr);                  \
    if (status_NVTE_CHECK_CUDA != cudaSuccess) {                        \
      NVTE_ERROR("CUDA Error: ",                                        \
                 cudaGetErrorString(status_NVTE_CHECK_CUDA));           \
    }                                                                   \
  } while (false)

#define NVTE_CHECK_CUBLAS(expr)                                         \
  do {                                                                  \
    const cublasStatus_t status_NVTE_CHECK_CUBLAS = (expr);             \
    if (status_NVTE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) {            \
      NVTE_ERROR("cuBLAS Error: ",                                      \
                 cublasGetStatusString(status_NVTE_CHECK_CUBLAS));      \
    }                                                                   \
  } while (false)

#define NVTE_CHECK_CUDNN(expr)                                          \
  do {                                                                  \
    const cudnnStatus_t status_NVTE_CHECK_CUDNN = (expr);               \
    if (status_NVTE_CHECK_CUDNN != CUDNN_STATUS_SUCCESS) {              \
      NVTE_ERROR("cuDNN Error: ",                                       \
                 cudnnGetErrorString(status_NVTE_CHECK_CUDNN),          \
                 ". "                                                   \
                 "For more information, enable cuDNN error logging "    \
                 "by setting CUDNN_LOGERR_DBG=1 and "                   \
                 "CUDNN_LOGDEST_DBG=stderr in the environment.");       \
    }                                                                   \
  } while (false)

#define NVTE_CHECK_NVRTC(expr)                                          \
  do {                                                                  \
    const nvrtcResult status_NVTE_CHECK_NVRTC = (expr);                 \
    if (status_NVTE_CHECK_NVRTC != NVRTC_SUCCESS) {                     \
      NVTE_ERROR("NVRTC Error: ",                                       \
                 nvrtcGetErrorString(status_NVTE_CHECK_NVRTC));         \
    }                                                                   \
  } while (false)

#endif  // TRANSFORMER_ENGINE_COMMON_UTIL_LOGGING_H_
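Each of these macros throws std::runtime_error on failure, with the file, line, and function baked into the message by NVTE_ERROR. A brief usage sketch; the function and its arguments are illustrative, not from this diff:

    #include <cuda_runtime_api.h>

    // Validate arguments, then error-check a runtime-API call, one line each.
    void copy_device_buffer(void *dst, const void *src, size_t bytes,
                            cudaStream_t stream) {
      NVTE_CHECK(dst != nullptr && src != nullptr,
                 "null buffer passed to copy_device_buffer");
      NVTE_CHECK_CUDA(cudaMemcpyAsync(dst, src, bytes,
                                      cudaMemcpyDeviceToDevice, stream));
    }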
@@ -7,19 +7,18 @@
 #ifndef TRANSFORMER_ENGINE_JAX_CSRC_FP8_MODULES_H_
 #define TRANSFORMER_ENGINE_JAX_CSRC_FP8_MODULES_H_
+#include <cuda_runtime_api.h>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <vector>
-#include <cuda_runtime_api.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include "transformer_engine/fused_attn.h"
-#include "transformer_engine/logging.h"
-#include "transformer_engine/transformer_engine.h"
+#include "common/util/logging.h"
+#include <transformer_engine/fused_attn.h>
+#include <transformer_engine/transformer_engine.h>
 namespace transformer_engine {
 namespace jax {
@@ -7,15 +7,16 @@
 #ifndef TRANSFORMER_ENGINE_JAX_CSRC_UTILS_H_
 #define TRANSFORMER_ENGINE_JAX_CSRC_UTILS_H_
+#include <pybind11/pybind11.h>
 #include <cstdint>
 #include <numeric>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
-#include "transformer_engine/fused_attn.h"
-#include "transformer_engine/logging.h"
-#include <pybind11/pybind11.h>
+#include "common/util/logging.h"
+#include <transformer_engine/fused_attn.h>
 namespace transformer_engine {
 namespace jax {
@@ -5,22 +5,23 @@
 ************************************************************************/
 #pragma once
-#include <cstdlib>
-#include <vector>
+#include <cublasLt.h>
-#include "paddle/extension.h"
-#include "paddle/phi/backends/all_context.h"
+#include "common/util/logging.h"
 #include <transformer_engine/activation.h>
 #include <transformer_engine/cast.h>
 #include <transformer_engine/fused_attn.h>
 #include <transformer_engine/gemm.h>
 #include <transformer_engine/layer_norm.h>
-#include <transformer_engine/logging.h>
 #include <transformer_engine/rmsnorm.h>
 #include <transformer_engine/softmax.h>
 #include <transformer_engine/transformer_engine.h>
 #include <transformer_engine/transpose.h>
+#include <cstdlib>
+#include <vector>
+#include "paddle/extension.h"
+#include "paddle/phi/backends/all_context.h"
 namespace transformer_engine {
 namespace paddle_ext {
@@ -6,8 +6,9 @@
 #include <cub/cub.cuh>
 #include <vector>
-#include "../common.h"
+#include "common.h"
+#include "common/common.h"
 namespace transformer_engine {
 namespace paddle_ext {
@@ -4,19 +4,22 @@
  * See LICENSE for license information.
  ************************************************************************/
-#include "userbuffers/userbuffers.h"
-#include <stdio.h>
-#include <stdlib.h>
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cuda.h>
 #include <cuda_fp8.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <torch/cuda.h>
 #include <torch/custom_class.h>
 #include <torch/extension.h>
 #include <torch/types.h>
+#include "common/util/logging.h"
+#include "userbuffers/userbuffers.h"
 #define HALF_BYTES 2
 #define UB_MAX_SM 32
@@ -7,39 +7,40 @@
 #ifndef TRANSFORMER_ENGINE_PYTORCH_CSRC_COMMON_H_
 #define TRANSFORMER_ENGINE_PYTORCH_CSRC_COMMON_H_
-#include <transformer_engine/gemm.h>
-#include <transformer_engine/layer_norm.h>
-#include <transformer_engine/rmsnorm.h>
-#include <transformer_engine/transpose.h>
-#include <transformer_engine/activation.h>
-#include <transformer_engine/logging.h>
-#include <transformer_engine/transformer_engine.h>
-#include <transformer_engine/cast.h>
-#include <transformer_engine/softmax.h>
-#include <transformer_engine/fused_attn.h>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <stdexcept>
+#include <vector>
 #include <ATen/ATen.h>
-#include <ATen/cudnn/Handle.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/macros/Macros.h>
 #include <ATen/Dispatch.h>
-#include <ATen/native/DispatchStub.h>
+#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
-#include <torch/extension.h>
-#include <torch/torch.h>
+#include <ATen/cudnn/Handle.h>
+#include <ATen/native/DispatchStub.h>
+#include <c10/macros/Macros.h>
+#include <cublasLt.h>
 #include <cuda.h>
-#include <cuda_runtime.h>
 #include <cuda_bf16.h>
-#include <cublasLt.h>
+#include <cuda_runtime.h>
 #include <cudnn.h>
-#include <stdexcept>
-#include <memory>
-#include <iomanip>
-#include <random>
-#include <cstring>
-#include <vector>
-#include <iostream>
+#include <torch/extension.h>
+#include <torch/torch.h>
+#include "common/util/logging.h"
+#include <transformer_engine/activation.h>
+#include <transformer_engine/cast.h>
+#include <transformer_engine/fused_attn.h>
+#include <transformer_engine/gemm.h>
+#include <transformer_engine/layer_norm.h>
+#include <transformer_engine/rmsnorm.h>
+#include <transformer_engine/softmax.h>
+#include <transformer_engine/transformer_engine.h>
+#include <transformer_engine/transpose.h>
 namespace transformer_engine {
@@ -5,7 +5,7 @@
 ************************************************************************/
 #include "common.h"
-#include "../common.h"
+#include "common/common.h"
 NVTE_Fused_Attn_Backend get_fused_attn_backend(
     const transformer_engine::DType q_dtype,