Unverified Commit a5d8460c authored by ndickson-nvidia's avatar ndickson-nvidia Committed by GitHub
Browse files

[Bug][Feature] Added more missing FP16 specializations (#4140)

* * Added missing specializations for `__half` of `DLDataTypeTraits`, `IndexSelect`, `Full`, `Scatter_`, `CSRGetData`, `CSRMM`, `CSRSum`, `IndexSelectCPUFromGPU`
* Fixed casting issue in `_LinearSearchKernel` that was preventing it from supporting `__half`
* Added `#if`'d out specializations of `CSRGEMM`, `CSRGEAM`, and `Xgeam`, which would require functions that aren't currently provided by cublas

* * Added more specific error messages for unimplemented FP16 specializations of Xgeam, CSRGEMM, and CSRGEAM

* * Added missing instantiation of DLDataTypeTraits<__half>::dtype

* * Fixed linter error
* Added clearer comment explaining why the cast to long long is necessary

* * Worked around a compile error in some particular setup, where __half can't be constructed on the host side

* * Fixed linter formatting errors

* * Changes to comments as recommended

* * Made recommended changes to logging errors in FP16 specializations
* Also changed the existing Xgeam function for unsupported data types from LOG(INFO) to LOG(FATAL)
parent b8f905f1
...@@ -18,6 +18,10 @@ ...@@ -18,6 +18,10 @@
#include "serializer.h" #include "serializer.h"
#include "shared_mem.h" #include "shared_mem.h"
#ifdef DGL_USE_CUDA
#include <cuda_fp16.h>
#endif
// forward declaration // forward declaration
inline std::ostream& operator << (std::ostream& os, DGLType t); inline std::ostream& operator << (std::ostream& os, DGLType t);
...@@ -46,6 +50,11 @@ GEN_DLDATATYPETRAITS_FOR(int64_t, kDLInt, 64); ...@@ -46,6 +50,11 @@ GEN_DLDATATYPETRAITS_FOR(int64_t, kDLInt, 64);
// converting uints to signed DTypes. // converting uints to signed DTypes.
GEN_DLDATATYPETRAITS_FOR(uint32_t, kDLInt, 32); GEN_DLDATATYPETRAITS_FOR(uint32_t, kDLInt, 32);
GEN_DLDATATYPETRAITS_FOR(uint64_t, kDLInt, 64); GEN_DLDATATYPETRAITS_FOR(uint64_t, kDLInt, 64);
#ifdef DGL_USE_CUDA
#ifdef USE_FP16
GEN_DLDATATYPETRAITS_FOR(__half, kDLFloat, 16);
#endif
#endif
GEN_DLDATATYPETRAITS_FOR(float, kDLFloat, 32); GEN_DLDATATYPETRAITS_FOR(float, kDLFloat, 32);
GEN_DLDATATYPETRAITS_FOR(double, kDLFloat, 64); GEN_DLDATATYPETRAITS_FOR(double, kDLFloat, 64);
#undef GEN_DLDATATYPETRAITS_FOR #undef GEN_DLDATATYPETRAITS_FOR
......
...@@ -55,6 +55,10 @@ template NDArray IndexSelect<kDLGPU, int32_t, int32_t>(NDArray, IdArray); ...@@ -55,6 +55,10 @@ template NDArray IndexSelect<kDLGPU, int32_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int32_t, int64_t>(NDArray, IdArray); template NDArray IndexSelect<kDLGPU, int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int64_t, int32_t>(NDArray, IdArray); template NDArray IndexSelect<kDLGPU, int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int64_t, int64_t>(NDArray, IdArray); template NDArray IndexSelect<kDLGPU, int64_t, int64_t>(NDArray, IdArray);
#ifdef USE_FP16
template NDArray IndexSelect<kDLGPU, __half, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, __half, int64_t>(NDArray, IdArray);
#endif
template NDArray IndexSelect<kDLGPU, float, int32_t>(NDArray, IdArray); template NDArray IndexSelect<kDLGPU, float, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, float, int64_t>(NDArray, IdArray); template NDArray IndexSelect<kDLGPU, float, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray); template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
...@@ -63,18 +67,30 @@ template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray); ...@@ -63,18 +67,30 @@ template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType> template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, int64_t index) { DType IndexSelect(NDArray array, int64_t index) {
auto device = runtime::DeviceAPI::Get(array->ctx); auto device = runtime::DeviceAPI::Get(array->ctx);
#ifdef USE_FP16
// The initialization constructor for __half is apparently a device-
// only function in some setups, but the current function, IndexSelect,
// isn't run on the device, so it doesn't have access to that constructor.
using SafeDType = typename std::conditional<
std::is_same<DType, __half>::value, uint16_t, DType>::type;
SafeDType ret = 0;
#else
DType ret = 0; DType ret = 0;
#endif
device->CopyDataFromTo( device->CopyDataFromTo(
static_cast<DType*>(array->data) + index, 0, &ret, 0, static_cast<DType*>(array->data) + index, 0, reinterpret_cast<DType*>(&ret), 0,
sizeof(DType), array->ctx, DLContext{kDLCPU, 0}, sizeof(DType), array->ctx, DLContext{kDLCPU, 0},
array->dtype, nullptr); array->dtype, nullptr);
return ret; return reinterpret_cast<DType&>(ret);
} }
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index); template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index); template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index); template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index); template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index);
#ifdef USE_FP16
template __half IndexSelect<kDLGPU, __half>(NDArray array, int64_t index);
#endif
template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index); template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index); template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index);
......
...@@ -224,6 +224,9 @@ NDArray Full(DType val, int64_t length, DLContext ctx) { ...@@ -224,6 +224,9 @@ NDArray Full(DType val, int64_t length, DLContext ctx) {
template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx); template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx); template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx);
#ifdef USE_FP16
template IdArray Full<kDLGPU, __half>(__half val, int64_t length, DLContext ctx);
#endif
template IdArray Full<kDLGPU, float>(float val, int64_t length, DLContext ctx); template IdArray Full<kDLGPU, float>(float val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, double>(double val, int64_t length, DLContext ctx); template IdArray Full<kDLGPU, double>(double val, int64_t length, DLContext ctx);
......
...@@ -39,10 +39,16 @@ void Scatter_(IdArray index, NDArray value, NDArray out) { ...@@ -39,10 +39,16 @@ void Scatter_(IdArray index, NDArray value, NDArray out) {
template void Scatter_<kDLGPU, int32_t, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, int32_t, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDLGPU, int64_t, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, int64_t, int32_t>(IdArray, NDArray, NDArray);
#ifdef USE_FP16
template void Scatter_<kDLGPU, __half, int32_t>(IdArray, NDArray, NDArray);
#endif
template void Scatter_<kDLGPU, float, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, float, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDLGPU, double, int32_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, double, int32_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDLGPU, int32_t, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, int32_t, int64_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDLGPU, int64_t, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, int64_t, int64_t>(IdArray, NDArray, NDArray);
#ifdef USE_FP16
template void Scatter_<kDLGPU, __half, int64_t>(IdArray, NDArray, NDArray);
#endif
template void Scatter_<kDLGPU, float, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, float, int64_t>(IdArray, NDArray, NDArray);
template void Scatter_<kDLGPU, double, int64_t>(IdArray, NDArray, NDArray); template void Scatter_<kDLGPU, double, int64_t>(IdArray, NDArray, NDArray);
......
...@@ -52,6 +52,12 @@ NDArray CSRGetData( ...@@ -52,6 +52,12 @@ NDArray CSRGetData(
return rst; return rst;
} }
#ifdef USE_FP16
template NDArray CSRGetData<kDLGPU, int32_t, __half>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, NDArray weights, __half filler);
template NDArray CSRGetData<kDLGPU, int64_t, __half>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, NDArray weights, __half filler);
#endif
template NDArray CSRGetData<kDLGPU, int32_t, float>( template NDArray CSRGetData<kDLGPU, int32_t, float>(
CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, NDArray weights, float filler); CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, NDArray weights, float filler);
template NDArray CSRGetData<kDLGPU, int64_t, float>( template NDArray CSRGetData<kDLGPU, int64_t, float>(
......
...@@ -253,6 +253,12 @@ std::pair<CSRMatrix, NDArray> CSRMM( ...@@ -253,6 +253,12 @@ std::pair<CSRMatrix, NDArray> CSRMM(
} }
} }
#ifdef USE_FP16
template std::pair<CSRMatrix, NDArray> CSRMM<kDLGPU, int32_t, __half>(
const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
template std::pair<CSRMatrix, NDArray> CSRMM<kDLGPU, int64_t, __half>(
const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
#endif
template std::pair<CSRMatrix, NDArray> CSRMM<kDLGPU, int32_t, float>( template std::pair<CSRMatrix, NDArray> CSRMM<kDLGPU, int32_t, float>(
const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); const CSRMatrix&, NDArray, const CSRMatrix&, NDArray);
template std::pair<CSRMatrix, NDArray> CSRMM<kDLGPU, int64_t, float>( template std::pair<CSRMatrix, NDArray> CSRMM<kDLGPU, int64_t, float>(
......
...@@ -166,6 +166,12 @@ std::pair<CSRMatrix, NDArray> CSRSum( ...@@ -166,6 +166,12 @@ std::pair<CSRMatrix, NDArray> CSRSum(
} }
} }
#ifdef USE_FP16
template std::pair<CSRMatrix, NDArray> CSRSum<kDLGPU, int32_t, __half>(
const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
template std::pair<CSRMatrix, NDArray> CSRSum<kDLGPU, int64_t, __half>(
const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
#endif
template std::pair<CSRMatrix, NDArray> CSRSum<kDLGPU, int32_t, float>( template std::pair<CSRMatrix, NDArray> CSRSum<kDLGPU, int32_t, float>(
const std::vector<CSRMatrix>&, const std::vector<NDArray>&); const std::vector<CSRMatrix>&, const std::vector<NDArray>&);
template std::pair<CSRMatrix, NDArray> CSRSum<kDLGPU, int64_t, float>( template std::pair<CSRMatrix, NDArray> CSRSum<kDLGPU, int64_t, float>(
......
...@@ -34,6 +34,32 @@ struct CSRGEMM { ...@@ -34,6 +34,32 @@ struct CSRGEMM {
} }
}; };
#ifdef USE_FP16
// FP16 specialization of the CSRGEMM dispatch struct. cusparse provides no
// half-precision csrgemm2 routines, so the value-dependent entry points abort
// via LOG(FATAL); only the dtype-independent nnz computation is forwarded.
template <>
struct CSRGEMM<__half> {
  // Structure-only (index) computation: independent of the value dtype, so it
  // can be delegated directly to cusparse.
  template <typename... Args>
  static cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgemm2Nnz(args...);
  }

  template <typename... Args>
  static cusparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16).";
    // Unreachable: LOG(FATAL) aborts. The value only satisfies the signature.
    return static_cast<cusparseStatus_t>(0);
  }

  template <typename... Args>
  static cusparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgemm2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16).";
    // Unreachable: LOG(FATAL) aborts. The value only satisfies the signature.
    return static_cast<cusparseStatus_t>(0);
  }
};
#endif
template <> template <>
struct CSRGEMM<float> { struct CSRGEMM<float> {
template <typename... Args> template <typename... Args>
...@@ -91,6 +117,32 @@ struct CSRGEAM { ...@@ -91,6 +117,32 @@ struct CSRGEAM {
} }
}; };
#ifdef USE_FP16
// FP16 specialization of the CSRGEAM dispatch struct. cusparse provides no
// half-precision csrgeam2 routines, so the value-dependent entry points abort
// via LOG(FATAL); only the dtype-independent nnz computation is forwarded.
template <>
struct CSRGEAM<__half> {
  // Structure-only (index) computation: independent of the value dtype, so it
  // can be delegated directly to cusparse.
  template <typename... Args>
  static cusparseStatus_t nnz(Args... args) {
    return cusparseXcsrgeam2Nnz(args...);
  }

  template <typename... Args>
  static cusparseStatus_t bufferSizeExt(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16).";
    // Unreachable: LOG(FATAL) aborts. The value only satisfies the signature.
    return static_cast<cusparseStatus_t>(0);
  }

  template <typename... Args>
  static cusparseStatus_t compute(Args... args) {
    // TODO(ndickson): There is no cusparseHcsrgeam2, so a different
    // implementation would be required.
    LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16).";
    // Unreachable: LOG(FATAL) aborts. The value only satisfies the signature.
    return static_cast<cusparseStatus_t>(0);
  }
};
#endif
template <> template <>
struct CSRGEAM<float> { struct CSRGEAM<float> {
template <typename... Args> template <typename... Args>
......
...@@ -28,10 +28,24 @@ cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa, ...@@ -28,10 +28,24 @@ cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
const DType* alpha, const DType* A, int lda, const DType* alpha, const DType* A, int lda,
const DType* beta, const DType* B, int ldb, const DType* beta, const DType* B, int ldb,
DType* C, int ldc) { DType* C, int ldc) {
LOG(INFO) << "Not supported dtype"; LOG(FATAL) << "Not supported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED; return CUBLAS_STATUS_EXECUTION_FAILED;
} }
#ifdef USE_FP16
// FP16 specialization of Xgeam. Always fails: cublas exposes no
// half-precision geam routine, so this aborts via LOG(FATAL) rather than
// silently falling through to the generic "Not supported dtype" template.
template <>
cublasStatus_t Xgeam<__half>(cublasHandle_t handle, cublasOperation_t transa,
                             cublasOperation_t transb, int m, int n,
                             const __half* alpha, const __half* A, int lda,
                             const __half* beta, const __half* B, int ldb,
                             __half* C, int ldc) {
  // TODO(ndickson): There is no cublasHgeam, so a different
  // implementation would be required.
  LOG(FATAL) << "Xgeam does not support dtype half (FP16)";
  // Unreachable: LOG(FATAL) aborts. The value only satisfies the signature.
  return CUBLAS_STATUS_EXECUTION_FAILED;
}
#endif
template <> template <>
cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa, cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, cublasOperation_t transb, int m, int n,
......
...@@ -166,10 +166,19 @@ __global__ void _LinearSearchKernel( ...@@ -166,10 +166,19 @@ __global__ void _LinearSearchKernel(
break; break;
} }
} }
if (v == -1) if (v == -1) {
out[tx] = filler; out[tx] = filler;
else } else {
out[tx] = weights ? weights[v] : v; // The casts here are to be able to handle DType being __half.
// GCC treats int64_t as a distinct type from long long, so
// without the explcit cast to long long, it errors out saying
// that the implicit cast results in an ambiguous choice of
// constructor for __half.
// The using statement is to avoid a linter error about using
// long or long long.
using LongLong = long long; // NOLINT
out[tx] = weights ? weights[v] : DType(LongLong(v));
}
tx += stride_x; tx += stride_x;
} }
} }
......
...@@ -24,6 +24,9 @@ constexpr DLDataType DLDataTypeTraits<int32_t>::dtype; ...@@ -24,6 +24,9 @@ constexpr DLDataType DLDataTypeTraits<int32_t>::dtype;
constexpr DLDataType DLDataTypeTraits<int64_t>::dtype; constexpr DLDataType DLDataTypeTraits<int64_t>::dtype;
constexpr DLDataType DLDataTypeTraits<uint32_t>::dtype; constexpr DLDataType DLDataTypeTraits<uint32_t>::dtype;
constexpr DLDataType DLDataTypeTraits<uint64_t>::dtype; constexpr DLDataType DLDataTypeTraits<uint64_t>::dtype;
#ifdef USE_FP16
constexpr DLDataType DLDataTypeTraits<__half>::dtype;
#endif
constexpr DLDataType DLDataTypeTraits<float>::dtype; constexpr DLDataType DLDataTypeTraits<float>::dtype;
constexpr DLDataType DLDataTypeTraits<double>::dtype; constexpr DLDataType DLDataTypeTraits<double>::dtype;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment