// !!! This is a file automatically generated by hipify!!! /** * Copyright (c) 2022 by Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * @file array/cuda/bf16.cuh * @brief bfloat16 related functions. */ #ifndef DGL_ARRAY_CUDA_BF16_CUH_ #define DGL_ARRAY_CUDA_BF16_CUH_ #include #if BF16_ENABLED #include #include static __device__ __forceinline__ __hip_bfloat16 max(__hip_bfloat16 a, __hip_bfloat16 b) { #if defined(__HIP_DEVICE_COMPILE__) return __hmax(a, b); #else return __hip_bfloat16(max(float(a), float(b))); // NOLINT #endif } static __device__ __forceinline__ __hip_bfloat16 min(__hip_bfloat16 a, __hip_bfloat16 b) { #if defined(__HIP_DEVICE_COMPILE__) return __hmin(a, b); #else return __hip_bfloat16(min(float(a), float(b))); // NOLINT #endif } #if HIP_VERSION_MAJOR < 6 // Arithmetic BF16 operations for architecture >= 8.0 are already defined in // hip/__hip_bfloat16.h // #if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800) // // CUDA 12.2 adds "emulated" support for older architectures. // #if defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020) __device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return __hip_bfloat16(float(lh) + float(rh)); // NOLINT } __device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return __hip_bfloat16(float(lh) - float(rh)); // NOLINT } __device__ __forceinline__ __hip_bfloat16 operator*(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return __hip_bfloat16(float(lh) * float(rh)); // NOLINT } __device__ __forceinline__ __hip_bfloat16 operator/(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return __hip_bfloat16(float(lh) / float(rh)); // NOLINT } __device__ __forceinline__ __hip_bfloat16& operator+=( __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT lh = __hip_bfloat16(float(lh) + float(rh)); // NOLINT return lh; } __device__ __forceinline__ __hip_bfloat16& operator-=( __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT lh = __hip_bfloat16(float(lh) - float(rh)); // NOLINT return lh; } __device__ __forceinline__ __hip_bfloat16& operator*=( __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT lh = __hip_bfloat16(float(lh) * float(rh)); // NOLINT return lh; } __device__ __forceinline__ __hip_bfloat16& operator/=( __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT lh = __hip_bfloat16(float(lh) / float(rh)); // NOLINT return lh; } __device__ __forceinline__ __hip_bfloat16& operator++( __hip_bfloat16& h) { // NOLINT h = __hip_bfloat16(float(h) + 1.0f); // NOLINT return h; } __device__ __forceinline__ __hip_bfloat16& operator--( __hip_bfloat16& h) { // NOLINT h = __hip_bfloat16(float(h) - 1.0f); // NOLINT return h; } __device__ __forceinline__ __hip_bfloat16 operator++(__hip_bfloat16& h, int) { // NOLINT __hip_bfloat16 ret = h; h = __hip_bfloat16(float(h) + 1.0f); // NOLINT return ret; } __device__ __forceinline__ __hip_bfloat16 operator--(__hip_bfloat16& h, int) { // NOLINT __hip_bfloat16 ret = h; h = __hip_bfloat16(float(h) - 1.0f); // NOLINT return ret; } __device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) { return h; } __device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) { return __hip_bfloat16(-float(h)); // NOLINT } __device__ __forceinline__ bool operator==( const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) == float(rh); // NOLINT } __device__ __forceinline__ bool operator!=( const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) != float(rh); // NOLINT } __device__ __forceinline__ bool operator>( const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) > float(rh); // NOLINT } __device__ __forceinline__ bool operator<( const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) < float(rh); // NOLINT } __device__ __forceinline__ bool operator>=( const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) >= float(rh); // NOLINT } __device__ __forceinline__ bool operator<=( const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) <= float(rh); // NOLINT } // #endif // defined(DTKRT_VERSION) && (DTKRT_VERSION < 12020) // #endif // defined(__DTK_ARCH__) && (__DTK_ARCH__ < 800) #endif #if __HIPCC__ __device__ inline __hip_bfloat16 __shfl_down(__hip_bfloat16 var, unsigned int lane_delta, int width = warpSize) { union { unsigned short s; __hip_bfloat16 us; } tmp; tmp.us = var; tmp.s = __shfl_down(tmp.s, lane_delta, width); return tmp.us; } #endif // __HIPCC__ #endif // BF16_ENABLED #endif // DGL_ARRAY_CUDA_BF16_CUH_