/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

/* Generate parameters for our error bound model of floating point average
 * (sum of scaled values) by sampling sums of random sequences for each
 * floating point type.
 *
 * The model has parameters "coef" and "power", where two floats a & b are
 * close enough if and only if:
 *
 *   abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power)
 *
 * where intBits(x) is the reinterpretation of the float's bit pattern as an
 * integer.
 *
 * Compile with:
 *   hipcc <this file>
 */

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>

#include <cstdio>
#include <cstdint>
#include <type_traits>

using std::uint64_t;
using std::uint32_t;
using std::uint16_t;

using bfloat16 = hip_bfloat16;

// Per-type metadata plus make/convert/add/mul helpers so the kernel below
// can be written generically over the floating point type F.
template<typename F>
struct float_traits;

template<>
struct float_traits<float> {
  static constexpr int mantissa_bits = 23;
  static constexpr int exponent_bits = 8;
  using uint_t = uint32_t;
  __device__ static float make(double x) { return (float)x; }
  __device__ static float make(uint64_t x) { return (float)x; }
  __device__ static double todouble(float x) { return x; }
  __device__ static float add(float a, float b) { return a+b; }
  __device__ static float mul(float a, float b) { return a*b; }
};

template<>
struct float_traits<double> {
  static constexpr int mantissa_bits = 52;
  static constexpr int exponent_bits = 11;
  using uint_t = uint64_t;
  __device__ static double make(double x) { return x; }
  __device__ static double make(uint64_t x) { return (double)x; }
  __device__ static double todouble(double x) { return x; }
  __device__ static double add(double a, double b) { return a+b; }
  __device__ static double mul(double a, double b) { return a*b; }
};

template<>
struct float_traits<__half> {
  static constexpr int mantissa_bits = 10;
  static constexpr int exponent_bits = 5;
  using uint_t = uint16_t;
  __device__ static __half make(double x) { return __float2half((float)x); }
  __device__ static __half make(uint64_t x) { return __int2half_rn((int)x); }
  __device__ static double todouble(__half x) { return __half2float(x); }
  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};

template<>
struct float_traits<bfloat16> {
  static constexpr int mantissa_bits = 7;
  static constexpr int exponent_bits = 8;
  using uint_t = uint16_t;
  __device__ static bfloat16 make(double x) { return bfloat16((float)x); }
  __device__ static bfloat16 make(uint64_t x) { return bfloat16((float)x); }
  __device__ static double todouble(bfloat16 x) { return double(x); }
  // hip_bfloat16 has no device arithmetic intrinsics; round-trip through float.
  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16((float)a + (float)b); }
  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16((float)a * (float)b); }
};

// Distance between a and b: reinterpret each bit pattern as an integer and
// return the absolute difference.
template<typename F>
__device__ int compare(F a, F b) {
  union { typename float_traits<F>::uint_t ua; F fa; };
  union { typename float_traits<F>::uint_t ub; F fb; };
  ua = 0; ub = 0;
  fa = a; fb = b;
  //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
  return ua < ub ? ub-ua : ua-ub;
}
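/* For illustration only: a sketch of how the model from the header comment
 * could be applied using compare(), given coef/power values produced by this
 * program. This helper is hypothetical and is not used by the generator.
 */
template<typename F>
__device__ bool close_enough(F a, F b, int rank_n, float coef, float power) {
  // Accept a and b iff their bit distance is within the modeled bound.
  return compare(a, b) <= 1 + (int)(coef * powf((float)rank_n, power));
}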
// xoshiro256** PRNG (Blackman & Vigna), seeded per thread.
struct xoshiro256ss {
  uint64_t s[4];
  __device__ xoshiro256ss(int seed) {
    constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
    for(int i=0; i < 4; i++)
      s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
  }
  __device__ uint64_t operator()() {
    auto rol64 = [](uint64_t x, int k) {
      return (x << k) | (x >> (64 - k));
    };
    uint64_t const result = rol64(s[1] * 5, 7) * 9;
    uint64_t const t = s[1] << 17;
    s[2] ^= s[0];
    s[3] ^= s[1];
    s[1] ^= s[2];
    s[0] ^= s[3];
    s[2] ^= t;
    s[3] = rol64(s[3], 45);
    return result;
  }
};

// HIP stand-in for CUDA's __reduce_max_sync builtin.
static __device__ int __reduce_max_sync(unsigned int mask, int value) {
  // We ignore mask, since all bits are set when calling it from the test
  // code below.
  int width = warpSize;
  for (unsigned int i = warpSize; i; i >>= 1) {
    value = max(__shfl_down(value, i, width), value);
  }
  return value;
}

template<typename F>
__global__ void kernel() {
  using traits = float_traits<F>;
  constexpr int samps = 4<<10;
  __shared__ F accf[samps];
  __shared__ double accd[samps];
  xoshiro256ss rng(threadIdx.x);
  float expo_avg = 1;
  // Pass 0 estimates the exponent ("power"); pass 1 fits the coefficient
  // against it and prints both.
  for(int pass=0; pass < 2; pass++) {
    F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
    int err_max = 0;
    float coef = 0;
    double expo_sum = 0;
    int expo_n = 0;
    // NOTE: the stretch from here through the computation of `d` was garbled
    // in the source; the template arguments, the non-double rank count, and
    // the mask are reconstructions.
    int max_ranks = std::is_same<F, double>::value ? 16<<10 : 1<<20;
    for(int i=threadIdx.x; i < samps; i += 32) {
      accf[i] = traits::make(0.0);
      accd[i] = 0;
    }
    for(int r=0; r < max_ranks; r++) {
      int err = 0;
      for(int i=threadIdx.x; i < samps; i += 32) {
        uint64_t m = (uint64_t(1) << traits::mantissa_bits) - 1;
        double d = std::is_same<F, double>::value ? double(rng() & m) : 1.0;
        F f = traits::make(d);
        accf[i] = traits::add(accf[i], traits::mul(scalar, f));
        accd[i] += traits::todouble(f);
        //if(threadIdx.x==0 && std::is_same<F, float>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
        int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
        err = err > e ? err : e;
      }
      err = __reduce_max_sync(-1u, err);
      err_max = err_max > err ? err_max : err;
      if (r >= 2) {
        // err = 1 + coef*pow(r, expo)
        float c = float(err-1)/powf(float(r), expo_avg);
        coef = coef > c ? coef : c;
      }
      if (r >= 2) {
        double expo = log2f(1+err_max)/log2f(r);
        expo_sum += expo;
        expo_n++;
        //if(threadIdx.x==0 && std::is_same<F, float>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
      }
    }
    if(pass==0)
      expo_avg = expo_sum/expo_n;
    else if(threadIdx.x == 0)
      printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
  }
}

int main() {
  std::printf("type=float:\n");
  kernel<float><<<1,32>>>();
  hipDeviceSynchronize();

  std::printf("\ntype=half:\n");
  kernel<__half><<<1,32>>>();
  hipDeviceSynchronize();

  std::printf("\ntype=bfloat16:\n");
  kernel<bfloat16><<<1,32>>>();
  hipDeviceSynchronize();
  return 0;
}
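/* A minimal host-side sketch of the two-pass fit the kernel performs, with a
 * hypothetical array of per-rank max ulp errors standing in for the sampled
 * data: pass 0 estimates "power" as the average log-log slope of the running
 * max error versus rank count; pass 1 then takes "coef" as the tightest
 * envelope coefficient. Not part of the generator; names are illustrative.
 */
#include <cmath>

inline void fit_error_model(const int* err_by_rank, int nranks,
                            float* coef_out, float* power_out) {
  double expo_sum = 0;
  int expo_n = 0;
  int err_max = 0;
  for (int r = 2; r < nranks; r++) { // pass 0: exponent estimate
    err_max = err_by_rank[r] > err_max ? err_by_rank[r] : err_max;
    expo_sum += std::log2(1.0 + err_max) / std::log2((double)r);
    expo_n++;
  }
  float power = (float)(expo_sum / expo_n);
  float coef = 0;
  for (int r = 2; r < nranks; r++) { // pass 1: envelope coefficient
    float c = (err_by_rank[r] - 1) / std::pow((float)r, power);
    coef = c > coef ? c : coef;
  }
  *coef_out = coef;
  *power_out = power;
}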