/*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

/* Generate parameters for our error bound model of floating point average
 * (sum of scaled values) by sampling sums of random sequences for each
 * floating point type.
 *
 * The model has parameters "coef" and "power", where for two floats a & b,
 * they are close enough if and only if:
 *   abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
 *
 * Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
 *
 * Compile with (this file uses the HIP headers/runtime):
 *   hipcc inexact_regress.cu
 * (original CUDA build: nvcc -gencode=arch=compute_80,code=sm_80)
 */

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <type_traits>

#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>

using std::uint64_t;
using std::uint32_t;
using bfloat16 = hip_bfloat16;

// Per-type shim so the regression kernel can treat float, double, __half
// and bfloat16 uniformly: field widths plus make/convert/add/mul helpers.
template<typename T>
struct float_traits;

// float: 8 exponent / 23 mantissa bits; bit pattern fits in uint32_t.
template<>
struct float_traits<float> {
  static constexpr int mantissa_bits = 23;
  static constexpr int exponent_bits = 8;
  using uint_t = uint32_t;
  __device__ static float make(double x) { return static_cast<float>(x); }
  __device__ static float make(uint64_t x) { return static_cast<float>(x); }
  __device__ static double todouble(float x) { return static_cast<double>(x); }
  __device__ static float add(float a, float b) { return a + b; }
  __device__ static float mul(float a, float b) { return a * b; }
};
// double: 11 exponent / 52 mantissa bits; double is also the reference
// type, so make/todouble are identity conversions.
template<>
struct float_traits<double> {
  static constexpr int mantissa_bits = 52;
  static constexpr int exponent_bits = 11;
  using uint_t = uint64_t;
  __device__ static double make(double x) { return x; }
  __device__ static double make(uint64_t x) { return static_cast<double>(x); }
  __device__ static double todouble(double x) { return x; }
  __device__ static double add(double a, double b) { return a + b; }
  __device__ static double mul(double a, double b) { return a * b; }
};
// __half: 5 exponent / 10 mantissa bits; bit pattern fits in uint16_t.
template<>
struct float_traits<__half> {
  static constexpr int mantissa_bits = 10;
  static constexpr int exponent_bits = 5;
  using uint_t = uint16_t;
  __device__ static __half make(double x) { return __float2half((float)x); }
  // Fix: __int2half_rn(x) implicitly truncated the uint64_t argument to
  // int before converting. __ull2half_rn converts the full unsigned 64-bit
  // value (round-to-nearest) with no narrowing.
  __device__ static __half make(uint64_t x) { return __ull2half_rn(x); }
  __device__ static double todouble(__half x) { return __half2float(x); }
  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
// bfloat16: 8 exponent bits (same dynamic range as float) / 7 mantissa
// bits; bit pattern fits in uint16_t. Arithmetic is performed in float and
// rounded back through the bfloat16 constructor.
template<>
struct float_traits<bfloat16> {
  static constexpr int mantissa_bits = 7;
  static constexpr int exponent_bits = 8;
  using uint_t = uint16_t;
  __device__ static bfloat16 make(double x) { return bfloat16(x); }
  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
  __device__ static double todouble(bfloat16 x) { return double(x); }
  // Fix: the old code called __hadd/__hmul with float arguments. Those are
  // __half (or integer halving-add) intrinsics, not float ones, so the call
  // either fails to resolve or binds an unintended overload. Plain float
  // arithmetic followed by bfloat16 rounding is what was meant.
  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16((float)a + (float)b); }
  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16((float)a * (float)b); }
};

// Distance between a and b measured in integer-bit units: reinterpret each
// value's bit pattern as an unsigned integer of the same width and return
// the absolute difference of the two integers (this is the intBits()
// distance from the model in the file header).
// NOTE(review): this is a meaningful ordered distance only when a and b
// have the same sign -- the integer encoding is not monotonic across zero.
// The kernel only compares non-negative accumulator sums, so that holds
// here. The unsigned difference is also narrowed to int on return; fine
// for the small errors measured in this test, but would wrap for distances
// above INT_MAX.
template<typename F>
__device__ int compare(F a, F b) {
  // Anonymous unions pun the float bits into uint_t without pointer casts;
  // the uint member is zeroed first so padding (none expected) is defined.
  union { typename float_traits<F>::uint_t ua; F fa; };
  union { typename float_traits<F>::uint_t ub; F fb; };
  ua=0; ub=0;
  fa=a; fb=b;
  //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
  return ua < ub ? ub-ua : ua-ub;
}

// xoshiro256** pseudo-random generator (Blackman & Vigna reference
// algorithm). Each thread seeds its own instance, giving per-thread
// streams.
struct xoshiro256ss {
  uint64_t s[4];

  __device__ xoshiro256ss(int seed) {
    // Arbitrary non-zero base state, perturbed per seed so distinct seeds
    // produce distinct (and non-zero) initial states.
    constexpr uint64_t base[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
    for(int k=0; k < 4; k++)
      s[k] = base[k] + (seed + k)*0xb45de8a52fdb65d3;
  }

  // Advance the state and return the next 64-bit output.
  __device__ uint64_t operator()() {
    auto rotl = [](uint64_t v, int k) -> uint64_t {
      return (v << k) | (v >> (64 - k));
    };
    uint64_t const out = rotl(s[1] * 5, 7) * 9;
    uint64_t const shifted = s[1] << 17;
    // State transition -- the order of these xors is part of the algorithm.
    s[2] ^= s[0];
    s[3] ^= s[1];
    s[1] ^= s[2];
    s[0] ^= s[3];
    s[2] ^= shifted;
    s[3] = rotl(s[3], 45);
    return out;
  }
};

// Warp-wide max reduction, standing in for the CUDA SM80 builtin of the
// same name on HIP. Returns the max of `value` across the warp in lane 0
// (every caller below only consumes it after this warp-uniform call).
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
  // We ignore mask, since all bits are set when calling this in the
  // test code below.
  (void)mask;
  // Fix: the loop previously started at i = warpSize; __shfl_down with a
  // delta >= width returns the caller's own value, so that first step was
  // a no-op. Start the shuffle tree at warpSize/2 instead.
  // NOTE(review): kernels here launch 32 threads; on wave64 hardware
  // (warpSize == 64) lanes 32..63 are inactive and the first shuffle reads
  // undefined data -- pre-existing behavior, confirm launch config.
  for (int offset = warpSize >> 1; offset > 0; offset >>= 1) {
    value = max(value, __shfl_down(value, offset, warpSize));
  }
  return value;
}

/* Empirically fit the error model  err <= 1 + coef * pow(rank_n, expo)
 * for type F. Launched with a single 32-thread block (see main); each
 * thread fits the model for a different scale factor, and thread 0 prints
 * the resulting parameters.
 *
 * Pass 0 runs the experiment once to estimate the exponent (expo_avg);
 * pass 1 repeats it to fit coef against that exponent, then prints both.
 */
template<typename F>
__global__ void kernel() {
  using traits = float_traits<F>;
  constexpr int samps = 4<<10;
  // Parallel accumulators: accf sums in type F (the value under test),
  // accd sums in double (treated as the exact reference).
  __shared__ F accf[samps];
  __shared__ double accd[samps];

  xoshiro256ss rng(threadIdx.x);
  float expo_avg = 1;
  for(int pass=0; pass < 2; pass++) {
    // Per-thread scale factor, standing in for the 1/rank_n scaling of an
    // average (arbitrary irrational-ish base to avoid exact powers of two).
    F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
    int err_max = 0;
    float coef = 0;
    double expo_sum = 0;
    int expo_n = 0;
    // Simulated rank count: 16K additions for float, capped at
    // 2^mantissa_bits for the narrow types (each step adds 1.0, which the
    // accumulator can only represent exactly up to that count).
    int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
    for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
    //for(int round=0; round < 2; round++) {
      // Reset both accumulators for this round.
      for(int i=threadIdx.x; i < samps; i += blockDim.x) {
        accf[i] = (F)0;
        accd[i] = 0;
      }
      __syncthreads();
      for(int r=0; r < max_ranks; r++) {
        int err = 0;
        for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
          // Random mantissa-sized integer for float; the narrow types add
          // a constant 1.0 per step (see max_ranks cap above).
          constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
          double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
          F f = traits::make(d);
          accf[i] = traits::add(accf[i], traits::mul(scalar, f));
          accd[i] += traits::todouble(f);
          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
          // Bit distance between the F-precision sum and the rounded
          // double-precision reference after r+1 additions.
          int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
          err = err > e ? err : e;
        }
        // Max observed error across the warp (one warp per block).
        err = __reduce_max_sync(-1u, err);
        err_max = err_max > err ? err_max : err;
        if (r >= 2) {
          // err = 1 + coef*pow(r,expo)
          float c = float(err-1)/powf(float(r), expo_avg);
          coef = coef > c ? coef : c;
        }
        if (r >= 2) {
          // Instantaneous exponent estimate: solve 1+err_max = r^expo;
          // averaged over all r in pass 0 to produce expo_avg.
          double expo = log2f(1+err_max)/log2f(r);
          expo_sum += expo;
          expo_n++;
          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
        }
      }
    }
    if(pass==0)
      expo_avg = expo_sum/expo_n;
    else if(threadIdx.x == 0)
      printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
  }
}

// Run the fit for one floating point type: print the banner, launch the
// kernel on a single 32-thread block, and wait so its printf output lands
// before the next banner.
template<typename F>
static void runOne(const char* banner) {
  std::printf("%s", banner);
  kernel<F><<<1,32>>>();
  hipDeviceSynchronize();
}

int main() {
  runOne<float>("type=float:\n");
  runOne<half>("\ntype=half:\n");
  runOne<bfloat16>("\ntype=bfloat16:\n");
  return 0;
}