ops.cu 12.3 KB
Newer Older
Tim Dettmers's avatar
Tim Dettmers committed
1
2
3
4
5
6
7
8
9
10
// Copyright (c) Facebook, Inc. and its affiliates. 
//   
// This source code is licensed under the MIT license found in the 
// LICENSE file in the root directory of this source tree.

#include <ops.cuh>
#include <kernels.cuh>
#include <cub/device/device_scan.cuh>
#include <limits>
#include <BinSearch.h>
Max Ryabinin's avatar
Max Ryabinin committed
11
#include <common.h>
Tim Dettmers's avatar
Tim Dettmers committed
12
13
14
15
16
17


using namespace BinSearch;
using std::cout;
using std::endl;

Max Ryabinin's avatar
Max Ryabinin committed
18
19
20
21
22
23
// Launches kHistogramScatterAdd2D over n elements.
//
// histogram, index1, index2, src and maxidx1 are forwarded to the kernel unchanged
// (presumably device pointers -- confirm against callers).
// The grid is ceil(n / 512) blocks of 512 threads.
// For n <= 0 the function returns without launching anything.
void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *src, int maxidx1, int n) {
    // A launch with zero blocks is rejected by CUDA as an invalid configuration,
    // so an empty input must be handled before the launch.
    if (n <= 0) { return; }

    const int threads = 512;
    // Ceiling division written as quotient + remainder test so it cannot overflow for large n.
    const int blocks = n / threads + (n % threads != 0 ? 1 : 0);
    // Use the same `threads` value for the grid computation and the launch.
    kHistogramScatterAdd2D<<<blocks, threads>>>(histogram, index1, index2, src, maxidx1, n);
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Max Ryabinin's avatar
Max Ryabinin committed
26
27
28
29
30
31
32
// Zeroes the 256-float `code` buffer and launches kEstimateQuantiles<T> over the n elements of A.
//
// A, code and offset are forwarded to the kernel together with std::numeric_limits<T>::max().
// The grid is ceil(n / 4096) blocks of 512 threads.
// For n <= 0 `code` is still zeroed, but no kernel is launched.
template<typename T>
void estimateQuantiles(T *A, float *code, float offset, int n) {
    CUDA_CHECK_RETURN(cudaMemset(code, 0, 256 * sizeof(float)));

    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    if (n <= 0) { return; }

    const int blocks = n / 4096 + (n % 4096 != 0 ? 1 : 0);
    // NOTE(review): std::numeric_limits<T>::max() is only meaningful where numeric_limits is
    // specialized for T -- confirm that this holds for the `half` instantiation.
    kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Max Ryabinin's avatar
Max Ryabinin committed
35
36
37
38
39
// Launches kQuantize over n elements, forwarding code, A and out to the kernel.
//
// The grid is ceil(n / 1024) blocks of 1024 threads.
// For n <= 0 the function returns without launching anything.
void quantize(float *code, float *A, unsigned char *out, int n) {
    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    if (n <= 0) { return; }

    const int threads = 1024;
    const int blocks = n / threads + (n % threads != 0 ? 1 : 0);
    kQuantize<<<blocks, threads>>>(code, A, out, n);
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Max Ryabinin's avatar
Max Ryabinin committed
42
43
44
45
46
// Launches kDequantize over n elements, forwarding code, A and out to the kernel.
//
// The grid is ceil(n / 1024) blocks of 1024 threads.
// For n <= 0 the function returns without launching anything.
void dequantize(float *code, unsigned char *A, float *out, int n) {
    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    if (n <= 0) { return; }

    const int threads = 1024;
    const int blocks = n / threads + (n % threads != 0 ? 1 : 0);
    kDequantize<<<blocks, threads>>>(code, A, out, n);
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Max Ryabinin's avatar
Max Ryabinin committed
49
50
51
52
53
54
// Launches kQuantizeBlockwise<T, 4096, 4, STOCHASTIC> over the n elements of A.
//
// code, A, absmax, out, rand and rand_offset are forwarded to the kernel unchanged.
// The grid is ceil(n / 4096) blocks of 1024 threads.
// For n <= 0 the function returns without launching anything.
template<typename T, int STOCHASTIC>
void quantizeBlockwise(float *code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n) {
    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    if (n <= 0) { return; }

    const int blocks = n / 4096 + (n % 4096 != 0 ? 1 : 0);
    kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Max Ryabinin's avatar
Max Ryabinin committed
57
58
59
60
61
62
63
64
65
// Launches kDequantizeBlockwise<T, ...> for the requested block size.
//
// Only blocksize 4096 and 2048 have a kernel configuration here; each launches
// ceil(n / blocksize) blocks of blocksize / 4 threads and forwards code, A, absmax and out.
// Any other blocksize launches nothing and `out` is left untouched.
// For n <= 0 nothing is launched either.
// The pending CUDA error state is checked in every case.
template<typename T>
void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n) {
    const bool supported = (blocksize == 4096) || (blocksize == 2048);

    // Validate blocksize before dividing by it: an unsupported value such as 0 must not reach
    // the division. n > 0 is required because a zero-block launch is an invalid configuration.
    if (supported && n > 0) {
        const int blocks = n / blocksize + (n % blocksize != 0 ? 1 : 0);
        if (blocksize == 4096) {
            kDequantizeBlockwise<T, 4096, 1024, 4><<<blocks, 4096 / 4>>>(code, A, absmax, out, n);
        } else {
            kDequantizeBlockwise<T, 2048, 512, 4><<<blocks, 2048 / 4>>>(code, A, absmax, out, n);
        }
    }
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

Max Ryabinin's avatar
Max Ryabinin committed
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// Runs one 32-bit optimizer step for the optimizer selected by the OPTIMIZER template argument.
//
// ADAM launches the two-state kernels, which receive state1/state2 and beta1/beta2.
// MOMENTUM, RMSPROP and ADAGRAD launch the one-state kernels, which receive only state1 and
// beta1 (state2 and beta2 are not forwarded on that path).
// Any other OPTIMIZER value launches nothing.
//
// When max_unorm > 0, unorm[0] is zeroed and a preconditioning kernel (512 threads per block)
// is launched before the update kernel (1024 threads per block). Otherwise only the update
// kernel runs. Both use a grid of ceil(n / 4096) blocks.
// For n <= 0 the unorm reset still happens, but no kernel is launched.
template<typename T, int OPTIMIZER>
void optimizer32bit(T *g, T *p,
                    float *state1, float *state2, float *unorm, float max_unorm, float param_norm,
                    const float beta1, const float beta2, const float eps, const float weight_decay,
                    const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) {
    const bool use_unorm = max_unorm > 0.0f;
    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    const bool has_work = n > 0;
    const int blocks = has_work ? n / 4096 + (n % 4096 != 0 ? 1 : 0) : 0;

    switch (OPTIMIZER) {
        case ADAM:
            if (use_unorm) {
                CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1 * sizeof(float)));
            }
            if (!has_work) { break; }

            if (use_unorm) {
                kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(
                        g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
                CUDA_CHECK_RETURN(cudaPeekAtLastError());
            }
            kOptimizer32bit2State<T, OPTIMIZER><<<blocks, 1024>>>(
                    g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay,
                    step, lr, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            break;
        case MOMENTUM:
        case RMSPROP:
        case ADAGRAD:
            if (use_unorm) {
                CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1 * sizeof(float)));
            }
            if (!has_work) { break; }

            if (use_unorm) {
                kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(
                        g, p, state1, unorm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
                CUDA_CHECK_RETURN(cudaPeekAtLastError());
            }
            kOptimizer32bit1State<T, OPTIMIZER><<<blocks, 1024>>>(
                    g, p, state1, unorm, max_unorm, param_norm, beta1, eps, weight_decay,
                    step, lr, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            break;
        default:
            // No kernels are provided here for other optimizer ids.
            break;
    }
}

Max Ryabinin's avatar
Max Ryabinin committed
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// Runs one optimizer step with 8-bit (unsigned char) optimizer state for the optimizer
// selected by the OPTIMIZER template argument.
//
// unorm[0] is zeroed first when max_unorm > 0. Then, per optimizer:
//   ADAM                      -- zeroes new_max1[0] and new_max2[0], launches the two-state
//                                preconditioning kernel (256 threads per block) and then the
//                                two-state update kernel (1024 threads per block).
//   MOMENTUM/RMSPROP/ADAGRAD  -- zeroes new_max1[0], launches the one-state preconditioning
//                                kernel and then the one-state update kernel. state2, beta2,
//                                quantiles2, max2 and new_max2 are not forwarded on this path.
//   anything else             -- nothing is launched.
// All launches use a grid of ceil(n / 4096) blocks.
// For n <= 0 the resets above still happen, but no kernel is launched.
template<typename T, int OPTIMIZER>
void optimizerStatic8bit(T *p, T *g,
                         unsigned char *state1, unsigned char *state2,
                         float *unorm, float max_unorm, float param_norm,
                         float beta1, float beta2,
                         float eps, int step, float lr,
                         float *quantiles1, float *quantiles2,
                         float *max1, float *max2, float *new_max1, float *new_max2,
                         float weight_decay,
                         const float gnorm_scale, int n) {
    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    const bool has_work = n > 0;
    const int blocks = has_work ? n / 4096 + (n % 4096 != 0 ? 1 : 0) : 0;

    if (max_unorm > 0.0f) {
        CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1 * sizeof(float)));
    }

    switch (OPTIMIZER) {
        case ADAM:
            CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1 * sizeof(float)));
            CUDA_CHECK_RETURN(cudaMemset(new_max2, 0, 1 * sizeof(float)));
            if (!has_work) { break; }

            kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 256>>>(
                    p, g, state1, state2, unorm, beta1, beta2, eps, step,
                    quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            kOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 1024>>>(
                    p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
                    quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            break;
        case MOMENTUM:
        case RMSPROP:
        case ADAGRAD:
            CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1 * sizeof(float)));
            if (!has_work) { break; }

            kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 256>>>(
                    p, g, state1, unorm, beta1, eps, step, quantiles1, max1, new_max1,
                    weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            kOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 1024>>>(
                    p, g, state1, unorm, max_unorm, param_norm, beta1, eps, step, lr,
                    quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            break;
        default:
            break;
    }
}

// Launch parameters for optimizerStatic8bitBlockwise.
// BLOCKSIZE_*: number of elements assigned to one CUDA block (the grid is ceil(n / BLOCKSIZE_*)).
// NUM_*: divisor applied to BLOCKSIZE_* to obtain the threads per block (BLOCKSIZE_* / NUM_*);
//        also passed to the kernel as a template argument.
// The 2STATE pair is used for ADAM, the 1STATE pair for MOMENTUM, RMSPROP and ADAGRAD.
#define BLOCKSIZE_2STATE 2048
#define NUM_2STATE 8
#define BLOCKSIZE_1STATE 2048
#define NUM_1STATE 8

Max Ryabinin's avatar
Max Ryabinin committed
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
// Runs one block-wise optimizer step with 8-bit (unsigned char) optimizer state for the
// optimizer selected by the OPTIMIZER template argument.
//
//   ADAM                      -- launches kOptimizerStatic8bit2StateBlockwise with
//                                ceil(n / BLOCKSIZE_2STATE) blocks of
//                                BLOCKSIZE_2STATE / NUM_2STATE threads.
//   MOMENTUM/RMSPROP/ADAGRAD  -- launches kOptimizerStatic8bit1StateBlockwise with
//                                ceil(n / BLOCKSIZE_1STATE) blocks of
//                                BLOCKSIZE_1STATE / NUM_1STATE threads. state2, quantiles2
//                                and absmax2 are not forwarded on this path.
//   anything else             -- nothing is launched.
// For n <= 0 the function returns without launching anything.
template<typename T, int OPTIMIZER>
void optimizerStatic8bitBlockwise(T *p, T *g,
                                  unsigned char *state1, unsigned char *state2, float beta1, float beta2, float eps, int step, float lr,
                                  float *quantiles1, float *quantiles2, float *absmax1, float *absmax2, float weight_decay,
                                  const float gnorm_scale, bool skip_zeros, int n) {
    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    if (n <= 0) { return; }

    switch (OPTIMIZER) {
        case ADAM: {
            const int blocks = n / BLOCKSIZE_2STATE + (n % BLOCKSIZE_2STATE != 0 ? 1 : 0);
            kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>
                    <<<blocks, BLOCKSIZE_2STATE / NUM_2STATE>>>(
                    p, g, state1, state2, beta1, beta2, eps, step, lr,
                    quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            break;
        }
        case MOMENTUM:
        case RMSPROP:
        case ADAGRAD: {
            const int blocks = n / BLOCKSIZE_1STATE + (n % BLOCKSIZE_1STATE != 0 ? 1 : 0);
            kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>
                    <<<blocks, BLOCKSIZE_1STATE / NUM_1STATE>>>(
                    p, g, state1, beta1, beta2, eps, step, lr,
                    quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(cudaPeekAtLastError());
            break;
        }
        default:
            break;
    }
}


Max Ryabinin's avatar
Max Ryabinin committed
183
184
185
186
187
188
189
// Zeroes gnorm_vec[step % 100] and launches kPercentileClipping<T, 2048, 4> over the
// n elements of g.
//
// The grid is ceil(n / 2048) blocks of 512 threads.
// For n <= 0 the slot is still zeroed, but no kernel is launched.
// NOTE(review): the slot index is step % 100, so gnorm_vec presumably holds at least 100
// floats and step is presumably non-negative (a negative step would give a negative index)
// -- confirm against callers.
template<typename T>
void percentileClipping(T *g, float *gnorm_vec, int step, const int n) {
    CUDA_CHECK_RETURN(cudaMemset(&gnorm_vec[step % 100], 0, 1 * sizeof(float)));

    // A launch with zero blocks is rejected by CUDA as an invalid configuration.
    if (n <= 0) { return; }

    const int blocks = n / 2048 + (n % 2048 != 0 ? 1 : 0);
    kPercentileClipping<T, 2048, 4><<<blocks, 512>>>(g, gnorm_vec, step, n);
    CUDA_CHECK_RETURN(cudaPeekAtLastError());
}


//==============================================================
//                   TEMPLATE DEFINITIONS
//==============================================================

// Explicit instantiations of estimateQuantiles for half and float inputs.
template void estimateQuantiles<half>(half *A, float *code, float offset, int n);
template void estimateQuantiles<float>(float *A, float *code, float offset, int n);

Max Ryabinin's avatar
Max Ryabinin committed
201
202
203
204
205
206
207
208
209
210
211
212
// Explicit instantiations of quantizeBlockwise: {half, float} x STOCHASTIC {0, 1}.
template void quantizeBlockwise<half, 0>(
        float *code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
template void quantizeBlockwise<float, 0>(
        float *code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
template void quantizeBlockwise<half, 1>(
        float *code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
template void quantizeBlockwise<float, 1>(
        float *code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
213
// Explicit instantiations of dequantizeBlockwise for half and float outputs.
template void dequantizeBlockwise<half>(
        float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
template void dequantizeBlockwise<float>(
        float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);

// Emits the explicit instantiation of optimizer32bit for optimizer id `name` and
// gradient/parameter element type `gtype`. The expansion carries its own trailing
// semicolon, so invocations are written without one.
#define MAKE_optimizer32bit(name, gtype)                                                          \
    template void optimizer32bit<gtype, name>(                                                    \
        gtype* g, gtype* p,                                                                       \
        float* state1, float* state2, float* unorm, float max_unorm, float param_norm,            \
        const float beta1, const float beta2, const float eps, const float weight_decay,          \
        const int step, const float lr, const float gnorm_scale, const bool skip_zeros,           \
        const int n);
Tim Dettmers's avatar
Tim Dettmers committed
222
223

// 32-bit optimizer instantiations: every optimizer id for both half and float.
MAKE_optimizer32bit(ADAM, half)
MAKE_optimizer32bit(ADAM, float)
MAKE_optimizer32bit(MOMENTUM, half)
MAKE_optimizer32bit(MOMENTUM, float)
MAKE_optimizer32bit(RMSPROP, half)
MAKE_optimizer32bit(RMSPROP, float)
MAKE_optimizer32bit(ADAGRAD, half)
MAKE_optimizer32bit(ADAGRAD, float)
Tim Dettmers's avatar
Tim Dettmers committed
238
239
240
241
242
243
244
245
246
247
248

// Emits the explicit instantiation of optimizerStatic8bit for optimizer id `name` and
// gradient/parameter element type `gtype`. The expansion carries its own trailing
// semicolon, so invocations are written without one.
#define MAKE_optimizerStatic8bit(name, gtype)                                                     \
    template void optimizerStatic8bit<gtype, name>(                                               \
        gtype* p, gtype* g, unsigned char* state1, unsigned char* state2,                         \
        float *unorm, float max_unorm, float param_norm,                                          \
        float beta1, float beta2,                                                                 \
        float eps, int step, float lr,                                                            \
        float* quantiles1, float* quantiles2,                                                     \
        float* max1, float* max2, float* new_max1, float* new_max2,                               \
        float weight_decay,                                                                       \
        const float gnorm_scale, int n);

Max Ryabinin's avatar
Max Ryabinin committed
249

Tim Dettmers's avatar
Tim Dettmers committed
250
// Static 8-bit optimizer instantiations: ADAM, MOMENTUM and RMSPROP for both half and float.
MAKE_optimizerStatic8bit(ADAM, half)
MAKE_optimizerStatic8bit(ADAM, float)
MAKE_optimizerStatic8bit(MOMENTUM, half)
MAKE_optimizerStatic8bit(MOMENTUM, float)
MAKE_optimizerStatic8bit(RMSPROP, half)
MAKE_optimizerStatic8bit(RMSPROP, float)

// Emits the explicit instantiation of optimizerStatic8bitBlockwise for element type `gtype`
// and optimizer id `optim_name`. Note the argument order: type first, optimizer second.
#define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name)                                      \
    template void optimizerStatic8bitBlockwise<gtype, optim_name>(                                \
        gtype* p, gtype* g,                                                                       \
        unsigned char* state1, unsigned char* state2,                                             \
        float beta1, float beta2, float eps, int step, float lr,                                  \
        float* quantiles1, float* quantiles2, float* absmax1, float* absmax2,                     \
        float weight_decay, const float gnorm_scale, bool skip_zeros, int n);
Tim Dettmers's avatar
Tim Dettmers committed
266

Max Ryabinin's avatar
Max Ryabinin committed
267

Tim Dettmers's avatar
Tim Dettmers committed
268
// Block-wise 8-bit optimizer instantiations: every optimizer id for both half and float.
MAKE_optimizerStatic8bitBlockwise(half, ADAM);
MAKE_optimizerStatic8bitBlockwise(float, ADAM);
MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
Tim Dettmers's avatar
Tim Dettmers committed
283

Max Ryabinin's avatar
Max Ryabinin committed
284
285
286
// Explicit instantiations of percentileClipping for float and half gradients.
template void percentileClipping<float>(float *g, float *gnorm_vec, int step, const int n);
template void percentileClipping<half>(half *g, float *gnorm_vec, int step, const int n);