kernels.cuh 8.36 KB
Newer Older
1
2
3
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
Tim Dettmers's avatar
Tim Dettmers committed
4
5
6
7
8
9
10
11
// LICENSE file in the root directory of this source tree.

#include <float.h>
#include <ops.cuh>

#ifndef kernels
#define kernels

Tim Dettmers's avatar
Tim Dettmers committed
12

Tim Dettmers's avatar
Tim Dettmers committed
13
14
15
16
17
template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);

__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
18
19
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
20
21

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
22
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
23
24
25
26
27
                float* state1, float* state2, float *unorm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER>
28
__global__ void kOptimizer32bit2State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
29
                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
30
31
                const float beta1, const float beta2, const float beta3, const float alpha,
                const float eps, const float weight_decay,
32
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
33
34

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
35
__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
36
                float* state1, float *unorm,
37
                const float beta1, const float beta2, const float eps, const float weight_decay,
Tim Dettmers's avatar
Tim Dettmers committed
38
39
40
                const int step, const float lr, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER>
41
__global__ void kOptimizer32bit1State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
42
                float* state1,  float *unorm, const float max_unorm, const float param_norm,
43
                const float beta1, const float beta2, const float eps, const float weight_decay,
44
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
45
46
47

template<typename T, int OPTIMIZER>
__global__ void
48
kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
Tim Dettmers's avatar
Tim Dettmers committed
49
                float *unorm,
50
                const float beta1, const float beta2,
51
52
53
                const float eps, const int step,
                float* __restrict__ const quantiles1,
                float* max1, float* new_max1,
Tim Dettmers's avatar
Tim Dettmers committed
54
55
56
57
58
59
                const float weight_decay,
                const float gnorm_scale, const int n);


template<typename T, int OPTIMIZER>
__global__ void
60
kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
Tim Dettmers's avatar
Tim Dettmers committed
61
                const float *unorm, const float max_unorm, const float param_norm,
62
                const float beta1, const float beta2,
63
64
65
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1,
                float* max1, float* new_max1,
Tim Dettmers's avatar
Tim Dettmers committed
66
67
68
69
70
71
72
73
74
                float weight_decay, const float gnorm_scale, const int n);



template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
                float *unorm,
                const float beta1, const float beta2,
75
                const float eps, const int step,
Tim Dettmers's avatar
Tim Dettmers committed
76
77
78
79
80
81
82
83
84
85
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                const float gnorm_scale, const int n);


template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
                const float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2,
86
                const float eps, const int step, const float lr,
Tim Dettmers's avatar
Tim Dettmers committed
87
88
89
90
91
92
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                float weight_decay, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
		T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
93
                const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const int step, const float lr,
Tim Dettmers's avatar
Tim Dettmers committed
94
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
95
                float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
96
97
98
99
100
101
102
103

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
		T* p, T* __restrict__ const g, unsigned char* state1,
                const float beta1, const float beta2,
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1,
                float* absmax1,
                float weight_decay,
104
                const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
105
106
107
108


template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);

109

Tim Dettmers's avatar
Tim Dettmers committed
110
111
__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
112

113
template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out,  float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
Tim Dettmers's avatar
Tim Dettmers committed
114
115
116

template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
  int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
117
  half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
118
119
120
121
122
123

template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);

124
template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
125

Tim Dettmers's avatar
Tim Dettmers committed
126
template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc);
Tim Dettmers's avatar
Tim Dettmers committed
127
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
128
template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize);
Tim Dettmers's avatar
Tim Dettmers committed
129

Tim Dettmers's avatar
Tim Dettmers committed
130
131
template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n);

Tim Dettmers's avatar
Tim Dettmers committed
132
#endif