kernels.cuh 8.27 KB
Newer Older
1
2
3
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
Tim Dettmers's avatar
Tim Dettmers committed
4
5
6
7
8
9
10
11
// LICENSE file in the root directory of this source tree.

#include <float.h>
#include <ops.cuh>

#ifndef kernels
#define kernels

Tim Dettmers's avatar
Tim Dettmers committed
12

Tim Dettmers's avatar
Tim Dettmers committed
13
14
15
16
17
template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);

__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
18
19
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
20
21

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
22
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
23
24
25
26
27
                float* state1, float* state2, float *unorm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER>
28
__global__ void kOptimizer32bit2State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
29
30
                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
31
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
32
33

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
34
__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
35
                float* state1, float *unorm,
36
                const float beta1, const float beta2, const float eps, const float weight_decay,
Tim Dettmers's avatar
Tim Dettmers committed
37
38
39
                const int step, const float lr, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER>
40
__global__ void kOptimizer32bit1State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
41
                float* state1,  float *unorm, const float max_unorm, const float param_norm,
42
                const float beta1, const float beta2, const float eps, const float weight_decay,
43
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
44
45
46

template<typename T, int OPTIMIZER>
__global__ void
47
kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
Tim Dettmers's avatar
Tim Dettmers committed
48
                float *unorm,
49
                const float beta1, const float beta2,
50
51
52
                const float eps, const int step,
                float* __restrict__ const quantiles1,
                float* max1, float* new_max1,
Tim Dettmers's avatar
Tim Dettmers committed
53
54
55
56
57
58
                const float weight_decay,
                const float gnorm_scale, const int n);


template<typename T, int OPTIMIZER>
__global__ void
59
kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
Tim Dettmers's avatar
Tim Dettmers committed
60
                const float *unorm, const float max_unorm, const float param_norm,
61
                const float beta1, const float beta2,
62
63
64
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1,
                float* max1, float* new_max1,
Tim Dettmers's avatar
Tim Dettmers committed
65
66
67
68
69
70
71
72
73
                float weight_decay, const float gnorm_scale, const int n);



template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
                float *unorm,
                const float beta1, const float beta2,
74
                const float eps, const int step,
Tim Dettmers's avatar
Tim Dettmers committed
75
76
77
78
79
80
81
82
83
84
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                const float gnorm_scale, const int n);


template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
                const float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2,
85
                const float eps, const int step, const float lr,
Tim Dettmers's avatar
Tim Dettmers committed
86
87
88
89
90
91
92
93
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                float weight_decay, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
		T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
                const float beta1, const float beta2, const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
94
                float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
95
96
97
98
99
100
101
102

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
		T* p, T* __restrict__ const g, unsigned char* state1,
                const float beta1, const float beta2,
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1,
                float* absmax1,
                float weight_decay,
103
                const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
104
105
106
107


template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);

108

Tim Dettmers's avatar
Tim Dettmers committed
109
110
__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
111

112
template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out,  float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
Tim Dettmers's avatar
Tim Dettmers committed
113
114
115

template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
  int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
116
  half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
117
118
119
120
121
122

template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);

123
template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
124

Tim Dettmers's avatar
Tim Dettmers committed
125
template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A,  T* B,  T * out,  int lda, int ldb, int ldc);
Tim Dettmers's avatar
Tim Dettmers committed
126
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, T * out,  int lda, int ldb, int ldc, int blocksize);
127
template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B,  float *absmax, const float *datatype, T * out,  int lda, int ldb, int ldc, int blocksize);
Tim Dettmers's avatar
Tim Dettmers committed
128

Tim Dettmers's avatar
Tim Dettmers committed
129
130
template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n);

Tim Dettmers's avatar
Tim Dettmers committed
131
#endif