kernels.cuh 7.72 KB
Newer Older
1
2
3
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
Tim Dettmers's avatar
Tim Dettmers committed
4
5
6
7
8
9
10
11
// LICENSE file in the root directory of this source tree.

#include <float.h>
#include <ops.cuh>

#ifndef kernels
#define kernels

Tim Dettmers's avatar
Tim Dettmers committed
12
13
template <int QUANT_TYPE, typename INP_TYPE, typename COMP_TYPE, typename OUT_TYPE>__global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *out, int lda, int ldb, int rowsA, int colsA, int colsB);

Tim Dettmers's avatar
Tim Dettmers committed
14
15
16
17
18
template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);

__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
19
20
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int blocksize, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
21
22

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
23
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
24
25
26
27
28
                float* state1, float* state2, float *unorm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER>
29
__global__ void kOptimizer32bit2State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
30
31
                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
32
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
33
34

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
35
__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
36
37
38
39
40
                float* state1, float *unorm,
                const float beta1, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER>
41
__global__ void kOptimizer32bit1State(T* g, T* p,
Tim Dettmers's avatar
Tim Dettmers committed
42
43
                float* state1,  float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float eps, const float weight_decay,
44
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
45
46
47

template<typename T, int OPTIMIZER>
__global__ void
48
kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
Tim Dettmers's avatar
Tim Dettmers committed
49
                float *unorm,
50
51
52
53
                const float beta1,
                const float eps, const int step,
                float* __restrict__ const quantiles1,
                float* max1, float* new_max1,
Tim Dettmers's avatar
Tim Dettmers committed
54
55
56
57
58
59
                const float weight_decay,
                const float gnorm_scale, const int n);


template<typename T, int OPTIMIZER>
__global__ void
60
kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
Tim Dettmers's avatar
Tim Dettmers committed
61
                const float *unorm, const float max_unorm, const float param_norm,
62
63
64
65
                const float beta1,
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1,
                float* max1, float* new_max1,
Tim Dettmers's avatar
Tim Dettmers committed
66
67
68
69
70
71
72
73
74
                float weight_decay, const float gnorm_scale, const int n);



template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
                float *unorm,
                const float beta1, const float beta2,
75
                const float eps, const int step,
Tim Dettmers's avatar
Tim Dettmers committed
76
77
78
79
80
81
82
83
84
85
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                const float gnorm_scale, const int n);


template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
                const float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2,
86
                const float eps, const int step, const float lr,
Tim Dettmers's avatar
Tim Dettmers committed
87
88
89
90
91
92
93
94
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                float weight_decay, const float gnorm_scale, const int n);

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
		T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
                const float beta1, const float beta2, const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
95
                float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
96
97
98
99
100
101
102
103

template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
		T* p, T* __restrict__ const g, unsigned char* state1,
                const float beta1, const float beta2,
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1,
                float* absmax1,
                float weight_decay,
104
                const float gnorm_scale, const bool skip_zeros, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
105
106
107
108
109
110


template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);

__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);

Tim Dettmers's avatar
Tim Dettmers committed
111

112
template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out,  float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
Tim Dettmers's avatar
Tim Dettmers committed
113
114
115

template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
  int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
116
  half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
Tim Dettmers's avatar
Tim Dettmers committed
117
118
119
120
121
122

template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);

123
template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
124

Tim Dettmers's avatar
Tim Dettmers committed
125
#endif