kernels.cuh 6.97 KB
Newer Older
1
2
3
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
Tim Dettmers's avatar
Tim Dettmers committed
4
5
6
7
8
9
10
11
// LICENSE file in the root directory of this source tree.

#include <float.h>
#include <ops.cuh>

#ifndef kernels
#define kernels

12
13
14
15
16
17
18
19
20
__global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
__global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);

template <typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
__global__ void kQuantizeBlockwise(
    float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
    const int rand_offset, const int n
);
template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE>
Tim Dettmers's avatar
Tim Dettmers committed
21
__global__ void
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
    kDequantizeBlockwise(float* code, unsigned char* A, float* absmax, T* out, const int blocksize, const int n);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float beta3, const float alpha, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const bool skip_zeros,
    const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float max_unorm, const float param_norm, const float beta1,
    const float beta2, const float eps, const float weight_decay, const int step, const float lr,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit1State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, float* unorm, const float beta1,
    const float beta2, const float eps, const int step, float* __restrict__ const quantiles1, float* max1,
    float* new_max1, const float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit1State(
    T* p, T* const g, unsigned char* state1, const float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* max1, float* new_max1, float weight_decay, const float gnorm_scale,
    const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit2State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, unsigned char* __restrict__ const state2,
    float* unorm, const float beta1, const float beta2, const float eps, const int step,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit2State(
    T* p, T* const g, unsigned char* state1, unsigned char* state2, const float* unorm, const float max_unorm,
    const float param_norm, const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit2StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2, const float beta1, const float beta2,
    const float beta3, const float alpha, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit1StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, const float beta1, const float beta2, const float eps,
    const int step, const float lr, float* __restrict__ const quantiles1, float* absmax1, float weight_decay,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);

template <typename T, int SPMM_ITEMS, int BITS>
__global__ void kspmm_coo_very_sparse_naive(
    int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
    float* __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB
);

template <int ITEMS_PER_THREAD, int THREADS>
__global__ void kdequant_mm_int32_fp16(
    int* __restrict__ const A, float* __restrict__ const rowStats, float* __restrict__ const colStats, half* out,
    half* __restrict__ const bias, const int numRows, const int numCols, const int n
);

template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
__global__ void kTransformRowToFormat(
    char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
);

template <typename T, int BITS, int THREADS>
__global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
template <typename T, int THREADS>
__global__ void kgemm_4bit_inference(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc,
    int blocksize
);
template <typename T, int THREADS, int BITS>
__global__ void kgemm_4bit_inference_naive(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, const float* datatype, T* out,
    int lda, int ldb, int ldc, int blocksize
);

template <typename T, int FUNC> __global__ void kfunc(T* A, T* B, T value, long n);
Tim Dettmers's avatar
Tim Dettmers committed
136

Tim Dettmers's avatar
Tim Dettmers committed
137
#endif