"tests/nn/data_parallel/test_sharded_ddp_features.py" did not exist on "a265586bfe891d3eeb87588d3088672e8815179d"
kernels_hip.cuh 7.06 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#include <float.h>
#include <ops_hip.cuh>

#ifndef kernels
#define kernels

__global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
__global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);

template <typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
__global__ void kQuantizeBlockwise(
    float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
    const int rand_offset, const int n
);
template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE>
__global__ void
    kDequantizeBlockwise(float* code, unsigned char* A, float* absmax, T* out, const int blocksize, const int n);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float beta3, const float alpha, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const bool skip_zeros,
    const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float max_unorm, const float param_norm, const float beta1,
    const float beta2, const float eps, const float weight_decay, const int step, const float lr,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit1State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, float* unorm, const float beta1,
    const float beta2, const float eps, const int step, float* __restrict__ const quantiles1, float* max1,
    float* new_max1, const float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit1State(
    T* p, T* const g, unsigned char* state1, const float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* max1, float* new_max1, float weight_decay, const float gnorm_scale,
    const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit2State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, unsigned char* __restrict__ const state2,
    float* unorm, const float beta1, const float beta2, const float eps, const int step,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit2State(
    T* p, T* const g, unsigned char* state1, unsigned char* state2, const float* unorm, const float max_unorm,
    const float param_norm, const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit2StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2, const float beta1, const float beta2,
    const float beta3, const float alpha, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit1StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, const float beta1, const float beta2, const float eps,
    const int step, const float lr, float* __restrict__ const quantiles1, float* absmax1, float weight_decay,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);

template <typename T, int SPMM_ITEMS, int BITS>
__global__ void kspmm_coo_very_sparse_naive(
    int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
    float* __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB
);

template <int ITEMS_PER_THREAD, int THREADS>
__global__ void kdequant_mm_int32_fp16(
    int* __restrict__ const A, float* __restrict__ const rowStats, float* __restrict__ const colStats, half* out,
    half* __restrict__ const bias, const int numRows, const int numCols, const int n
);

template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
__global__ void kTransformRowToFormat(
    char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
);

template <typename T, int BITS, int THREADS>
__global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
template <typename T, int THREADS>
__global__ void kgemm_4bit_inference(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc,
    int blocksize
);
template <typename T, int THREADS, int BITS>
__global__ void kgemm_4bit_inference_naive(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, const float* datatype, T* out,
    int lda, int ldb, int ldc, int blocksize
);

template <typename T, int FUNC> __global__ void kfunc(T* A, T* B, T value, long n);

#endif