Unverified commit 4955d136, authored by Matthew Douglas, committed by GitHub

Apply clang-format rules (#1678)

parent 61db0859
@@ -26,10 +26,12 @@ void quantize_block(const quantize_block_args& args) {
        if (idx < 255) {
            float dist_left = fabs(normed_value - (args.code[idx]));
            float dist_right = fabs(normed_value - (args.code[idx + 1]));
            if (dist_right < dist_left) {
                idx += 1;
            }
        }

        // 5. store index
        args.out[i] = (unsigned char)idx;
    }
}
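The branch reformatted above is the nearest-neighbour rounding step of the 8-bit quantizer: a binary search over the sorted 256-entry code table yields a candidate index, and the two distance checks pick whichever of the two surrounding code values is closer. Below is a self-contained sketch of just that step; the helper name nearest_code_index is hypothetical and not part of this commit.

#include <algorithm>
#include <cmath>

// Hypothetical helper, for illustration only: given a sorted 256-entry code
// table and a value normalized into the code's range, return the index of the
// closest entry. Mirrors the dist_left/dist_right comparison in the hunk above.
static unsigned char nearest_code_index(const float* code, float normed_value) {
    const float* end = code + 256;
    // lower_bound finds the first entry >= normed_value; stepping back one
    // gives the left neighbour, as the binary search in the library does.
    long idx = std::lower_bound(code, end, normed_value) - code;
    if (idx > 0)
        idx -= 1;
    if (idx < 255) {
        float dist_left = std::fabs(normed_value - code[idx]);
        float dist_right = std::fabs(normed_value - code[idx + 1]);
        if (dist_right < dist_left)
            idx += 1;
    }
    return (unsigned char)idx;
}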
@@ -28,7 +28,8 @@
// The maximum number of resident threads per SM varies by arch.
// For A100/H100 and all prior to Turing, it is 2048, which allows
// for 2 full blocks of 1024 threads per SM.
// Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
#if __CUDA_ARCH__ == 750
#define BNB_MAX_THREADS_PER_SM 1024
#elif __CUDA_ARCH__ >= 860 && __CUDA_ARCH__ <= 890
...
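For context on why the resident-thread cap is a macro: a per-architecture limit like BNB_MAX_THREADS_PER_SM is commonly consumed as the minimum-resident-blocks argument of __launch_bounds__, so the compiler limits register usage enough to keep that many threads on an SM. The sketch below is illustrative only; the kernel and EXAMPLE_BLOCK_DIM are placeholders, not code from this commit.

// Illustrative only, not part of this diff. The fallback covers compilation
// passes where __CUDA_ARCH__ (and hence the macro above) is not defined.
#ifndef BNB_MAX_THREADS_PER_SM
#define BNB_MAX_THREADS_PER_SM 2048
#endif

#define EXAMPLE_BLOCK_DIM 1024

// Ask the compiler to keep enough blocks resident to reach the per-arch cap.
__global__ void __launch_bounds__(EXAMPLE_BLOCK_DIM, BNB_MAX_THREADS_PER_SM / EXAMPLE_BLOCK_DIM)
    example_scale_kernel(float* data, float scale, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= scale;
}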
@@ -5,21 +5,18 @@
using namespace BinSearch;

struct quantize_block_args {
    BinAlgo<Scalar, float, Direct2>* bin_searcher;
    float* code;
    float* A;
    float* absmax;
    unsigned char* out;
    long long block_end;
    long long block_idx;
    long long threadidx;
    long long blocksize;
};

void quantize_block(const quantize_block_args& args);

#endif
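quantize_block_args is the per-block work descriptor consumed by quantize_block. A hedged sketch of populating it for one block follows; the helper and the threadidx convention are assumptions for illustration, not taken from this commit (the real setup is in quantize_cpu, shown further down).

// Illustrative only: builds the argument struct for one block, assuming the
// caller already owns a BinAlgo searcher over `code` and tracks per-block
// absmax values indexed by block number.
static quantize_block_args make_block_args(
    BinAlgo<Scalar, float, Direct2>* bin_searcher, float* code, float* A, float* absmax, unsigned char* out,
    long long block_idx, long long block_end, long long blocksize
) {
    quantize_block_args args;
    args.bin_searcher = bin_searcher;
    args.code = code;
    args.A = A;
    args.absmax = absmax;
    args.out = out;
    args.block_idx = block_idx;
    args.block_end = block_end;
    args.threadidx = block_idx / blocksize; // assumed convention: one slot per block
    args.blocksize = blocksize;
    return args;
}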
@@ -4,7 +4,7 @@
using namespace BinSearch;

void dequantize_cpu(float* code, unsigned char* A, float* absmax, float* out, long long blocksize, long long n) {
    for (long long block_idx = 0; block_idx < n; block_idx += blocksize) {
        long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
        long long block_end = block_idx + valid_items;
@@ -13,8 +13,7 @@ void dequantize_cpu(...)
    }
}

void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n) {
    // the default code has range [-0.993, 1.0], which can cause an error in the binary search algorithm used below
    code[0] = -1.0f;
@@ -28,15 +27,13 @@ void quantize_cpu(...)
    int thread_wave_size = 256;
    // we chunk the threads into waves of 256 since the max limit is
    // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size)
    for (long long offset = 0; offset < num_blocks; offset += thread_wave_size) {
        long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
        std::vector<std::thread> threads(valid_chunks);
        std::vector<quantize_block_args> args(valid_chunks);

        int chunks_processed = 0;
        for (long long block_idx = offset * blocksize; block_idx < n; block_idx += blocksize) {
            long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
            long long block_end = block_idx + valid_items;
@@ -53,11 +50,12 @@ void quantize_cpu(...)
            threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
            chunks_processed += 1;
            if (chunks_processed == valid_chunks) {
                break;
            }
        }

        for (int i = 0; i < valid_chunks; i++)
            threads[i].join();
    }
}
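The wave pattern described in the comment above, launching at most thread_wave_size std::threads at a time and joining each wave before starting the next so the OS thread limit is never exceeded, generalizes beyond quantization. A minimal self-contained sketch of the same idea (names are illustrative, not from this commit):

#include <algorithm>
#include <thread>
#include <vector>

// Illustrative only: run `total_jobs` pieces of work with at most `wave_size`
// native threads alive at a time, joining each wave before starting the next.
template <typename Work>
void run_in_waves(long long total_jobs, long long wave_size, Work work) {
    for (long long offset = 0; offset < total_jobs; offset += wave_size) {
        long long batch = std::min(wave_size, total_jobs - offset);
        std::vector<std::thread> threads;
        threads.reserve(batch);
        for (long long j = 0; j < batch; j++)
            threads.emplace_back([&work, offset, j] { work(offset + j); });
        for (auto& t : threads)
            t.join();
    }
}

quantize_cpu above applies the same idea with a wave size of 256 and one job per quantization block.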
@@ -4,7 +4,7 @@
#include <iostream>
#include <stdio.h>

void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n);
void dequantize_cpu(float* code, unsigned char* A, float* absmax, float* out, long long blocksize, long long n);

#endif
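Given the two declarations above, a round-trip call might look like the hedged sketch below; the block size, buffer sizes, and the assumption that `code` points at a 256-entry quantization map are illustrative, not taken from this commit.

#include <vector>

// Illustrative only: 8-bit blockwise quantize followed by dequantize, with one
// absmax scale per block of `blocksize` input values.
void roundtrip_example(float* code /* 256-entry map */, const std::vector<float>& input) {
    const long long n = (long long)input.size();
    const long long blocksize = 4096;
    const long long num_blocks = (n + blocksize - 1) / blocksize;

    std::vector<float> A(input);
    std::vector<unsigned char> q(n);
    std::vector<float> absmax(num_blocks);
    std::vector<float> out(n);

    quantize_cpu(code, A.data(), absmax.data(), q.data(), blocksize, n);
    dequantize_cpu(code, q.data(), absmax.data(), out.data(), blocksize, n);
}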
@@ -9,116 +9,129 @@
#ifndef kernels
#define kernels

__global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
__global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);

template <typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
__global__ void kQuantizeBlockwise(
    float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
    const int rand_offset, const int n
);
template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE>
__global__ void
kDequantizeBlockwise(float* code, unsigned char* A, float* absmax, T* out, const int blocksize, const int n);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float beta3, const float alpha, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const bool skip_zeros,
    const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float max_unorm, const float param_norm, const float beta1,
    const float beta2, const float eps, const float weight_decay, const int step, const float lr,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit1State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, float* unorm, const float beta1,
    const float beta2, const float eps, const int step, float* __restrict__ const quantiles1, float* max1,
    float* new_max1, const float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit1State(
    T* p, T* const g, unsigned char* state1, const float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* max1, float* new_max1, float weight_decay, const float gnorm_scale,
    const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit2State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, unsigned char* __restrict__ const state2,
    float* unorm, const float beta1, const float beta2, const float eps, const int step,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit2State(
    T* p, T* const g, unsigned char* state1, unsigned char* state2, const float* unorm, const float max_unorm,
    const float param_norm, const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit2StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2, const float beta1, const float beta2,
    const float beta3, const float alpha, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit1StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, const float beta1, const float beta2, const float eps,
    const int step, const float lr, float* __restrict__ const quantiles1, float* absmax1, float weight_decay,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);

template <typename T, int SPMM_ITEMS, int BITS>
__global__ void kspmm_coo_very_sparse_naive(
    int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
    float* __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB
);

template <int ITEMS_PER_THREAD, int THREADS>
__global__ void kdequant_mm_int32_fp16(
    int* __restrict__ const A, float* __restrict__ const rowStats, float* __restrict__ const colStats, half* out,
    half* __restrict__ const bias, const int numRows, const int numCols, const int n
);

template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
__global__ void kTransformRowToFormat(
    char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
);

template <typename T, int BITS, int THREADS>
__global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
template <typename T, int THREADS>
__global__ void kgemm_4bit_inference(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc,
    int blocksize
);
template <typename T, int THREADS, int BITS>
__global__ void kgemm_4bit_inference_naive(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, const float* datatype, T* out,
    int lda, int ldb, int ldc, int blocksize
);

template <typename T, int FUNC> __global__ void kfunc(T* A, T* B, T value, long n);

#endif
@@ -5,37 +5,34 @@
#define NUM 4
#define NUM_BLOCK 4096

static inline MPSGraph* get_graph() {
    static MPSGraph* cur = nil;
    if (!cur) {
        cur = [[MPSGraph alloc] init];
    }
    return cur;
}

static inline id<MTLDevice> get_device() {
    NSError* error = nil;
    static id<MTLDevice> device = nil;
    if (!device) {
        device = MTLCreateSystemDefaultDevice();
    }
    if (!device) {
        NSLog(@"Failed to get MPS device");
        abort();
    }
    return device;
}

static inline id<MTLLibrary> get_library() {
    NSError* error = nil;
    static id<MTLLibrary> library = nil;
    if (!library) {
        library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error];
    }
    if (!library) {
        NSLog(@"Failed to load bitsandbytes.metallib");
        abort();
    }
@@ -44,20 +41,18 @@ static inline id<MTLLibrary> get_library()
/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n)
{
  id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0
  dataType:MPSDataTypeInt8 axis:0 name:@"out"]; return out;
}*/

// MPSGraph function for quantize
extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) {
    id<MTLDevice> device = get_device();
    id<MTLLibrary> library = get_library();

    static id<MTLFunction> kernel = nil;
    if (!kernel) {
        kernel = [library newFunctionWithName:@"quantize"];
        if (!kernel) {
            NSLog(@"Failed to load bitsandbytes.metallib");
            abort();
        }
    }
...
@@ -3,41 +3,41 @@
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#ifndef ops_H
#define ops_H

#include <assert.h>
#include <cstdint>
#include <iostream>
#include <stdio.h>

#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <cusparse.h>

#include <functional>
#include <vector>

#define CUDA_CHECK_RETURN(value)                                                                                  \
    {                                                                                                             \
        cudaError_t _m_cudaStat = value;                                                                          \
        if (_m_cudaStat != cudaSuccess) {                                                                         \
            fprintf(stderr, "Error %s at line %d in file %s\n", cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
            exit(1);                                                                                              \
        }                                                                                                         \
    }

#define CHECK_CUSPARSE(value)                                                                                     \
    {                                                                                                             \
        cusparseStatus_t _m_cudaStat = value;                                                                     \
        if (_m_cudaStat != CUSPARSE_STATUS_SUCCESS) {                                                             \
            fprintf(                                                                                              \
                stderr, "Error %s at line %d in file %s\n", cusparseGetErrorString(_m_cudaStat), __LINE__, __FILE__ \
            );                                                                                                    \
            exit(1);                                                                                              \
        }                                                                                                         \
    }

inline void checkCudaStatus(cudaError_t status) {
    if (status != cudaSuccess) {
@@ -49,19 +49,17 @@ inline void checkCudaStatus(cudaError_t status) {
inline int checkCublasStatus(cublasStatus_t status) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("cuBLAS API failed with status %d\n", status);
        // throw std::logic_error("cuBLAS API failed");
        return 1;
    }
    return 0;
}

typedef enum Operations_t {
    ksmul = 0,
} Operations_t;

typedef enum Optimizer_t {
    ADAM = 0,
    MOMENTUM = 1,
    RMSPROP = 2,
@@ -71,8 +69,7 @@ typedef enum Optimizer_t
    ADEMAMIX = 6
} Optimizer_t;

typedef enum Transform_t {
    ROW = 0,
    COL = 1,
    COL32 = 2,
@@ -80,109 +77,135 @@ typedef enum Transform_t
    COL_AMPERE = 4,
} Transform_t;

typedef enum DataType_t {
    General8bit = 0,
    FP4 = 1,
    NF4 = 2,
} DataType_t;

typedef enum Funcs_t {
    FILL = 0,
    ARANGE = 1,
    _MUL = 2,
} Funcs_t;

class Context {
  public:
    cublasHandle_t m_handle;

    Context() {
        cublasHandle_t handle;
        cublasCreate_v2(&handle);
        m_handle = handle;
    }
};

class ContextLt {
  public:
    cublasLtHandle_t m_handle;

    ContextLt() {
        cublasLtHandle_t handle;
        cublasLtCreate(&handle);
        m_handle = handle;
    }
};

class ContextCusparse {
  public:
    cusparseHandle_t m_handle;

    ContextCusparse() {
        cusparseHandle_t handle;
        cusparseCreate(&handle);
        m_handle = handle;
    }
};

void quantize(float* code, float* A, unsigned char* out, int n);
void dequantize(float* code, unsigned char* A, float* out, int n, cudaStream_t stream);
template <typename T, int STOCHASTIC, int DATA_TYPE>
void quantizeBlockwise(
    float* code, T* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
);
template <typename T, int DATA_TYPE>
void dequantizeBlockwise(
    float* code, unsigned char* A, float* absmax, T* out, int block_size, const int n, cudaStream_t stream
);

template <typename T, int OPTIMIZER>
void optimizer32bit(
    T* g, T* p, float* state1, float* state2, float* unorm, float max_unorm, float param_norm, float beta1, float beta2,
    float beta3, float alpha, float eps, float weight_decay, int step, float lr, const float gnorm_scale,
    bool skip_zeros, int n
);

template <typename T, int OPTIMIZER>
void optimizerStatic8bit(
    T* p, T* g, unsigned char* state1, unsigned char* state2, float* unorm, float max_unorm, float param_norm,
    float beta1, float beta2, float eps, int step, float lr, float* quantiles1, float* quantiles2, float* max1,
    float* max2, float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, int n
);

template <typename T, int OPTIMIZER>
void optimizerStatic8bitBlockwise(
    T* p, T* g, unsigned char* state1, unsigned char* state2, float beta1, float beta2, float beta3, float alpha,
    float eps, int step, float lr, float* quantiles1, float* quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, bool skip_zeros, int n
);

template <typename T> void percentileClipping(T* g, float* gnorm_vec, int step, const int n);

void gemmex(
    Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
    int ldb, int ldc
);
void strided_gemmex(
    Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
    int ldb, int ldc, long long int strideA, long long int strideB, long long int strideC, int batchCount
);

template <int DTYPE_OUT, int SCALE_ROWS>
int igemmlt(
    cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t* A, const int8_t* B, void* C, float* row_scale,
    int lda, int ldb, int ldc, cudaStream_t stream
);
void cutlass_igemm(
    bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda, int ldb, int ldc
);
void dequant_mm_int32_fp16(
    int* A, float* rowStats, float* colStats, half* out, half* bias, int numRows, int numCols, cudaStream_t stream
);
void getRowStats(half* A, float* rowStats, float threshold, int rows, int cols, cudaStream_t stream);
void int8VectorQuant(
    half* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols, cudaStream_t stream
);

void spmm_coo(
    cusparseHandle_t handle, int* A_rowidx, int* A_colidx, half* A_vals, int A_nnz, int A_rows, int A_cols, int B_cols,
    int ldb, half* B, int ldc, half* C, bool transposed_B
);
template <typename T, int BITS>
void spmm_coo_very_sparse_naive(
    int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
    float* dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB
);

void matmul4bite(half* A, unsigned char* B, half* out, int lda, int ldb, int rowsA, int colsA, int colsB);

template <typename T> void gemm_host(int m, int n, int k, T* A, T* B, T* out, int lda, int ldb, int ldc, int bits);
template <typename T>
void gemm_4bit_inference(
    int m, int n, int k, T* A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc, int blocksize
);
template <typename T, int BITS>
void gemm_4bit_inference_naive(
    int m, int n, int k, T* A, unsigned char* B, float* absmax, float* datatype, T* out, int lda, int ldb, int ldc,
    int blocksize, cudaStream_t stream
);

template <typename T, int FUNC> void func(T* A, T* B, T value, long n);

#endif
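As a usage note for the error-checking macros reformatted near the top of this header, here is a hedged sketch of wrapping CUDA runtime calls with CUDA_CHECK_RETURN; the buffer and its size are arbitrary placeholders, not code from this commit.

#include <cuda_runtime_api.h>

// Illustrative only: each runtime call is wrapped so a failure prints the CUDA
// error string with file and line, then exits, per the macro body above.
void check_macro_example() {
    float* d_buf = nullptr;
    CUDA_CHECK_RETURN(cudaMalloc((void**)&d_buf, 1024 * sizeof(float)));
    CUDA_CHECK_RETURN(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
    CUDA_CHECK_RETURN(cudaFree(d_buf));
}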