Unverified Commit be5cecb8 authored by Tim Dettmers's avatar Tim Dettmers Committed by GitHub
Browse files

Merge branch 'main' into main

parents 8724c990 f0ec93d0
...@@ -12,13 +12,13 @@ import torch ...@@ -12,13 +12,13 @@ import torch
import bitsandbytes.functional as F import bitsandbytes.functional as F
class MockArgs(object): class MockArgs:
def __init__(self, initial_data): def __init__(self, initial_data):
for key in initial_data: for key in initial_data:
setattr(self, key, initial_data[key]) setattr(self, key, initial_data[key])
class GlobalOptimManager(object): class GlobalOptimManager:
_instance = None _instance = None
def __init__(self): def __init__(self):
...@@ -56,9 +56,9 @@ class GlobalOptimManager(object): ...@@ -56,9 +56,9 @@ class GlobalOptimManager(object):
""" """
Overrides initial optimizer config for specific parameters. Overrides initial optimizer config for specific parameters.
The key-values of the optimizer config for the input parameters are overidden The key-values of the optimizer config for the input parameters are overridden
This can be both, optimizer parameters like "betas", or "lr" or it can be This can be both, optimizer parameters like "betas", or "lr" or it can be
8-bit specific paramters like "optim_bits", "percentile_clipping". 8-bit specific parameters like "optim_bits", "percentile_clipping".
Parameters Parameters
---------- ----------
...@@ -93,13 +93,12 @@ class GlobalOptimManager(object): ...@@ -93,13 +93,12 @@ class GlobalOptimManager(object):
class Optimizer8bit(torch.optim.Optimizer): class Optimizer8bit(torch.optim.Optimizer):
def __init__(self, params, defaults, optim_bits=32): def __init__(self, params, defaults, optim_bits=32):
super(Optimizer8bit, self).__init__(params, defaults) super().__init__(params, defaults)
self.initialized = False self.initialized = False
self.name2qmap = {} self.name2qmap = {}
self.mng = GlobalOptimManager.get_instance() self.mng = GlobalOptimManager.get_instance()
self.non_castable_tensor_keys = set( self.non_castable_tensor_keys = {
[
"qmap1", "qmap1",
"qmap2", "qmap2",
"max1", "max1",
...@@ -112,8 +111,7 @@ class Optimizer8bit(torch.optim.Optimizer): ...@@ -112,8 +111,7 @@ class Optimizer8bit(torch.optim.Optimizer):
"absmax1", "absmax1",
"absmax2", "absmax2",
"unorm_vec", "unorm_vec",
] }
)
if optim_bits == 8: if optim_bits == 8:
self.fill_qmap() self.fill_qmap()
...@@ -123,7 +121,7 @@ class Optimizer8bit(torch.optim.Optimizer): ...@@ -123,7 +121,7 @@ class Optimizer8bit(torch.optim.Optimizer):
self.name2qmap["udynamic"] = F.create_dynamic_map(signed=False) self.name2qmap["udynamic"] = F.create_dynamic_map(signed=False)
def __setstate__(self, state): def __setstate__(self, state):
super(Optimizer8bit, self).__setstate__(state) super().__setstate__(state)
def load_state_dict(self, state_dict): def load_state_dict(self, state_dict):
r"""Loads the optimizer state. r"""Loads the optimizer state.
...@@ -155,8 +153,8 @@ class Optimizer8bit(torch.optim.Optimizer): ...@@ -155,8 +153,8 @@ class Optimizer8bit(torch.optim.Optimizer):
id_map = { id_map = {
old_id: p old_id: p
for old_id, p in zip( for old_id, p in zip(
chain.from_iterable((g["params"] for g in saved_groups)), chain.from_iterable(g["params"] for g in saved_groups),
chain.from_iterable((g["params"] for g in groups)), chain.from_iterable(g["params"] for g in groups),
) )
} }
...@@ -284,11 +282,11 @@ class Optimizer8bit(torch.optim.Optimizer): ...@@ -284,11 +282,11 @@ class Optimizer8bit(torch.optim.Optimizer):
return config return config
def init_state(self, group, p, gindex, pindex): def init_state(self, group, p, gindex, pindex):
raise NotImplementedError(f"init_state method needs to be overidden") raise NotImplementedError("init_state method needs to be overridden")
def update_step(self, group, p, gindex, pindex): def update_step(self, group, p, gindex, pindex):
raise NotImplementedError( raise NotImplementedError(
f"The update_step method needs to be overidden" "The update_step method needs to be overridden"
) )
...@@ -310,9 +308,9 @@ class Optimizer2State(Optimizer8bit): ...@@ -310,9 +308,9 @@ class Optimizer2State(Optimizer8bit):
skip_zeros=False, skip_zeros=False,
): ):
if not 0.0 <= lr: if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr)) raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps: if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps)) raise ValueError(f"Invalid epsilon value: {eps}")
if isinstance(betas, str): if isinstance(betas, str):
# format: '(beta1, beta2)' # format: '(beta1, beta2)'
betas = betas.replace("(", "").replace(")", "").strip().split(",") betas = betas.replace("(", "").replace(")", "").strip().split(",")
...@@ -324,10 +322,10 @@ class Optimizer2State(Optimizer8bit): ...@@ -324,10 +322,10 @@ class Optimizer2State(Optimizer8bit):
) )
if not 0.0 <= weight_decay: if not 0.0 <= weight_decay:
raise ValueError( raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay) f"Invalid weight_decay value: {weight_decay}"
) )
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
super(Optimizer2State, self).__init__(params, defaults, optim_bits) super().__init__(params, defaults, optim_bits)
if args is None: if args is None:
args = {} args = {}
...@@ -542,9 +540,9 @@ class Optimizer1State(Optimizer8bit): ...@@ -542,9 +540,9 @@ class Optimizer1State(Optimizer8bit):
skip_zeros=False, skip_zeros=False,
): ):
if not 0.0 <= lr: if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr)) raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps: if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps)) raise ValueError(f"Invalid epsilon value: {eps}")
for i in range(len(betas)): for i in range(len(betas)):
if not 0.0 <= betas[i] < 1.0: if not 0.0 <= betas[i] < 1.0:
raise ValueError( raise ValueError(
...@@ -552,10 +550,10 @@ class Optimizer1State(Optimizer8bit): ...@@ -552,10 +550,10 @@ class Optimizer1State(Optimizer8bit):
) )
if not 0.0 <= weight_decay: if not 0.0 <= weight_decay:
raise ValueError( raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay) f"Invalid weight_decay value: {weight_decay}"
) )
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
super(Optimizer1State, self).__init__(params, defaults, optim_bits) super().__init__(params, defaults, optim_bits)
if args is None: if args is None:
args = {} args = {}
......
...@@ -23,11 +23,11 @@ class RMSprop(Optimizer1State): ...@@ -23,11 +23,11 @@ class RMSprop(Optimizer1State):
): ):
if alpha == 0: if alpha == 0:
raise NotImplementedError( raise NotImplementedError(
f"RMSprop with alpha==0.0 is not supported!" "RMSprop with alpha==0.0 is not supported!"
) )
if centered: if centered:
raise NotImplementedError(f"Centered RMSprop is not supported!") raise NotImplementedError("Centered RMSprop is not supported!")
super(RMSprop, self).__init__( super().__init__(
"rmsprop", "rmsprop",
params, params,
lr, lr,
...@@ -59,11 +59,11 @@ class RMSprop8bit(Optimizer1State): ...@@ -59,11 +59,11 @@ class RMSprop8bit(Optimizer1State):
): ):
if alpha == 0: if alpha == 0:
raise NotImplementedError( raise NotImplementedError(
f"RMSprop with alpha==0.0 is not supported!" "RMSprop with alpha==0.0 is not supported!"
) )
if centered: if centered:
raise NotImplementedError(f"Centered RMSprop is not supported!") raise NotImplementedError("Centered RMSprop is not supported!")
super(RMSprop8bit, self).__init__( super().__init__(
"rmsprop", "rmsprop",
params, params,
lr, lr,
...@@ -96,11 +96,11 @@ class RMSprop32bit(Optimizer1State): ...@@ -96,11 +96,11 @@ class RMSprop32bit(Optimizer1State):
if alpha == 0: if alpha == 0:
raise NotImplementedError( raise NotImplementedError(
f"RMSprop with alpha==0.0 is not supported!" "RMSprop with alpha==0.0 is not supported!"
) )
if centered: if centered:
raise NotImplementedError(f"Centered RMSprop is not supported!") raise NotImplementedError("Centered RMSprop is not supported!")
super(RMSprop32bit, self).__init__( super().__init__(
"rmsprop", "rmsprop",
params, params,
lr, lr,
......
...@@ -21,8 +21,8 @@ class SGD(Optimizer1State): ...@@ -21,8 +21,8 @@ class SGD(Optimizer1State):
block_wise=True, block_wise=True,
): ):
if momentum == 0: if momentum == 0:
raise NotImplementedError(f"SGD without momentum is not supported!") raise NotImplementedError("SGD without momentum is not supported!")
super(SGD, self).__init__( super().__init__(
"momentum", "momentum",
params, params,
lr, lr,
...@@ -52,8 +52,8 @@ class SGD8bit(Optimizer1State): ...@@ -52,8 +52,8 @@ class SGD8bit(Optimizer1State):
block_wise=True, block_wise=True,
): ):
if momentum == 0: if momentum == 0:
raise NotImplementedError(f"SGD without momentum is not supported!") raise NotImplementedError("SGD without momentum is not supported!")
super(SGD8bit, self).__init__( super().__init__(
"momentum", "momentum",
params, params,
lr, lr,
...@@ -83,8 +83,8 @@ class SGD32bit(Optimizer1State): ...@@ -83,8 +83,8 @@ class SGD32bit(Optimizer1State):
block_wise=True, block_wise=True,
): ):
if momentum == 0: if momentum == 0:
raise NotImplementedError(f"SGD without momentum is not supported!") raise NotImplementedError("SGD without momentum is not supported!")
super(SGD32bit, self).__init__( super().__init__(
"momentum", "momentum",
params, params,
lr, lr,
......
...@@ -4,7 +4,7 @@ Basic steps. ...@@ -4,7 +4,7 @@ Basic steps.
1. `make [target]` where `[target]` is among `cuda92, cuda10x, cuda110, cuda11x, cpuonly` 1. `make [target]` where `[target]` is among `cuda92, cuda10x, cuda110, cuda11x, cpuonly`
2. `CUDA_VERSION=XXX python setup.py install` 2. `CUDA_VERSION=XXX python setup.py install`
To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive). To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive).
For your convenience, there is an installation script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this you can add this to your `.bashrc` by executing these commands: For your convenience, there is an installation script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this you can add this to your `.bashrc` by executing these commands:
```bash ```bash
...@@ -13,7 +13,7 @@ echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc ...@@ -13,7 +13,7 @@ echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc
source ~/.bashrc source ~/.bashrc
``` ```
By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler. By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.
Either `nvcc` needs to be in path for the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed Either `nvcc` needs to be in path for the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed
......
...@@ -62,7 +62,7 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long ...@@ -62,7 +62,7 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
for (int i = 0; i < valid_chunks; i++) for (int i = 0; i < valid_chunks; i++)
int err = pthread_join(threads[i], NULL); int err = pthread_join(threads[i], NULL);
free(threads); free(threads);
for (int i = 0; i < valid_chunks; i++) for (int i = 0; i < valid_chunks; i++)
free(args[i]); free(args[i]);
......
This diff is collapsed.
// Copyright (c) Facebook, Inc. and its affiliates. // Copyright (c) Facebook, Inc. and its affiliates.
// //
// This source code is licensed under the MIT license found in the // This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include <float.h> #include <float.h>
...@@ -15,52 +15,52 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c ...@@ -15,52 +15,52 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n); __global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, T *out, const int n); template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS> template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p, __global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
float* state1, float* state2, float *unorm, float* state1, float* state2, float *unorm,
const float beta1, const float beta2, const float eps, const float weight_decay, const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n); const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER> template<typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(T* g, T* p, __global__ void kOptimizer32bit2State(T* g, T* p,
float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm, float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2, const float eps, const float weight_decay, const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n); const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS> template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(T* g, T* p, __global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
float* state1, float *unorm, float* state1, float *unorm,
const float beta1, const float eps, const float weight_decay, const float beta1, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n); const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER> template<typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(T* g, T* p, __global__ void kOptimizer32bit1State(T* g, T* p,
float* state1, float *unorm, const float max_unorm, const float param_norm, float* state1, float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float eps, const float weight_decay, const float beta1, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n); const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
template<typename T, int OPTIMIZER> template<typename T, int OPTIMIZER>
__global__ void __global__ void
kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1, kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1,
float *unorm, float *unorm,
const float beta1, const float beta1,
const float eps, const int step, const float eps, const int step,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles1,
float* max1, float* new_max1, float* max1, float* new_max1,
const float weight_decay, const float weight_decay,
const float gnorm_scale, const int n); const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER> template<typename T, int OPTIMIZER>
__global__ void __global__ void
kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1, kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
const float *unorm, const float max_unorm, const float param_norm, const float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta1,
const float eps, const int step, const float lr, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles1,
float* max1, float* new_max1, float* max1, float* new_max1,
float weight_decay, const float gnorm_scale, const int n); float weight_decay, const float gnorm_scale, const int n);
...@@ -70,7 +70,7 @@ __global__ void ...@@ -70,7 +70,7 @@ __global__ void
kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1, unsigned char* __restrict__ const state2, kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1, unsigned char* __restrict__ const state2,
float *unorm, float *unorm,
const float beta1, const float beta2, const float beta1, const float beta2,
const float eps, const int step, const float eps, const int step,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2, float* max1, float* max2, float* new_max1, float* new_max2,
const float gnorm_scale, const int n); const float gnorm_scale, const int n);
...@@ -81,7 +81,7 @@ __global__ void ...@@ -81,7 +81,7 @@ __global__ void
kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2, kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
const float *unorm, const float max_unorm, const float param_norm, const float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2, const float beta1, const float beta2,
const float eps, const int step, const float lr, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2, float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay, const float gnorm_scale, const int n); float weight_decay, const float gnorm_scale, const int n);
...@@ -121,5 +121,3 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T ...@@ -121,5 +121,3 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
#endif #endif
// Copyright (c) Facebook, Inc. and its affiliates. // Copyright (c) Facebook, Inc. and its affiliates.
// //
// This source code is licensed under the MIT license found in the // This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#include <ops.cuh> #include <ops.cuh>
...@@ -50,11 +50,29 @@ void dequantize(float *code, unsigned char *A, float *out, int n) ...@@ -50,11 +50,29 @@ void dequantize(float *code, unsigned char *A, float *out, int n)
CUDA_CHECK_RETURN(cudaPeekAtLastError()); CUDA_CHECK_RETURN(cudaPeekAtLastError());
} }
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n) template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n)
{ {
int num_blocks = n/4096; int num_blocks = n/blocksize;
num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1; num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<num_blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n); if(STOCHASTIC == 1)
assert(blocksize == 4096);
if(blocksize == 4096)
kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<num_blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
else if(blocksize == 2048)
kQuantizeBlockwise<T, 2048, 4, 0><<<num_blocks, 512>>>(code, A, absmax, out, rand, rand_offset, n);
else if(blocksize == 1024)
kQuantizeBlockwise<T, 1024, 4, 0><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
else if(blocksize == 512)
kQuantizeBlockwise<T, 512, 2, 0><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
else if(blocksize == 256)
kQuantizeBlockwise<T, 256, 2, 0><<<num_blocks, 128>>>(code, A, absmax, out, rand, rand_offset, n);
else if(blocksize == 128)
kQuantizeBlockwise<T, 128, 2, 0><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
else if(blocksize == 64)
kQuantizeBlockwise<T, 64, 1, 0><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError()); CUDA_CHECK_RETURN(cudaPeekAtLastError());
} }
...@@ -66,6 +84,17 @@ template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, flo ...@@ -66,6 +84,17 @@ template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, flo
kDequantizeBlockwise<T, 4096, 1024, 4><<<num_blocks, 4096/4>>>(code, A, absmax, out, n); kDequantizeBlockwise<T, 4096, 1024, 4><<<num_blocks, 4096/4>>>(code, A, absmax, out, n);
else if(blocksize == 2048) else if(blocksize == 2048)
kDequantizeBlockwise<T, 2048, 512, 4><<<num_blocks, 2048/4>>>(code, A, absmax, out, n); kDequantizeBlockwise<T, 2048, 512, 4><<<num_blocks, 2048/4>>>(code, A, absmax, out, n);
else if(blocksize == 1024)
kDequantizeBlockwise<T, 1024, 256, 4><<<num_blocks, 1024/4>>>(code, A, absmax, out, n);
else if(blocksize == 512)
kDequantizeBlockwise<T, 512, 256, 2><<<num_blocks, 512/2>>>(code, A, absmax, out, n);
else if(blocksize == 256)
kDequantizeBlockwise<T, 256, 128, 2><<<num_blocks, 256/2>>>(code, A, absmax, out, n);
else if(blocksize == 128)
kDequantizeBlockwise<T, 128, 64, 2><<<num_blocks, 128/2>>>(code, A, absmax, out, n);
else if(blocksize == 64)
kDequantizeBlockwise<T, 64, 64, 1><<<num_blocks, 64/1>>>(code, A, absmax, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError()); CUDA_CHECK_RETURN(cudaPeekAtLastError());
} }
...@@ -212,7 +241,7 @@ void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, in ...@@ -212,7 +241,7 @@ void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, in
} }
void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc, void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
long long int strideA, long long int strideB, long long int strideC, int batchCount) long long int strideA, long long int strideB, long long int strideC, int batchCount)
{ {
const int falpha = 1; const int falpha = 1;
...@@ -322,7 +351,7 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans ...@@ -322,7 +351,7 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
cublasLtOrder_t orderOut = get_order<TARGET>(); cublasLtOrder_t orderOut = get_order<TARGET>();
int ldA = get_leading_dim<SRC>(dim1, dim2); int ldA = get_leading_dim<SRC>(dim1, dim2);
int ldOut = get_leading_dim<TARGET>(dim1, dim2); int ldOut = get_leading_dim<TARGET>(dim1, dim2);
cublasLtMatrixLayout_t A_desc = NULL, out_desc = NULL; cublasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
cublasLtMatrixTransformDesc_t A2Out_desc = NULL; cublasLtMatrixTransformDesc_t A2Out_desc = NULL;
cublasOperation_t opTranspose = CUBLAS_OP_T; cublasOperation_t opTranspose = CUBLAS_OP_T;
...@@ -368,7 +397,7 @@ template void transform<int8_t, ROW, COL_AMPERE, false, 8>(cublasLtHandle_t ltHa ...@@ -368,7 +397,7 @@ template void transform<int8_t, ROW, COL_AMPERE, false, 8>(cublasLtHandle_t ltHa
template void transform<int8_t, COL32, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2); template void transform<int8_t, COL32, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2); template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);
template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
{ {
#ifdef NO_CUBLASLT #ifdef NO_CUBLASLT
cout << "" << endl; cout << "" << endl;
...@@ -659,10 +688,10 @@ template void transformRowToFormat<COL_AMPERE, 1>(char * A, char *out, int rows, ...@@ -659,10 +688,10 @@ template void transformRowToFormat<COL_AMPERE, 1>(char * A, char *out, int rows,
template void estimateQuantiles(half *A, float *code, float offset, int n); template void estimateQuantiles(half *A, float *code, float offset, int n);
template void estimateQuantiles(float *A, float *code, float offset, int n); template void estimateQuantiles(float *A, float *code, float offset, int n);
template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n); template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n); template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n); template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n); template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void dequantizeBlockwise<half>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n); template void dequantizeBlockwise<half>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
template void dequantizeBlockwise<float>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n); template void dequantizeBlockwise<float>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
......
// Copyright (c) Facebook, Inc. and its affiliates. // Copyright (c) Facebook, Inc. and its affiliates.
// //
// This source code is licensed under the MIT license found in the // This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
...@@ -128,10 +128,10 @@ template <typename T> void estimateQuantiles(T *A, float *code, float offset, in ...@@ -128,10 +128,10 @@ template <typename T> void estimateQuantiles(T *A, float *code, float offset, in
void quantize(float *code, float *A, unsigned char *out, int n); void quantize(float *code, float *A, unsigned char *out, int n);
void dequantize(float *code, unsigned char *A, float *out, int n); void dequantize(float *code, unsigned char *A, float *out, int n);
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n); template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n); template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p, template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
float* state1, float* state2, float *unorm, float max_unorm, float param_norm, float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
float beta1, float beta2, float eps, float weight_decay, float beta1, float beta2, float eps, float weight_decay,
int step, float lr, const float gnorm_scale, bool skip_zeros, int n); int step, float lr, const float gnorm_scale, bool skip_zeros, int n);
...@@ -139,15 +139,15 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p, ...@@ -139,15 +139,15 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2, template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
float *unorm, float max_unorm, float param_norm, float *unorm, float max_unorm, float param_norm,
float beta1, float beta2, float beta1, float beta2,
float eps, int step, float lr, float eps, int step, float lr,
float* quantiles1, float* quantiles2, float* quantiles1, float* quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2, float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay, float weight_decay,
const float gnorm_scale, int n); const float gnorm_scale, int n);
template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g, template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale,
bool skip_zeros, int n); bool skip_zeros, int n);
template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n); template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);
...@@ -155,7 +155,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, ...@@ -155,7 +155,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n); void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);
void gemmex(Context * context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc); void gemmex(Context * context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc, void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
long long int strideA, long long int strideB, long long int strideC, int batchCount); long long int strideA, long long int strideB, long long int strideC, int batchCount);
......
// Copyright (c) Facebook, Inc. and its affiliates. // Copyright (c) Facebook, Inc. and its affiliates.
// //
// This source code is licensed under the MIT license found in the // This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#if BUILD_CUDA #if BUILD_CUDA
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include <cpu_ops.h> #include <cpu_ops.h>
// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to // We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to
// maintain all that boilerplate // maintain all that boilerplate
//=================================================================================== //===================================================================================
// UNMANGLED CALLS // UNMANGLED CALLS
...@@ -75,10 +75,10 @@ MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, 32) ...@@ -75,10 +75,10 @@ MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, 32)
void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); } void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); } void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<half, 0>(code, A, absmax, out, NULL, 0, n); } void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0>(code, A, absmax, out, NULL, 0, blocksize, n); }
void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<float, 0>(code, A, absmax, out, NULL, 0, n); } void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0>(code, A, absmax, out, NULL, 0, blocksize, n); }
void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<half, 1>(code, A, absmax, out, rand, rand_offset, n); } void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<half, 1>(code, A, absmax, out, rand, rand_offset, 4096, n); }
void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<float, 1>(code, A, absmax, out, rand, rand_offset, n); } void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<float, 1>(code, A, absmax, out, rand, rand_offset, 4096, n); }
void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half>(code, A, absmax, out, blocksize, n); } \ void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half>(code, A, absmax, out, blocksize, n); } \
void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float>(code, A, absmax, out, blocksize, n); } void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float>(code, A, absmax, out, blocksize, n); }
...@@ -140,8 +140,8 @@ extern "C" ...@@ -140,8 +140,8 @@ extern "C"
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); } void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); } void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); } void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, n); } void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, n); } void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); } void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); }
void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); } void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); }
...@@ -290,4 +290,3 @@ extern "C" ...@@ -290,4 +290,3 @@ extern "C"
void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
} }
...@@ -76,6 +76,3 @@ if [[ -n "$CUDA_VERSION" ]]; then ...@@ -76,6 +76,3 @@ if [[ -n "$CUDA_VERSION" ]]; then
else else
echo "" echo ""
fi fi
...@@ -14,16 +14,16 @@ mng.register_parameters(model.parameters()) # 1. register parameters while still ...@@ -14,16 +14,16 @@ mng.register_parameters(model.parameters()) # 1. register parameters while still
model = model.cuda() model = model.cuda()
# use 8-bit optimizer states for all parameters # use 8-bit optimizer states for all parameters
adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8)
# 2a. override: the parameter model.fc1.weight now uses 32-bit Adam # 2a. override: the parameter model.fc1.weight now uses 32-bit Adam
mng.override_config(model.fc1.weight, 'optim_bits', 32) mng.override_config(model.fc1.weight, 'optim_bits', 32)
# 2b. override: the two special layers use # 2b. override: the two special layers use
# sparse optimization + different learning rate + different Adam betas # sparse optimization + different learning rate + different Adam betas
mng.override_config([model.special.weight, model.also_special.weight], mng.override_config([model.special.weight, model.also_special.weight],
key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)}) key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)})
``` ```
Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm` Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm`
For overrides for particular layers we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager: For overrides for particular layers we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:
......
...@@ -121,7 +121,7 @@ template <unsigned char Gap, typename T> ...@@ -121,7 +121,7 @@ template <unsigned char Gap, typename T>
struct DirectTraits<true,Gap,T> struct DirectTraits<true,Gap,T>
{ {
typedef FVec1<SSE, T> fVec1; typedef FVec1<SSE, T> fVec1;
static void checkH(T scaler, T H_Times_x0, T xN) static void checkH(T scaler, T H_Times_x0, T xN)
{ {
union { union {
...@@ -177,9 +177,9 @@ struct DirectInfo ...@@ -177,9 +177,9 @@ struct DirectInfo
, cst0(fun_t::cst0(H, x[0])) , cst0(fun_t::cst0(H, x[0]))
{ {
myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned"); myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned");
uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]); uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]);
const uint32 npad = Gap-1; const uint32 npad = Gap-1;
const uint32 n_sz = n + npad; // size of padded vector const uint32 n_sz = n + npad; // size of padded vector
...@@ -320,7 +320,7 @@ struct DirectInfo ...@@ -320,7 +320,7 @@ struct DirectInfo
T cst0 = fun_t::cst0(H, px[0]); T cst0 = fun_t::cst0(H, px[0]);
const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]); const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]);
buckets.resize(maxIndex + 1); buckets.resize(maxIndex + 1);
data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL)); data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL));
} }
......
...@@ -203,7 +203,7 @@ struct IVec<SSE, double> : IVecBase<SSE> ...@@ -203,7 +203,7 @@ struct IVec<SSE, double> : IVecBase<SSE>
#if 1 #if 1
// takes 4 cycles // takes 4 cycles
__m128i hi = _mm_shuffle_epi32(vec, 2); // 1 cycle __m128i hi = _mm_shuffle_epi32(vec, 2); // 1 cycle
__m128i s = _mm_add_epi32(vec, hi); __m128i s = _mm_add_epi32(vec, hi);
int32 x = _mm_cvtsi128_si32(s); int32 x = _mm_cvtsi128_si32(s);
return -x; return -x;
#else #else
......
...@@ -18,7 +18,7 @@ def read(fname): ...@@ -18,7 +18,7 @@ def read(fname):
setup( setup(
name=f"bitsandbytes", name=f"bitsandbytes",
version=f"0.35.3", version=f"0.35.4",
author="Tim Dettmers", author="Tim Dettmers",
author_email="dettmers@cs.washington.edu", author_email="dettmers@cs.washington.edu",
description="8-bit optimizers and matrix multiplication routines.", description="8-bit optimizers and matrix multiplication routines.",
...@@ -26,9 +26,6 @@ setup( ...@@ -26,9 +26,6 @@ setup(
keywords="gpu optimizers optimization 8-bit quantization compression", keywords="gpu optimizers optimization 8-bit quantization compression",
url="https://github.com/TimDettmers/bitsandbytes", url="https://github.com/TimDettmers/bitsandbytes",
packages=find_packages(), packages=find_packages(),
entry_points={
"console_scripts": ["debug_cuda = bitsandbytes.debug_cli:cli"],
},
package_data={"": libs}, package_data={"": libs},
long_description=read("README.md"), long_description=read("README.md"),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
......
from itertools import product, permutations from itertools import permutations, product
import pytest import pytest
import torch import torch
...@@ -27,7 +27,7 @@ str_values = list( ...@@ -27,7 +27,7 @@ str_values = list(
) )
) )
names = [ names = [
"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_func_{4}_dtype_{5}_requires_grad_{6}_transpose_{7}".format( "dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(
*vals *vals
) )
for vals in str_values for vals in str_values
...@@ -286,7 +286,7 @@ str_values = list( ...@@ -286,7 +286,7 @@ str_values = list(
has_bias has_bias
) )
) )
names = ["dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_func_{4}_dtype_{5}_requires_grad_{6}_transpose_{7}_decomp_{8}_has_fp16_weights_{9}_has_bias_{10}".format(*vals) for vals in str_values] names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values]
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -336,7 +336,7 @@ def test_matmullt( ...@@ -336,7 +336,7 @@ def test_matmullt(
) )
bias = None bias = None
bias2 = None bias2 = None
if has_bias: if has_bias:
bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2])
bias2 = bias.clone() bias2 = bias.clone()
torch.nn.init.xavier_uniform_(B) torch.nn.init.xavier_uniform_(B)
......
import os import os
import pytest
import bitsandbytes as bnb
from typing import List, NamedTuple from typing import List, NamedTuple
import pytest
import bitsandbytes as bnb
from bitsandbytes.cuda_setup import ( from bitsandbytes.cuda_setup import (
CUDA_RUNTIME_LIB, CUDA_RUNTIME_LIB,
evaluate_cuda_setup,
determine_cuda_runtime_lib_path, determine_cuda_runtime_lib_path,
evaluate_cuda_setup,
extract_candidate_paths, extract_candidate_paths,
) )
......
...@@ -6,12 +6,14 @@ from itertools import product ...@@ -6,12 +6,14 @@ from itertools import product
import einops import einops
import pytest import pytest
import torch import torch
import numpy as np
import bitsandbytes as bnb import bitsandbytes as bnb
from bitsandbytes import functional as F from bitsandbytes import functional as F
from scipy.stats import norm
torch.set_printoptions( torch.set_printoptions(
precision=4, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000
) )
k = 20 k = 20
...@@ -26,7 +28,7 @@ def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0): ...@@ -26,7 +28,7 @@ def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0):
class FFN(torch.nn.Module): class FFN(torch.nn.Module):
def __init__(self, input_features, hidden_size, bias=True): def __init__(self, input_features, hidden_size, bias=True):
super(FFN, self).__init__() super().__init__()
self.fc1 = torch.nn.Linear(input_features, hidden_size, bias=bias) self.fc1 = torch.nn.Linear(input_features, hidden_size, bias=bias)
self.fc2 = torch.nn.Linear(hidden_size, input_features, bias=bias) self.fc2 = torch.nn.Linear(hidden_size, input_features, bias=bias)
...@@ -40,7 +42,7 @@ class FFN(torch.nn.Module): ...@@ -40,7 +42,7 @@ class FFN(torch.nn.Module):
return x return x
class Timer(object): class Timer:
def __init__(self): def __init__(self):
self.starts = {} self.starts = {}
self.ends = {} self.ends = {}
...@@ -67,7 +69,7 @@ class Timer(object): ...@@ -67,7 +69,7 @@ class Timer(object):
self.ends.pop(name) self.ends.pop(name)
if print_ms and name in self.agg: if print_ms and name in self.agg:
print("{0} took: {1:.5f}s".format(name, self.agg[name] / 1000.0)) print(f"{name} took: {self.agg[name] / 1000.0:.5f}s")
return self.agg[name] return self.agg[name]
...@@ -149,30 +151,41 @@ def test_dynamic_quantization(): ...@@ -149,30 +151,41 @@ def test_dynamic_quantization():
def test_dynamic_blockwise_quantization(): def test_dynamic_blockwise_quantization():
diffs = [] #print('')
reldiffs = [] for blocksize in [4096, 2048, 1024, 512]:
for i in range(100): diffs = []
A1 = torch.randn(1024, 1024, device="cuda") reldiffs = []
C, S = F.quantize_blockwise(A1) for i in range(100):
A2 = F.dequantize_blockwise(C, S) A1 = torch.randn(1024, 1024, device="cuda")
diff = torch.abs(A1 - A2) C, S = F.quantize_blockwise(A1, blocksize=blocksize)
reldiff = diff / torch.abs(A1 + 1e-8) A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
diffs.append(diff.mean().item()) diff = torch.abs(A1 - A2)
reldiffs.append(reldiff.mean().item()) reldiff = diff / torch.abs(A1 + 1e-8)
assert diffs[-1] < 0.011 diffs.append(diff.mean().item())
# print(sum(diffs)/len(diffs)) reldiffs.append(reldiff.mean().item())
# print(sum(reldiffs)/len(reldiffs)) abserr = sum(diffs)/len(diffs)
relerr = sum(reldiffs)/len(reldiffs)
diffs = [] assert abserr < 0.011
for i in range(100): assert relerr < 0.018
A1 = torch.rand(1024, 1024, device="cuda") #print('randn', blocksize, sum(diffs)/len(diffs))
C, S = F.quantize_blockwise(A1) #print('randn', blocksize, sum(reldiffs)/len(reldiffs))
A2 = F.dequantize_blockwise(C, S)
diff = torch.abs(A1 - A2).mean().item() diffs = []
assert diff < 0.0033 for i in range(100):
diffs.append(diff) A1 = torch.rand(1024, 1024, device="cuda")
torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0) C, S = F.quantize_blockwise(A1, blocksize=blocksize)
# print(sum(diffs)/len(diffs)) A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
diff = torch.abs(A1 - A2)
reldiff = diff / torch.abs(A1 + 1e-8)
diffs.append(diff.mean().item())
reldiffs.append(reldiff.mean().item())
#torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0)
abserr = sum(diffs)/len(diffs)
relerr = sum(reldiffs)/len(reldiffs)
assert abserr < 0.0035
assert relerr < 0.015
#print('rand', blocksize, sum(diffs)/len(diffs))
#print('rand', blocksize, sum(reldiffs)/len(reldiffs))
def test_dynamic_blockwise_stochastic_quantization(): def test_dynamic_blockwise_stochastic_quantization():
...@@ -289,7 +302,7 @@ batched = [False, True] ...@@ -289,7 +302,7 @@ batched = [False, True]
values = list(product(dim1, dim2, methods, batched)) values = list(product(dim1, dim2, methods, batched))
values_names = list(product(dim1, dim2, method_names, batched)) values_names = list(product(dim1, dim2, method_names, batched))
names = [ names = [
"dim1_{0}_dim2_{1}_quant_{2}_batched_{3}".format(*vals) "dim1_{}_dim2_{}_quant_{}_batched_{}".format(*vals)
for vals in values_names for vals in values_names
] ]
...@@ -347,7 +360,7 @@ seq_dim = torch.randint(16, 256, size=(n,)).tolist() ...@@ -347,7 +360,7 @@ seq_dim = torch.randint(16, 256, size=(n,)).tolist()
transpose = [(False, False), (False, True), (True, False), (True, True)] transpose = [(False, False), (False, True), (True, False), (True, True)]
values = list(product(hidden_dim, batch_dim, transpose, seq_dim)) values = list(product(hidden_dim, batch_dim, transpose, seq_dim))
names = [ names = [
"hidden_dim_{0}_batch_dim_{1},transpose_{2}_seq_dim_{3}".format(*vals) "hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -412,7 +425,7 @@ hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() ...@@ -412,7 +425,7 @@ hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist()
batch_dim = torch.randint(2, 16, size=(n,)).tolist() batch_dim = torch.randint(2, 16, size=(n,)).tolist()
values = list(product(seq_dim, hidden_dim, batch_dim)) values = list(product(seq_dim, hidden_dim, batch_dim))
names = [ names = [
"seq_dim{0}_hidden_dim{1}_batch_dim{2}".format(*vals) for vals in values "seq_dim{}_hidden_dim{}_batch_dim{}".format(*vals) for vals in values
] ]
...@@ -444,7 +457,7 @@ batch_dim = torch.randint(2, 16, size=(n,)).tolist() ...@@ -444,7 +457,7 @@ batch_dim = torch.randint(2, 16, size=(n,)).tolist()
transpose = [False, True] transpose = [False, True]
values = list(product(seq_dim, hidden_dim, batch_dim, transpose)) values = list(product(seq_dim, hidden_dim, batch_dim, transpose))
names = [ names = [
"seq_dim={0}_hidden_dim={1}_batch_dim={2}_transpose{3}".format(*vals) "seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -529,7 +542,7 @@ dim4 = torch.randint(32, 256, size=(n,)).tolist() ...@@ -529,7 +542,7 @@ dim4 = torch.randint(32, 256, size=(n,)).tolist()
transpose = [(False, False), (True, False), (False, True), (True, True)] transpose = [(False, False), (True, False), (False, True), (True, True)]
values = list(product(dim1, dim2, dim3, dim4, transpose)) values = list(product(dim1, dim2, dim3, dim4, transpose))
names = [ names = [
"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_transpose_{4}".format(*vals) "dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -567,7 +580,7 @@ dim1 = torch.randint(1, 64, size=(n,)).tolist() ...@@ -567,7 +580,7 @@ dim1 = torch.randint(1, 64, size=(n,)).tolist()
dim2 = torch.randint(32, 128, size=(n,)).tolist() dim2 = torch.randint(32, 128, size=(n,)).tolist()
dim3 = torch.randint(32, 256, size=(n,)).tolist() dim3 = torch.randint(32, 256, size=(n,)).tolist()
values = list(product(dim1, dim2, dim3)) values = list(product(dim1, dim2, dim3))
names = ["dim1_{0}_dim2_{1}_dim3_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names) @pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names)
...@@ -596,7 +609,7 @@ transpose = [False] ...@@ -596,7 +609,7 @@ transpose = [False]
dims = [2, 3] dims = [2, 3]
values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)) values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose))
names = ["dim1_{0}_dim2_{1}_dim3_{2}_dims_{3}_dtype_{4}_orderA_{5}_orderOut_{6}_transpose_{7}".format(*vals)for vals in values] names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values]
@pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names)
...@@ -678,7 +691,7 @@ ldb = [0] ...@@ -678,7 +691,7 @@ ldb = [0]
# ldb = list(range(256, 1*1024, 256)) # ldb = list(range(256, 1*1024, 256))
values = list(product(dim1, dim2, dim3, dim4, dims, ldb)) values = list(product(dim1, dim2, dim3, dim4, dims, ldb))
names = [ names = [
"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_dims_{4}_ldb_{5}".format(*vals) "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -726,7 +739,7 @@ dims = (2,) ...@@ -726,7 +739,7 @@ dims = (2,)
# ldb = list(range(256, 1*1024, 256)) # ldb = list(range(256, 1*1024, 256))
values = list(product(dim1, dim2, dim3, dim4, dims)) values = list(product(dim1, dim2, dim3, dim4, dims))
names = [ names = [
"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_dims_{4}".format(*vals) "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -784,7 +797,7 @@ values = [ ...@@ -784,7 +797,7 @@ values = [
# values = list(product(batch, seq, model, hidden)) # values = list(product(batch, seq, model, hidden))
names = [ names = [
"batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values
] ]
...@@ -952,7 +965,7 @@ dims = (2,) ...@@ -952,7 +965,7 @@ dims = (2,)
formatB = ["col_turing", "col_ampere"] formatB = ["col_turing", "col_ampere"]
has_bias = [True, False] has_bias = [True, False]
values = list(product(dim1, dim4, dims, formatB, has_bias)) values = list(product(dim1, dim4, dims, formatB, has_bias))
names = ["dim1_{0}_dim4_{1}_dims_{2}_formatB_{3}_has_bias_{4}".format(*vals) for vals in values] names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names) @pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names)
...@@ -1002,7 +1015,7 @@ dim2 = [1 * 1024] ...@@ -1002,7 +1015,7 @@ dim2 = [1 * 1024]
dims = (2,) dims = (2,)
# ldb = list(range(256, 1*1024, 256)) # ldb = list(range(256, 1*1024, 256))
values = list(product(dim1, dim2, dims)) values = list(product(dim1, dim2, dims))
names = ["dim1_{0}_dim2_{1}_dims_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, dims", values, ids=names) @pytest.mark.parametrize("dim1, dim2, dims", values, ids=names)
...@@ -1058,7 +1071,7 @@ dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() ...@@ -1058,7 +1071,7 @@ dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
values = list(product(dim1, dim2)) values = list(product(dim1, dim2))
names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2", values, ids=names) @pytest.mark.parametrize("dim1, dim2", values, ids=names)
...@@ -1105,7 +1118,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() ...@@ -1105,7 +1118,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() inner = torch.randint(1, 4 * 1024, size=(n,)).tolist()
values = list(zip(dim1, dim4, inner)) values = list(zip(dim1, dim4, inner))
names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
...@@ -1149,7 +1162,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() ...@@ -1149,7 +1162,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() inner = torch.randint(1, 4 * 1024, size=(n,)).tolist()
values = list(zip(dim1, dim4, inner)) values = list(zip(dim1, dim4, inner))
names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
...@@ -1224,7 +1237,7 @@ inner = [12288 * 4, 4096 * 4] ...@@ -1224,7 +1237,7 @@ inner = [12288 * 4, 4096 * 4]
dim4 = [12288, 4096] dim4 = [12288, 4096]
values = list(zip(dim1, dim4, inner)) values = list(zip(dim1, dim4, inner))
names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
...@@ -1290,7 +1303,7 @@ values = list( ...@@ -1290,7 +1303,7 @@ values = list(
product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose) product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)
) )
names = [ names = [
"dim1_{0}_dim2_{1}_dim3_{2}_dims_{3}_dtype_{4}_orderA_{5}_orderOut_{6}_{7}".format( "dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}".format(
*vals *vals
) )
for vals in values for vals in values
...@@ -1341,7 +1354,7 @@ a_order = ["col_turing"] ...@@ -1341,7 +1354,7 @@ a_order = ["col_turing"]
out_order = ["row"] out_order = ["row"]
values = list(product(dim1, dim2, dtype, a_order, out_order)) values = list(product(dim1, dim2, dtype, a_order, out_order))
names = [ names = [
"dim1_{0}_dim2_{1}_dtype_{2}_orderA_{3}_orderOut_{4}".format(*vals) "dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -1367,7 +1380,7 @@ dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() ...@@ -1367,7 +1380,7 @@ dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
# dim2 = [5] # dim2 = [5]
values = list(product(dim1, dim2)) values = list(product(dim1, dim2))
names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2", values, ids=names) @pytest.mark.parametrize("dim1, dim2", values, ids=names)
...@@ -1404,7 +1417,7 @@ dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist() ...@@ -1404,7 +1417,7 @@ dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist()
# dim2 = [11] # dim2 = [11]
transposed_B = [False, True] transposed_B = [False, True]
values = list(product(dim1, dim2, transposed_B)) values = list(product(dim1, dim2, transposed_B))
names = ["dim1_{0}_dim2_{1}_transposed_B_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names) @pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names)
...@@ -1485,7 +1498,7 @@ n = 2 ...@@ -1485,7 +1498,7 @@ n = 2
dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist() dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist()
dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist() dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist()
values = list(product(dim1, dim2)) values = list(product(dim1, dim2))
names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2", values, ids=names) @pytest.mark.parametrize("dim1, dim2", values, ids=names)
...@@ -1550,7 +1563,7 @@ dtype = [torch.float16] ...@@ -1550,7 +1563,7 @@ dtype = [torch.float16]
out_function = ["zeros", "ones"] out_function = ["zeros", "ones"]
values = list(product(dim1, dim2, dtype, out_function)) values = list(product(dim1, dim2, dtype, out_function))
names = [ names = [
"dim1_{0}_dim2_{1}_dtype_{2}_out_func_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_dtype_{}_out_func_{}".format(*vals) for vals in values
] ]
...@@ -1616,17 +1629,6 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): ...@@ -1616,17 +1629,6 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func):
# print(time.time() - t0) # print(time.time() - t0)
def test_layout():
    """Visual inspection helper for the 'col_turing' transform layout.

    Fills a 16x64 matrix with sequential byte values 0..1023 so the layout
    permutation is visible, transforms it to col_turing order, and prints
    slices of both buffers for eye-balling the tiling. No assertions.
    """
    # NOTE: the original also created a torch.rand fp16 tensor here that was
    # immediately overwritten — dead code, removed.
    a1 = torch.arange(16 * 64, device="cuda").reshape(16, 64).byte()
    a2, s2 = F.transform(a1, "col_turing")
    print(a2.shape)
    # First 32 values of row 8 in the original row-major buffer.
    print(a1.flatten()[8 * 64 : 8 * 64 + 32])
    for i in range(4):
        # Presumably one 8x32 tile of the transformed buffer per iteration —
        # TODO confirm against the col_turing layout spec.
        print(a2.flatten()[i * 8 * 32 : i * 8 * 32 + 32], 0)
def test_coo2csr(): def test_coo2csr():
threshold = 1 threshold = 1
A = torch.randn(128, 128).half().cuda() A = torch.randn(128, 128).half().cuda()
...@@ -1678,7 +1680,7 @@ dim2 = [2048] ...@@ -1678,7 +1680,7 @@ dim2 = [2048]
# dim2 = [2] # dim2 = [2]
dtype = [torch.int8] dtype = [torch.int8]
values = list(product(dim1, dim2, dtype)) values = list(product(dim1, dim2, dtype))
names = ["dim1_{0}_dim2_{1}_dtype_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names) @pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names)
...@@ -1794,7 +1796,7 @@ values.append((batch_size, seqdim, 768, 4 * 768)) ...@@ -1794,7 +1796,7 @@ values.append((batch_size, seqdim, 768, 4 * 768))
# values.append((batch_size, seqdim, 5140, 4*5140)) # values.append((batch_size, seqdim, 5140, 4*5140))
#values.append((batch_size, seqdim, 12288, 4*12288)) #values.append((batch_size, seqdim, 12288, 4*12288))
names = [ names = [
"batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values
] ]
...@@ -2040,3 +2042,154 @@ def test_blockwise_cpu_large(): ...@@ -2040,3 +2042,154 @@ def test_blockwise_cpu_large():
assert diffs[-1] < 0.011 assert diffs[-1] < 0.011
# print(sum(diffs)/len(diffs)) # print(sum(diffs)/len(diffs))
# print(sum(reldiffs)/len(reldiffs)) # print(sum(reldiffs)/len(reldiffs))
def _blockwise_roundtrip_errors(make_input, code=None, trials=100):
    """Quantize/dequantize `trials` fresh tensors blockwise; return mean errors.

    Parameters
    ----------
    make_input : callable
        Zero-argument factory producing a fresh CUDA tensor per trial.
    code : torch.Tensor, optional
        Quantization code map passed to `F.quantize_blockwise`; when None the
        function's default code is used (call made without the kwarg, exactly
        as the original inline loop did).
    trials : int
        Number of random tensors to round-trip.

    Returns
    -------
    (float, float)
        Mean absolute error and mean relative error over all trials.
    """
    abserr = []
    relerr = []
    for _ in range(trials):
        A1 = make_input()
        if code is None:
            C, SC = F.quantize_blockwise(A1)
        else:
            C, SC = F.quantize_blockwise(A1, code=code)
        A2 = F.dequantize_blockwise(C, SC)
        diff = torch.abs(A1 - A2)
        # Epsilon added to the magnitude, not the value: the original
        # torch.abs(A1 + 1e-8) could still divide by ~zero when A1 ≈ -1e-8.
        reldiff = diff / (torch.abs(A1) + 1e-8)
        abserr.append(diff.mean().item())
        relerr.append(reldiff.mean().item())
    return sum(abserr) / len(abserr), sum(relerr) / len(relerr)


def test_fp8_quant():
    """Measure blockwise quantization error for every fp8 e/p bit split.

    For each exponent/precision split of 7 payload bits, round-trips random
    normal and uniform data through the fp8 code map, and normal data through
    the default (dynamic) code, collecting mean abs/rel errors. The error
    statistics are informational only — no thresholds are asserted.
    """
    for e_bits in range(1, 7):
        p_bits = 7 - e_bits
        code = F.create_fp8_map(True, e_bits, p_bits).cuda()
        print(e_bits, p_bits)

        # Standard-normal inputs through the fp8 code map.
        abserr, relerr = _blockwise_roundtrip_errors(
            lambda: torch.randn(1024, 1024, device="cuda"), code=code
        )
        #print(abserr)
        #print(relerr)

        # Uniform [0, 1) inputs through the fp8 code map.
        abserr, relerr = _blockwise_roundtrip_errors(
            lambda: torch.rand(1024, 1024, device="cuda"), code=code
        )
        #print(abserr)
        #print(relerr)

        # Standard-normal inputs through the default code map, for comparison.
        abserr, relerr = _blockwise_roundtrip_errors(
            lambda: torch.randn(1024, 1024, device="cuda")
        )
        #print(3, abserr)
        #print(3, relerr)
def test_few_bit_quant():
    """Cross-check blockwise quantization against a brute-force argmin encoder.

    For 2..8 bits and four code-map constructions (linear, fp8, dynamic,
    quantile), verifies that the code map has the expected number of distinct
    values and that F.quantize_blockwise picks (nearly) the same code indices
    as an explicit nearest-neighbour search over the code.
    """
    #print('')
    for bits in range(2, 9):
        #print('='*30, bits, '='*30)
        for method in ['linear', 'fp8', 'dynamic', 'quantile']:
            abserrs = []
            relerrs = []
            code = None
            if method == 'linear':
                code = F.create_linear_map(True, total_bits=bits).cuda()
            elif method == 'fp8':
                # Split the bits roughly evenly; one bit goes to the sign.
                ebits = math.ceil(bits/2)
                pbits = bits-ebits-1
                code = F.create_fp8_map(True, ebits, pbits, bits).cuda()
            elif method == 'dynamic':
                code = F.create_dynamic_map(True, bits-0, bits).cuda()
            elif method == 'quantile':
                # Quantile map is data-driven, estimated from random normals.
                values = torch.randn(2048, 2048, device='cuda')
                code = F.create_quantile_map(values, bits).cuda()
            # for some data types we have no zero
            # for some data types we have one zero
            # for some data types we have two zeros
            assert torch.unique(code).numel() in [2**bits, 2**bits-1], f'bits: {bits}, method: {method}'
            #print(method, (code==0).sum())
            # Code maps are always materialized as 256 entries regardless of bits.
            assert code.numel() == 256
            for i in range(10):
                # Normalize to [-1, 1] so values fall inside the code's range.
                values = torch.randn(1, 32, device='cuda')
                values /= values.abs().max()
                #values[values.abs() < 1e-6] += 1e-5
                # Reference encoding: nearest code entry per element, by argmin.
                q1 = []
                v1 = []
                for v in values[0]:
                    idx = torch.abs(v-code).argmin()
                    q1.append(idx.item())
                    v1.append(code[idx].item())
                q1 = torch.Tensor(q1).cuda()
                v1 = torch.Tensor(v1).cuda()
                q2, S2 = F.quantize_blockwise(values, code=code)
                v2 = F.dequantize_blockwise(q2, S2)
                # idx marks elements where the kernel agrees with the reference.
                idx = torch.isclose(q1.int(), q2.int())
                err2 = torch.abs(v2-values)
                abserrs.append(err2.mean().item())
                relerrs.append((err2/(1e-10+values).abs()).mean().item())
                if idx.sum():
                    # some weird cases
                    # NOTE(review): when at least one index agrees, mismatches
                    # are tolerated (ties in argmin can go either way); only a
                    # total disagreement falls through to the hard assert below
                    # — confirm this asymmetry is intended.
                    err1 = torch.abs(v1-values).mean()
                    #assert err2.mean() <= err1
                else:
                    # No agreement at all: require exact index equality, which
                    # fails loudly with a diff of the two encodings.
                    torch.testing.assert_allclose(q1, q2)
            #print(method, 'abserr:', sum(abserrs)/len(abserrs), 'relerr:', sum(relerrs)/len(relerrs))
    #assert False
def test_kbit_quantile_estimation():
    """Compare F.estimate_quantiles against exact Gaussian quantiles.

    For standard-normal data, the estimated k-bit quantile code should track
    the analytic quantiles norm.ppf(p) at evenly spaced probabilities p, both
    for the full 2**bits grid and for the odd-sized 2**bits - 1 grid that
    uses estimate_quantiles' default offset.
    """
    for i in range(100):
        data = torch.randn(1024, 1024, device='cuda')
        for bits in range(2, 9):
            # Keep p strictly inside (0, 1) so norm.ppf stays finite.
            p = np.linspace(1.3e-4, 1 - 1.3e-4, 2**bits)
            val1 = torch.Tensor(norm.ppf(p)).cuda()
            val2 = F.estimate_quantiles(data, offset=0, num_quantiles=2**bits)
            err = torch.abs(val1 - val2).mean()
            assert err < 0.038

    for i in range(100):
        data = torch.randn(1024, 1024, device='cuda')
        for bits in range(2, 4):
            total_values = 2**bits - 1
            # Midpoints of total_values equal-probability buckets:
            # p = offset, 3*offset, ..., 1 - offset with offset = 1/(2*total_values).
            # (The original also built the same p by slicing a 2*total_values+1
            # linspace and then overwrote it — dead code, removed.)
            offset = 1 / (2 * total_values)
            p = np.linspace(offset, 1 - offset, total_values)
            val1 = torch.Tensor(norm.ppf(p)).cuda()
            val2 = F.estimate_quantiles(data, num_quantiles=2**bits - 1)
            err = torch.abs(val1 - val2).mean()
            assert err < 0.035
def test_bench_dequantization():
    """Time 100 blockwise dequantizations of a quantized 1024x1024 fp16 tensor."""
    src = torch.rand(1024, 1024, device='cuda').half()
    qa, SA = F.quantize_blockwise(src)
    # Theoretical lower bound in microseconds: 2 MiB moved at 672 GB/s —
    # TODO confirm the bandwidth figure matches the benchmark GPU.
    max_theoretical_mu = 1024 * 1024 * 2 / 1024**3 / 672 * 1000 * 1000
    #print(max_theoretical_mu)
    torch.cuda.synchronize()
    t0 = time.time()
    iteration = 0
    while iteration < 100:
        F.dequantize_blockwise(qa, SA, blocksize=2048)
        iteration += 1
    torch.cuda.synchronize()
    #print((time.time()-t0)/1e6)
...@@ -7,7 +7,7 @@ from torch import nn ...@@ -7,7 +7,7 @@ from torch import nn
import bitsandbytes as bnb import bitsandbytes as bnb
class MockArgs(object): class MockArgs:
def __init__(self, initial_data): def __init__(self, initial_data):
for key in initial_data: for key in initial_data:
setattr(self, key, initial_data[key]) setattr(self, key, initial_data[key])
...@@ -15,7 +15,7 @@ class MockArgs(object): ...@@ -15,7 +15,7 @@ class MockArgs(object):
class MLP8bit(torch.nn.Module): class MLP8bit(torch.nn.Module):
def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0): def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0):
super(MLP8bit, self).__init__() super().__init__()
self.fc1 = bnb.nn.Linear8bitLt( self.fc1 = bnb.nn.Linear8bitLt(
dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward, dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
threshold=threshold threshold=threshold
...@@ -289,7 +289,7 @@ class LinearFunction(torch.autograd.Function): ...@@ -289,7 +289,7 @@ class LinearFunction(torch.autograd.Function):
class Linear8bit(nn.Module): class Linear8bit(nn.Module):
def __init__(self, input_features, output_features, bias=True, args=None): def __init__(self, input_features, output_features, bias=True, args=None):
super(Linear8bit, self).__init__() super().__init__()
self.input_features = input_features self.input_features = input_features
self.output_features = output_features self.output_features = output_features
self.args = args self.args = args
...@@ -312,7 +312,7 @@ class Linear8bit(nn.Module): ...@@ -312,7 +312,7 @@ class Linear8bit(nn.Module):
threshold = [0.0, 3.0] threshold = [0.0, 3.0]
values = threshold values = threshold
names = ["threshold_{0}".format(vals) for vals in values] names = [f"threshold_{vals}" for vals in values]
@pytest.mark.parametrize("threshold", values, ids=names) @pytest.mark.parametrize("threshold", values, ids=names)
...@@ -378,7 +378,7 @@ def test_linear8bitlt_accumulated_gradient(): ...@@ -378,7 +378,7 @@ def test_linear8bitlt_accumulated_gradient():
threshold = [0.0, 2.0] threshold = [0.0, 2.0]
values = threshold values = threshold
names = ["threshold_{0}".format(vals) for vals in values] names = [f"threshold_{vals}" for vals in values]
@pytest.mark.parametrize("threshold", values, ids=names) @pytest.mark.parametrize("threshold", values, ids=names)
......
...@@ -18,7 +18,7 @@ k = 20 ...@@ -18,7 +18,7 @@ k = 20
def get_temp_dir(): def get_temp_dir():
path = "/tmp/autoswap/{0}".format(str(uuid.uuid4())) path = f"/tmp/autoswap/{str(uuid.uuid4())}"
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
return path return path
...@@ -116,7 +116,7 @@ gtype = [torch.float32, torch.float16] ...@@ -116,7 +116,7 @@ gtype = [torch.float32, torch.float16]
optimizer_names = ["adam", "momentum", "rmsprop", "lars"] optimizer_names = ["adam", "momentum", "rmsprop", "lars"]
values = list(product(dim1, dim2, gtype, optimizer_names)) values = list(product(dim1, dim2, gtype, optimizer_names))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
] ]
...@@ -187,7 +187,7 @@ dim1 = [1024] ...@@ -187,7 +187,7 @@ dim1 = [1024]
dim2 = [32, 1024, 4097] dim2 = [32, 1024, 4097]
gtype = [torch.float32, torch.float16] gtype = [torch.float32, torch.float16]
values = list(product(dim1, dim2, gtype)) values = list(product(dim1, dim2, gtype))
names = ["dim1_{0}_dim2_{1}_gtype_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names) @pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names)
...@@ -250,7 +250,7 @@ optimizer_names = [ ...@@ -250,7 +250,7 @@ optimizer_names = [
] ]
values = list(product(dim1, dim2, gtype, optimizer_names)) values = list(product(dim1, dim2, gtype, optimizer_names))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
] ]
...@@ -391,7 +391,7 @@ gtype = [torch.float32] ...@@ -391,7 +391,7 @@ gtype = [torch.float32]
optim_bits = [32, 8] optim_bits = [32, 8]
values = list(product(dim1, dim2, gtype, optim_bits)) values = list(product(dim1, dim2, gtype, optim_bits))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_bits_{3}".format(*vals) "dim1_{}_dim2_{}_gtype_{}_optim_bits_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -495,7 +495,7 @@ gtype = [torch.float32, torch.float16] ...@@ -495,7 +495,7 @@ gtype = [torch.float32, torch.float16]
optimizer_names = ["adam8bit_blockwise"] optimizer_names = ["adam8bit_blockwise"]
values = list(product(dim1, dim2, gtype, optimizer_names)) values = list(product(dim1, dim2, gtype, optimizer_names))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
] ]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment