Unverified Commit be5cecb8 authored by Tim Dettmers, committed by GitHub

Merge branch 'main' into main

parents 8724c990 f0ec93d0
@@ -12,13 +12,13 @@ import torch
import bitsandbytes.functional as F
-class MockArgs(object):
+class MockArgs:
def __init__(self, initial_data):
for key in initial_data:
setattr(self, key, initial_data[key])
-class GlobalOptimManager(object):
+class GlobalOptimManager:
_instance = None
def __init__(self):
@@ -56,9 +56,9 @@ class GlobalOptimManager(object):
"""
Overrides initial optimizer config for specific parameters.
-The key-values of the optimizer config for the input parameters are overidden
+The key-values of the optimizer config for the input parameters are overridden
This can be both, optimizer parameters like "betas", or "lr" or it can be
-8-bit specific paramters like "optim_bits", "percentile_clipping".
+8-bit specific parameters like "optim_bits", "percentile_clipping".
Parameters
----------
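For context, the per-parameter override described in this docstring is used roughly as follows. This sketch follows the bitsandbytes README of this release rather than the lines in this diff; names such as Adam8bit and register_parameters are taken from that README, not from the hunks shown here.

import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
mng = bnb.optim.GlobalOptimManager.get_instance()
# parameters must be registered before the optimizer is constructed
mng.register_parameters(model.parameters())
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)
# keep this particular parameter in 32-bit optimizer state instead of 8-bit
mng.override_config(model.weight, "optim_bits", 32)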
@@ -93,13 +93,12 @@ class GlobalOptimManager(object):
class Optimizer8bit(torch.optim.Optimizer):
def __init__(self, params, defaults, optim_bits=32):
-super(Optimizer8bit, self).__init__(params, defaults)
+super().__init__(params, defaults)
self.initialized = False
self.name2qmap = {}
self.mng = GlobalOptimManager.get_instance()
-self.non_castable_tensor_keys = set(
-[
+self.non_castable_tensor_keys = {
"qmap1",
"qmap2",
"max1",
@@ -112,8 +111,7 @@ class Optimizer8bit(torch.optim.Optimizer):
"absmax1",
"absmax2",
"unorm_vec",
-]
-)
+}
if optim_bits == 8:
self.fill_qmap()
@@ -123,7 +121,7 @@ class Optimizer8bit(torch.optim.Optimizer):
self.name2qmap["udynamic"] = F.create_dynamic_map(signed=False)
def __setstate__(self, state):
-super(Optimizer8bit, self).__setstate__(state)
+super().__setstate__(state)
def load_state_dict(self, state_dict):
r"""Loads the optimizer state.
@@ -155,8 +153,8 @@ class Optimizer8bit(torch.optim.Optimizer):
id_map = {
old_id: p
for old_id, p in zip(
-chain.from_iterable((g["params"] for g in saved_groups)),
-chain.from_iterable((g["params"] for g in groups)),
+chain.from_iterable(g["params"] for g in saved_groups),
+chain.from_iterable(g["params"] for g in groups),
)
}
@@ -284,11 +282,11 @@ class Optimizer8bit(torch.optim.Optimizer):
return config
def init_state(self, group, p, gindex, pindex):
-raise NotImplementedError(f"init_state method needs to be overidden")
+raise NotImplementedError("init_state method needs to be overridden")
def update_step(self, group, p, gindex, pindex):
raise NotImplementedError(
-f"The update_step method needs to be overidden"
+"The update_step method needs to be overridden"
)
@@ -310,9 +308,9 @@ class Optimizer2State(Optimizer8bit):
skip_zeros=False,
):
if not 0.0 <= lr:
-raise ValueError("Invalid learning rate: {}".format(lr))
+raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
-raise ValueError("Invalid epsilon value: {}".format(eps))
+raise ValueError(f"Invalid epsilon value: {eps}")
if isinstance(betas, str):
# format: '(beta1, beta2)'
betas = betas.replace("(", "").replace(")", "").strip().split(",")
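As a small aside on the string form accepted for betas here: the context line above only splits the string into its components; the conversion to floats happens outside the lines shown in this hunk, so that final step is an assumption in this tiny illustration.

betas = "(0.9, 0.995)"
betas = betas.replace("(", "").replace(")", "").strip().split(",")  # -> ['0.9', ' 0.995']
betas = [float(b) for b in betas]                                    # -> [0.9, 0.995]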
@@ -324,10 +322,10 @@ class Optimizer2State(Optimizer8bit):
)
if not 0.0 <= weight_decay:
raise ValueError(
-"Invalid weight_decay value: {}".format(weight_decay)
+f"Invalid weight_decay value: {weight_decay}"
)
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-super(Optimizer2State, self).__init__(params, defaults, optim_bits)
+super().__init__(params, defaults, optim_bits)
if args is None:
args = {}
@@ -542,9 +540,9 @@ class Optimizer1State(Optimizer8bit):
skip_zeros=False,
):
if not 0.0 <= lr:
-raise ValueError("Invalid learning rate: {}".format(lr))
+raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= eps:
-raise ValueError("Invalid epsilon value: {}".format(eps))
+raise ValueError(f"Invalid epsilon value: {eps}")
for i in range(len(betas)):
if not 0.0 <= betas[i] < 1.0:
raise ValueError(
@@ -552,10 +550,10 @@ class Optimizer1State(Optimizer8bit):
)
if not 0.0 <= weight_decay:
raise ValueError(
-"Invalid weight_decay value: {}".format(weight_decay)
+f"Invalid weight_decay value: {weight_decay}"
)
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-super(Optimizer1State, self).__init__(params, defaults, optim_bits)
+super().__init__(params, defaults, optim_bits)
if args is None:
args = {}
...
@@ -23,11 +23,11 @@ class RMSprop(Optimizer1State):
):
if alpha == 0:
raise NotImplementedError(
-f"RMSprop with alpha==0.0 is not supported!"
+"RMSprop with alpha==0.0 is not supported!"
)
if centered:
-raise NotImplementedError(f"Centered RMSprop is not supported!")
-super(RMSprop, self).__init__(
+raise NotImplementedError("Centered RMSprop is not supported!")
+super().__init__(
"rmsprop",
params,
lr,
@@ -59,11 +59,11 @@ class RMSprop8bit(Optimizer1State):
):
if alpha == 0:
raise NotImplementedError(
-f"RMSprop with alpha==0.0 is not supported!"
+"RMSprop with alpha==0.0 is not supported!"
)
if centered:
-raise NotImplementedError(f"Centered RMSprop is not supported!")
-super(RMSprop8bit, self).__init__(
+raise NotImplementedError("Centered RMSprop is not supported!")
+super().__init__(
"rmsprop",
params,
lr,
@@ -96,11 +96,11 @@ class RMSprop32bit(Optimizer1State):
if alpha == 0:
raise NotImplementedError(
-f"RMSprop with alpha==0.0 is not supported!"
+"RMSprop with alpha==0.0 is not supported!"
)
if centered:
-raise NotImplementedError(f"Centered RMSprop is not supported!")
-super(RMSprop32bit, self).__init__(
+raise NotImplementedError("Centered RMSprop is not supported!")
+super().__init__(
"rmsprop",
params,
lr,
...
@@ -21,8 +21,8 @@ class SGD(Optimizer1State):
block_wise=True,
):
if momentum == 0:
-raise NotImplementedError(f"SGD without momentum is not supported!")
-super(SGD, self).__init__(
+raise NotImplementedError("SGD without momentum is not supported!")
+super().__init__(
"momentum",
params,
lr,
@@ -52,8 +52,8 @@ class SGD8bit(Optimizer1State):
block_wise=True,
):
if momentum == 0:
-raise NotImplementedError(f"SGD without momentum is not supported!")
-super(SGD8bit, self).__init__(
+raise NotImplementedError("SGD without momentum is not supported!")
+super().__init__(
"momentum",
params,
lr,
@@ -83,8 +83,8 @@ class SGD32bit(Optimizer1State):
block_wise=True,
):
if momentum == 0:
-raise NotImplementedError(f"SGD without momentum is not supported!")
-super(SGD32bit, self).__init__(
+raise NotImplementedError("SGD without momentum is not supported!")
+super().__init__(
"momentum",
params,
lr,
...
@@ -428,16 +428,16 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c
}
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC>
-__launch_bounds__(TH, 4)
+//__launch_bounds__(TH, 4)
__global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n)
{
const int n_full = gridDim.x * BLOCK_SIZE;
int valid_items = 0;
const int base_idx = (blockIdx.x * BLOCK_SIZE);
-T vals[NUM];
-float rand_vals[NUM];
-unsigned char qvals[NUM];
+T vals[NUM_PER_TH];
+float rand_vals[NUM_PER_TH];
+unsigned char qvals[NUM_PER_TH];
//float local_abs_max = -FLT_MAX;
float local_abs_max = 0.0f;
int local_rand_idx = 0;
@@ -454,8 +454,8 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
__shared__ float smem_code[256];
__shared__ float smem_absmax_value[1];
-if(threadIdx.x < 256)
-smem_code[threadIdx.x] = code[threadIdx.x];
+for(int i = threadIdx.x; i < 256; i+=blockDim.x)
+smem_code[i] = code[i];
for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
{
@@ -510,15 +510,15 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
}
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH>
-__global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, T *out, const int n)
+__global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n)
{
const int n_full = gridDim.x * BLOCK_SIZE;
int valid_items = 0;
const int base_idx = (blockIdx.x * BLOCK_SIZE);
-T vals[NUM];
-unsigned char qvals[NUM];
+T vals[NUM_PER_TH];
+unsigned char qvals[NUM_PER_TH];
float local_abs_max = -FLT_MAX;
typedef cub::BlockLoad<unsigned char, THREADS, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
@@ -526,10 +526,11 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ c
__shared__ typename LoadChar::TempStorage loadchar;
__shared__ typename StoreT::TempStorage storet;
-__shared__ float smem_code[256];
-if(threadIdx.x < 256)
-smem_code[threadIdx.x] = code[threadIdx.x];
+//__shared__ float smem_code[256];
+//float local_code[16];
+//if(threadIdx.x < 256)
+//smem_code[threadIdx.x] = code[threadIdx.x];
for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
{
@@ -539,9 +540,10 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ c
__syncthreads();
LoadChar(loadchar).Load(&(A[i]), qvals, valid_items, 128);
+// load code through read-only cache via __ldg
#pragma unroll NUM_PER_TH
for(int j = 0; j < NUM_PER_TH; j++)
-vals[j] = smem_code[qvals[j]]*local_abs_max;
+vals[j] = __ldg(&code[qvals[j]])*local_abs_max;
__syncthreads();
StoreT(storet).Store(&(out[i]), vals, valid_items);
@@ -2791,11 +2793,33 @@ template __global__ void kQuantizeBlockwise<half, 4096, 4, 0>(float * code, half
template __global__ void kQuantizeBlockwise<float, 4096, 4, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template __global__ void kQuantizeBlockwise<half, 4096, 4, 1>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template __global__ void kQuantizeBlockwise<float, 4096, 4, 1>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
-template __global__ void kDequantizeBlockwise<half, 4096, 1024, 4>(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, half *out, const int n);
-template __global__ void kDequantizeBlockwise<float, 4096, 1024, 4>(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, float *out, const int n);
-template __global__ void kDequantizeBlockwise<half, 2048, 512, 4>(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, half *out, const int n);
-template __global__ void kDequantizeBlockwise<float, 2048, 512, 4>(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, float *out, const int n);
+template __global__ void kQuantizeBlockwise<half, 2048, 4, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 2048, 4, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 1024, 4, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 1024, 4, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 512, 2, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 512, 2, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 256, 2, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 256, 2, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 128, 2, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 128, 2, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 64, 1, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 64, 1, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kDequantizeBlockwise<half, 4096, 1024, 4>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 4096, 1024, 4>(float *code, unsigned char * A, float * absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 2048, 512, 4>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 2048, 512, 4>(float *code, unsigned char * A, float * absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 1024, 256, 4>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 1024, 256, 4>(float *code, unsigned char * A, float * absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 512, 256, 2>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 512, 256, 2>(float *code, unsigned char * A, float * absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 256, 128, 2>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 256, 128, 2>(float *code, unsigned char * A, float * absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 128, 64, 2>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 128, 64, 2>(float *code, unsigned char * A, float * absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 64, 64, 1>(float *code, unsigned char * A, float * absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 64, 64, 1>(float *code, unsigned char * A, float * absmax, float *out, const int n);
...
@@ -15,7 +15,7 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
-template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, T *out, const int n);
+template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
@@ -121,5 +121,3 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
#endif
@@ -50,11 +50,29 @@ void dequantize(float *code, unsigned char *A, float *out, int n)
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
-template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)
+template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n)
{
-int num_blocks = n/4096;
-num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
+int num_blocks = n/blocksize;
+num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
+if(STOCHASTIC == 1)
+assert(blocksize == 4096);
+if(blocksize == 4096)
kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<num_blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
+else if(blocksize == 2048)
+kQuantizeBlockwise<T, 2048, 4, 0><<<num_blocks, 512>>>(code, A, absmax, out, rand, rand_offset, n);
+else if(blocksize == 1024)
+kQuantizeBlockwise<T, 1024, 4, 0><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
+else if(blocksize == 512)
+kQuantizeBlockwise<T, 512, 2, 0><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
+else if(blocksize == 256)
+kQuantizeBlockwise<T, 256, 2, 0><<<num_blocks, 128>>>(code, A, absmax, out, rand, rand_offset, n);
+else if(blocksize == 128)
+kQuantizeBlockwise<T, 128, 2, 0><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+else if(blocksize == 64)
+kQuantizeBlockwise<T, 64, 1, 0><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
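The new dispatch above launches one CUDA block per quantization block and rounds the block count up, with stochastic rounding still restricted to a block size of 4096. A small Python illustration of that launch math (not part of the diff):

SUPPORTED_BLOCKSIZES = [4096, 2048, 1024, 512, 256, 128, 64]

def num_cuda_blocks(n: int, blocksize: int) -> int:
    assert blocksize in SUPPORTED_BLOCKSIZES
    return (n + blocksize - 1) // blocksize  # same as n/blocksize rounded up

assert num_cuda_blocks(1024 * 1024, 512) == 2048
assert num_cuda_blocks(1000, 64) == 16  # a partial final block still gets launched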
@@ -66,6 +84,17 @@ template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, flo
kDequantizeBlockwise<T, 4096, 1024, 4><<<num_blocks, 4096/4>>>(code, A, absmax, out, n);
else if(blocksize == 2048)
kDequantizeBlockwise<T, 2048, 512, 4><<<num_blocks, 2048/4>>>(code, A, absmax, out, n);
+else if(blocksize == 1024)
+kDequantizeBlockwise<T, 1024, 256, 4><<<num_blocks, 1024/4>>>(code, A, absmax, out, n);
+else if(blocksize == 512)
+kDequantizeBlockwise<T, 512, 256, 2><<<num_blocks, 512/2>>>(code, A, absmax, out, n);
+else if(blocksize == 256)
+kDequantizeBlockwise<T, 256, 128, 2><<<num_blocks, 256/2>>>(code, A, absmax, out, n);
+else if(blocksize == 128)
+kDequantizeBlockwise<T, 128, 64, 2><<<num_blocks, 128/2>>>(code, A, absmax, out, n);
+else if(blocksize == 64)
+kDequantizeBlockwise<T, 64, 64, 1><<<num_blocks, 64/1>>>(code, A, absmax, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
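One invariant worth noting in the launch configurations above: threads per block times elements per thread equals the block size being dequantized. A quick consistency check of the configurations visible in this hunk (illustration only):

# blocksize: (threads_per_block, elements_per_thread) as launched above
launch_config = {
    4096: (1024, 4),
    2048: (512, 4),
    1024: (256, 4),
    512: (256, 2),
    256: (128, 2),
    128: (64, 2),
    64: (64, 1),
}
for blocksize, (threads, per_thread) in launch_config.items():
    assert threads * per_thread == blocksize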
@@ -659,10 +688,10 @@ template void transformRowToFormat<COL_AMPERE, 1>(char * A, char *out, int rows,
template void estimateQuantiles(half *A, float *code, float offset, int n);
template void estimateQuantiles(float *A, float *code, float offset, int n);
-template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
-template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
-template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
-template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
+template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
+template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void dequantizeBlockwise<half>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
template void dequantizeBlockwise<float>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
...
@@ -128,7 +128,7 @@ template <typename T> void estimateQuantiles(T *A, float *code, float offset, in
void quantize(float *code, float *A, unsigned char *out, int n);
void dequantize(float *code, unsigned char *A, float *out, int n);
-template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
+template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
...
@@ -75,10 +75,10 @@ MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, 32)
void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
-void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<half, 0>(code, A, absmax, out, NULL, 0, n); }
-void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<float, 0>(code, A, absmax, out, NULL, 0, n); }
-void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<half, 1>(code, A, absmax, out, rand, rand_offset, n); }
-void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<float, 1>(code, A, absmax, out, rand, rand_offset, n); }
+void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<half, 0>(code, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise<float, 0>(code, A, absmax, out, NULL, 0, blocksize, n); }
+void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<half, 1>(code, A, absmax, out, rand, rand_offset, 4096, n); }
+void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<float, 1>(code, A, absmax, out, rand, rand_offset, 4096, n); }
void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half>(code, A, absmax, out, blocksize, n); } \
void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float>(code, A, absmax, out, blocksize, n); }
@@ -140,8 +140,8 @@ extern "C"
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
-void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, n); }
-void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, n); }
+void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
+void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); }
void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); }
@@ -290,4 +290,3 @@ extern "C"
void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
}
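Together with the Python-side changes exercised by the tests further down, these entry points allow a caller-chosen block size for the non-stochastic path. A minimal round-trip sketch through the functional API (assumes a CUDA device; it mirrors the calls in test_dynamic_blockwise_quantization below rather than adding anything new):

import torch
from bitsandbytes import functional as F

A = torch.randn(1024, 1024, device="cuda")
C, state = F.quantize_blockwise(A, blocksize=2048)            # uint8 codes plus per-block state
A_restored = F.dequantize_blockwise(C, state, blocksize=2048)
print((A - A_restored).abs().mean())                          # small mean absolute error expected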
@@ -76,6 +76,3 @@ if [[ -n "$CUDA_VERSION" ]]; then
else
echo ""
fi
@@ -18,7 +18,7 @@ def read(fname):
setup(
name=f"bitsandbytes",
-version=f"0.35.3",
+version=f"0.35.4",
author="Tim Dettmers",
author_email="dettmers@cs.washington.edu",
description="8-bit optimizers and matrix multiplication routines.",
@@ -26,9 +26,6 @@ setup(
keywords="gpu optimizers optimization 8-bit quantization compression",
url="https://github.com/TimDettmers/bitsandbytes",
packages=find_packages(),
-entry_points={
-"console_scripts": ["debug_cuda = bitsandbytes.debug_cli:cli"],
-},
package_data={"": libs},
long_description=read("README.md"),
long_description_content_type="text/markdown",
...
-from itertools import product, permutations
+from itertools import permutations, product
import pytest
import torch
@@ -27,7 +27,7 @@ str_values = list(
)
)
names = [
-"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_func_{4}_dtype_{5}_requires_grad_{6}_transpose_{7}".format(
+"dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(
*vals
)
for vals in str_values
@@ -286,7 +286,7 @@ str_values = list(
has_bias
)
)
-names = ["dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_func_{4}_dtype_{5}_requires_grad_{6}_transpose_{7}_decomp_{8}_has_fp16_weights_{9}_has_bias_{10}".format(*vals) for vals in str_values]
+names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values]
@pytest.mark.parametrize(
...
import os
-import pytest
-import bitsandbytes as bnb
from typing import List, NamedTuple
+import pytest
+import bitsandbytes as bnb
from bitsandbytes.cuda_setup import (
CUDA_RUNTIME_LIB,
-evaluate_cuda_setup,
determine_cuda_runtime_lib_path,
+evaluate_cuda_setup,
extract_candidate_paths,
)
...
@@ -6,12 +6,14 @@ from itertools import product
import einops
import pytest
import torch
+import numpy as np
import bitsandbytes as bnb
from bitsandbytes import functional as F
+from scipy.stats import norm
torch.set_printoptions(
-precision=4, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000
+precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000
)
k = 20
@@ -26,7 +28,7 @@ def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0):
class FFN(torch.nn.Module):
def __init__(self, input_features, hidden_size, bias=True):
-super(FFN, self).__init__()
+super().__init__()
self.fc1 = torch.nn.Linear(input_features, hidden_size, bias=bias)
self.fc2 = torch.nn.Linear(hidden_size, input_features, bias=bias)
@@ -40,7 +42,7 @@ class FFN(torch.nn.Module):
return x
-class Timer(object):
+class Timer:
def __init__(self):
self.starts = {}
self.ends = {}
@@ -67,7 +69,7 @@ class Timer(object):
self.ends.pop(name)
if print_ms and name in self.agg:
-print("{0} took: {1:.5f}s".format(name, self.agg[name] / 1000.0))
+print(f"{name} took: {self.agg[name] / 1000.0:.5f}s")
return self.agg[name]
@@ -149,30 +151,41 @@ def test_dynamic_quantization():
def test_dynamic_blockwise_quantization():
+#print('')
+for blocksize in [4096, 2048, 1024, 512]:
diffs = []
reldiffs = []
for i in range(100):
A1 = torch.randn(1024, 1024, device="cuda")
-C, S = F.quantize_blockwise(A1)
-A2 = F.dequantize_blockwise(C, S)
+C, S = F.quantize_blockwise(A1, blocksize=blocksize)
+A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
diff = torch.abs(A1 - A2)
reldiff = diff / torch.abs(A1 + 1e-8)
diffs.append(diff.mean().item())
reldiffs.append(reldiff.mean().item())
-assert diffs[-1] < 0.011
-# print(sum(diffs)/len(diffs))
-# print(sum(reldiffs)/len(reldiffs))
+abserr = sum(diffs)/len(diffs)
+relerr = sum(reldiffs)/len(reldiffs)
+assert abserr < 0.011
+assert relerr < 0.018
+#print('randn', blocksize, sum(diffs)/len(diffs))
+#print('randn', blocksize, sum(reldiffs)/len(reldiffs))
diffs = []
for i in range(100):
A1 = torch.rand(1024, 1024, device="cuda")
-C, S = F.quantize_blockwise(A1)
-A2 = F.dequantize_blockwise(C, S)
-diff = torch.abs(A1 - A2).mean().item()
-assert diff < 0.0033
-diffs.append(diff)
-torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0)
-# print(sum(diffs)/len(diffs))
+C, S = F.quantize_blockwise(A1, blocksize=blocksize)
+A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
+diff = torch.abs(A1 - A2)
+reldiff = diff / torch.abs(A1 + 1e-8)
+diffs.append(diff.mean().item())
+reldiffs.append(reldiff.mean().item())
+#torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0)
+abserr = sum(diffs)/len(diffs)
+relerr = sum(reldiffs)/len(reldiffs)
+assert abserr < 0.0035
+assert relerr < 0.015
+#print('rand', blocksize, sum(diffs)/len(diffs))
+#print('rand', blocksize, sum(reldiffs)/len(reldiffs))
def test_dynamic_blockwise_stochastic_quantization():
@@ -289,7 +302,7 @@ batched = [False, True]
values = list(product(dim1, dim2, methods, batched))
values_names = list(product(dim1, dim2, method_names, batched))
names = [
-"dim1_{0}_dim2_{1}_quant_{2}_batched_{3}".format(*vals)
+"dim1_{}_dim2_{}_quant_{}_batched_{}".format(*vals)
for vals in values_names
]
@@ -347,7 +360,7 @@ seq_dim = torch.randint(16, 256, size=(n,)).tolist()
transpose = [(False, False), (False, True), (True, False), (True, True)]
values = list(product(hidden_dim, batch_dim, transpose, seq_dim))
names = [
-"hidden_dim_{0}_batch_dim_{1},transpose_{2}_seq_dim_{3}".format(*vals)
+"hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}".format(*vals)
for vals in values
]
@@ -412,7 +425,7 @@ hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist()
batch_dim = torch.randint(2, 16, size=(n,)).tolist()
values = list(product(seq_dim, hidden_dim, batch_dim))
names = [
-"seq_dim{0}_hidden_dim{1}_batch_dim{2}".format(*vals) for vals in values
+"seq_dim{}_hidden_dim{}_batch_dim{}".format(*vals) for vals in values
]
@@ -444,7 +457,7 @@ batch_dim = torch.randint(2, 16, size=(n,)).tolist()
transpose = [False, True]
values = list(product(seq_dim, hidden_dim, batch_dim, transpose))
names = [
-"seq_dim={0}_hidden_dim={1}_batch_dim={2}_transpose{3}".format(*vals)
+"seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}".format(*vals)
for vals in values
]
@@ -529,7 +542,7 @@ dim4 = torch.randint(32, 256, size=(n,)).tolist()
transpose = [(False, False), (True, False), (False, True), (True, True)]
values = list(product(dim1, dim2, dim3, dim4, transpose))
names = [
-"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_transpose_{4}".format(*vals)
+"dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}".format(*vals)
for vals in values
]
@@ -567,7 +580,7 @@ dim1 = torch.randint(1, 64, size=(n,)).tolist()
dim2 = torch.randint(32, 128, size=(n,)).tolist()
dim3 = torch.randint(32, 256, size=(n,)).tolist()
values = list(product(dim1, dim2, dim3))
-names = ["dim1_{0}_dim2_{1}_dim3_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names)
@@ -596,7 +609,7 @@ transpose = [False]
dims = [2, 3]
values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose))
-names = ["dim1_{0}_dim2_{1}_dim3_{2}_dims_{3}_dtype_{4}_orderA_{5}_orderOut_{6}_transpose_{7}".format(*vals)for vals in values]
+names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values]
@pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names)
@@ -678,7 +691,7 @@ ldb = [0]
# ldb = list(range(256, 1*1024, 256))
values = list(product(dim1, dim2, dim3, dim4, dims, ldb))
names = [
-"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_dims_{4}_ldb_{5}".format(*vals)
+"dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}".format(*vals)
for vals in values
]
@@ -726,7 +739,7 @@ dims = (2,)
# ldb = list(range(256, 1*1024, 256))
values = list(product(dim1, dim2, dim3, dim4, dims))
names = [
-"dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_dims_{4}".format(*vals)
+"dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}".format(*vals)
for vals in values
]
@@ -784,7 +797,7 @@ values = [
# values = list(product(batch, seq, model, hidden))
names = [
-"batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
+"batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values
]
@@ -952,7 +965,7 @@ dims = (2,)
formatB = ["col_turing", "col_ampere"]
has_bias = [True, False]
values = list(product(dim1, dim4, dims, formatB, has_bias))
-names = ["dim1_{0}_dim4_{1}_dims_{2}_formatB_{3}_has_bias_{4}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names)
@@ -1002,7 +1015,7 @@ dim2 = [1 * 1024]
dims = (2,)
# ldb = list(range(256, 1*1024, 256))
values = list(product(dim1, dim2, dims))
-names = ["dim1_{0}_dim2_{1}_dims_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, dims", values, ids=names)
@@ -1058,7 +1071,7 @@ dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
values = list(product(dim1, dim2))
-names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2", values, ids=names)
@@ -1105,7 +1118,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
inner = torch.randint(1, 4 * 1024, size=(n,)).tolist()
values = list(zip(dim1, dim4, inner))
-names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
@@ -1149,7 +1162,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
inner = torch.randint(1, 4 * 1024, size=(n,)).tolist()
values = list(zip(dim1, dim4, inner))
-names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
@@ -1224,7 +1237,7 @@ inner = [12288 * 4, 4096 * 4]
dim4 = [12288, 4096]
values = list(zip(dim1, dim4, inner))
-names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
@@ -1290,7 +1303,7 @@ values = list(
product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)
)
names = [
-"dim1_{0}_dim2_{1}_dim3_{2}_dims_{3}_dtype_{4}_orderA_{5}_orderOut_{6}_{7}".format(
+"dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}".format(
*vals
)
for vals in values
@@ -1341,7 +1354,7 @@ a_order = ["col_turing"]
out_order = ["row"]
values = list(product(dim1, dim2, dtype, a_order, out_order))
names = [
-"dim1_{0}_dim2_{1}_dtype_{2}_orderA_{3}_orderOut_{4}".format(*vals)
+"dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}".format(*vals)
for vals in values
]
@@ -1367,7 +1380,7 @@ dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
# dim2 = [5]
values = list(product(dim1, dim2))
-names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2", values, ids=names)
@@ -1404,7 +1417,7 @@ dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist()
# dim2 = [11]
transposed_B = [False, True]
values = list(product(dim1, dim2, transposed_B))
-names = ["dim1_{0}_dim2_{1}_transposed_B_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names)
@@ -1485,7 +1498,7 @@ n = 2
dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist()
dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist()
values = list(product(dim1, dim2))
-names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2", values, ids=names)
@@ -1550,7 +1563,7 @@ dtype = [torch.float16]
out_function = ["zeros", "ones"]
values = list(product(dim1, dim2, dtype, out_function))
names = [
-"dim1_{0}_dim2_{1}_dtype_{2}_out_func_{3}".format(*vals) for vals in values
+"dim1_{}_dim2_{}_dtype_{}_out_func_{}".format(*vals) for vals in values
]
@@ -1616,17 +1629,6 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func):
# print(time.time() - t0)
-def test_layout():
-a1 = torch.rand(16, 64, device="cuda", dtype=torch.float16)
-a1 = torch.arange(16 * 64, device="cuda").reshape(16, 64).byte()
-a2, s2 = F.transform(a1, "col_turing")
-print(a2.shape)
-print(a1.flatten()[8 * 64 : 8 * 64 + 32])
-for i in range(4):
-print(a2.flatten()[i * 8 * 32 : i * 8 * 32 + 32], 0)
def test_coo2csr():
threshold = 1
A = torch.randn(128, 128).half().cuda()
@@ -1678,7 +1680,7 @@ dim2 = [2048]
# dim2 = [2]
dtype = [torch.int8]
values = list(product(dim1, dim2, dtype))
-names = ["dim1_{0}_dim2_{1}_dtype_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names)
@@ -1794,7 +1796,7 @@ values.append((batch_size, seqdim, 768, 4 * 768))
# values.append((batch_size, seqdim, 5140, 4*5140))
#values.append((batch_size, seqdim, 12288, 4*12288))
names = [
-"batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
+"batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values
]
@@ -2040,3 +2042,154 @@ def test_blockwise_cpu_large():
assert diffs[-1] < 0.011
# print(sum(diffs)/len(diffs))
# print(sum(reldiffs)/len(reldiffs))
def test_fp8_quant():
for e_bits in range(1, 7):
p_bits = 7-e_bits
code = F.create_fp8_map(True, e_bits, p_bits).cuda()
print(e_bits, p_bits)
abserr = []
relerr = []
for i in range(100):
A1 = torch.randn(1024, 1024, device="cuda")
C, SC = F.quantize_blockwise(A1, code=code)
A2 = F.dequantize_blockwise(C, SC)
diff = torch.abs(A1 - A2)
reldiff = diff/torch.abs(A1+1e-8)
abserr.append(diff.mean().item())
relerr.append(reldiff.mean().item())
#assert diff < 0.0075
#print(sum(abserr)/len(abserr))
#print(sum(relerr)/len(relerr))
abserr = []
relerr = []
for i in range(100):
A1 = torch.rand(1024, 1024, device="cuda")
C, SC = F.quantize_blockwise(A1, code=code)
A2 = F.dequantize_blockwise(C, SC)
diff = torch.abs(A1 - A2)
reldiff = diff/torch.abs(A1+1e-8)
abserr.append(diff.mean().item())
relerr.append(reldiff.mean().item())
#assert diff < 0.0075
#print(sum(abserr)/len(abserr))
#print(sum(relerr)/len(relerr))
abserr = []
relerr = []
for i in range(100):
A1 = torch.randn(1024, 1024, device="cuda")
C, SC = F.quantize_blockwise(A1)
A2 = F.dequantize_blockwise(C, SC)
diff = torch.abs(A1 - A2)
reldiff = diff/torch.abs(A1+1e-8)
abserr.append(diff.mean().item())
relerr.append(reldiff.mean().item())
#assert diff < 0.0075
#print(3, sum(abserr)/len(abserr))
#print(3, sum(relerr)/len(relerr))
def test_few_bit_quant():
#print('')
for bits in range(2, 9):
#print('='*30, bits, '='*30)
for method in ['linear', 'fp8', 'dynamic', 'quantile']:
abserrs = []
relerrs = []
code = None
if method == 'linear':
code = F.create_linear_map(True, total_bits=bits).cuda()
elif method == 'fp8':
ebits = math.ceil(bits/2)
pbits = bits-ebits-1
code = F.create_fp8_map(True, ebits, pbits, bits).cuda()
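# pbits = bits - ebits - 1, so sign + exponent + mantissa together use exactly bits bits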
elif method == 'dynamic':
code = F.create_dynamic_map(True, bits-0, bits).cuda()
elif method == 'quantile':
values = torch.randn(2048, 2048, device='cuda')
code = F.create_quantile_map(values, bits).cuda()
# for some data types we have no zero
# for some data types we have one zero
# for some data types we have two zeros
assert torch.unique(code).numel() in [2**bits, 2**bits-1], f'bits: {bits}, method: {method}'
#print(method, (code==0).sum())
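# every map is presumably padded to 256 entries so the k-bit codes still fit the 8-bit kernels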
assert code.numel() == 256
for i in range(10):
values = torch.randn(1, 32, device='cuda')
values /= values.abs().max()
#values[values.abs() < 1e-6] += 1e-5
q1 = []
v1 = []
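# reference result: brute-force nearest-codebook-value quantization of each element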
for v in values[0]:
idx = torch.abs(v-code).argmin()
q1.append(idx.item())
v1.append(code[idx].item())
q1 = torch.Tensor(q1).cuda()
v1 = torch.Tensor(v1).cuda()
q2, S2 = F.quantize_blockwise(values, code=code)
v2 = F.dequantize_blockwise(q2, S2)
idx = torch.isclose(q1.int(), q2.int())
err2 = torch.abs(v2-values)
abserrs.append(err2.mean().item())
relerrs.append((err2/(1e-10+values).abs()).mean().item())
if idx.sum():
# some weird cases
err1 = torch.abs(v1-values).mean()
#assert err2.mean() <= err1
else:
torch.testing.assert_allclose(q1, q2)
#print(method, 'abserr:', sum(abserrs)/len(abserrs), 'relerr:', sum(relerrs)/len(relerrs))
#assert False
def test_kbit_quantile_estimation():
for i in range(100):
data = torch.randn(1024, 1024, device='cuda')
for bits in range(2, 9):
p = np.linspace(1.3e-4, 1-1.3e-4, 2**bits)
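# exact standard-normal quantiles as ground truth; the small offset keeps p away from 0 and 1 where ppf is infinite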
val1 = torch.Tensor(norm.ppf(p)).cuda()
val2 = F.estimate_quantiles(data, offset=0, num_quantiles=2**bits)
err = torch.abs(val1-val2).mean()
assert err < 0.038
for i in range(100):
data = torch.randn(1024, 1024, device='cuda')
for bits in range(2, 4):
total_values = 2**bits-1
p = np.linspace(0, 1, 2*total_values+1)
idx = np.arange(1, 2*total_values+1, 2)
p = p[idx]
offset = 1/(2*total_values)
p = np.linspace(offset, 1-offset, total_values)
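# equivalent to the odd-indexed points above: probabilities at the midpoints of total_values equal-width bins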
val1 = torch.Tensor(norm.ppf(p)).cuda()
val2 = F.estimate_quantiles(data, num_quantiles=2**bits-1)
err = torch.abs(val1-val2).mean()
assert err < 0.035
def test_bench_dequantization():
a = torch.rand(1024, 1024, device='cuda').half()
qa, SA = F.quantize_blockwise(a)
max_theoretical_mu = 1024*1024*2/1024**3/672*1000*1000
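# rough lower bound in microseconds for moving 2 MiB through memory at ~672 GB/s (presumably the peak bandwidth of the benchmark GPU)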
#print(max_theoretical_mu)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
F.dequantize_blockwise(qa, SA, blocksize=2048)
torch.cuda.synchronize()
#print((time.time()-t0)/1e6)
...@@ -7,7 +7,7 @@ from torch import nn ...@@ -7,7 +7,7 @@ from torch import nn
import bitsandbytes as bnb import bitsandbytes as bnb
class MockArgs(object): class MockArgs:
def __init__(self, initial_data): def __init__(self, initial_data):
for key in initial_data: for key in initial_data:
setattr(self, key, initial_data[key]) setattr(self, key, initial_data[key])
...@@ -15,7 +15,7 @@ class MockArgs(object): ...@@ -15,7 +15,7 @@ class MockArgs(object):
class MLP8bit(torch.nn.Module): class MLP8bit(torch.nn.Module):
def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0): def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0):
super(MLP8bit, self).__init__() super().__init__()
self.fc1 = bnb.nn.Linear8bitLt( self.fc1 = bnb.nn.Linear8bitLt(
dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward, dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
threshold=threshold threshold=threshold
...@@ -289,7 +289,7 @@ class LinearFunction(torch.autograd.Function): ...@@ -289,7 +289,7 @@ class LinearFunction(torch.autograd.Function):
class Linear8bit(nn.Module): class Linear8bit(nn.Module):
def __init__(self, input_features, output_features, bias=True, args=None): def __init__(self, input_features, output_features, bias=True, args=None):
super(Linear8bit, self).__init__() super().__init__()
self.input_features = input_features self.input_features = input_features
self.output_features = output_features self.output_features = output_features
self.args = args self.args = args
...@@ -312,7 +312,7 @@ class Linear8bit(nn.Module): ...@@ -312,7 +312,7 @@ class Linear8bit(nn.Module):
threshold = [0.0, 3.0] threshold = [0.0, 3.0]
values = threshold values = threshold
names = ["threshold_{0}".format(vals) for vals in values] names = [f"threshold_{vals}" for vals in values]
@pytest.mark.parametrize("threshold", values, ids=names) @pytest.mark.parametrize("threshold", values, ids=names)
...@@ -378,7 +378,7 @@ def test_linear8bitlt_accumulated_gradient(): ...@@ -378,7 +378,7 @@ def test_linear8bitlt_accumulated_gradient():
threshold = [0.0, 2.0] threshold = [0.0, 2.0]
values = threshold values = threshold
names = ["threshold_{0}".format(vals) for vals in values] names = [f"threshold_{vals}" for vals in values]
@pytest.mark.parametrize("threshold", values, ids=names) @pytest.mark.parametrize("threshold", values, ids=names)
......
...@@ -18,7 +18,7 @@ k = 20 ...@@ -18,7 +18,7 @@ k = 20
def get_temp_dir(): def get_temp_dir():
path = "/tmp/autoswap/{0}".format(str(uuid.uuid4())) path = f"/tmp/autoswap/{str(uuid.uuid4())}"
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
return path return path
...@@ -116,7 +116,7 @@ gtype = [torch.float32, torch.float16] ...@@ -116,7 +116,7 @@ gtype = [torch.float32, torch.float16]
optimizer_names = ["adam", "momentum", "rmsprop", "lars"] optimizer_names = ["adam", "momentum", "rmsprop", "lars"]
values = list(product(dim1, dim2, gtype, optimizer_names)) values = list(product(dim1, dim2, gtype, optimizer_names))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
] ]
...@@ -187,7 +187,7 @@ dim1 = [1024] ...@@ -187,7 +187,7 @@ dim1 = [1024]
dim2 = [32, 1024, 4097] dim2 = [32, 1024, 4097]
gtype = [torch.float32, torch.float16] gtype = [torch.float32, torch.float16]
values = list(product(dim1, dim2, gtype)) values = list(product(dim1, dim2, gtype))
names = ["dim1_{0}_dim2_{1}_gtype_{2}".format(*vals) for vals in values] names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values]
@pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names) @pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names)
...@@ -250,7 +250,7 @@ optimizer_names = [ ...@@ -250,7 +250,7 @@ optimizer_names = [
] ]
values = list(product(dim1, dim2, gtype, optimizer_names)) values = list(product(dim1, dim2, gtype, optimizer_names))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
] ]
...@@ -391,7 +391,7 @@ gtype = [torch.float32] ...@@ -391,7 +391,7 @@ gtype = [torch.float32]
optim_bits = [32, 8] optim_bits = [32, 8]
values = list(product(dim1, dim2, gtype, optim_bits)) values = list(product(dim1, dim2, gtype, optim_bits))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_bits_{3}".format(*vals) "dim1_{}_dim2_{}_gtype_{}_optim_bits_{}".format(*vals)
for vals in values for vals in values
] ]
...@@ -495,7 +495,7 @@ gtype = [torch.float32, torch.float16] ...@@ -495,7 +495,7 @@ gtype = [torch.float32, torch.float16]
optimizer_names = ["adam8bit_blockwise"] optimizer_names = ["adam8bit_blockwise"]
values = list(product(dim1, dim2, gtype, optimizer_names)) values = list(product(dim1, dim2, gtype, optimizer_names))
names = [ names = [
"dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
] ]
......