Commit 74399248 authored by Tim Dettmers

Initial commit
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from bitsandbytes.optim.optimizer import Optimizer1State

class SGD(Optimizer1State):
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, optim_bits=32, args=None,
                 min_8bit_size=4096, percentile_clipping=100, block_wise=True):
        if momentum == 0:
            raise NotImplementedError('SGD without momentum is not supported!')
        super(SGD, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise)

class SGD8bit(Optimizer1State):
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, args=None,
                 min_8bit_size=4096, percentile_clipping=100, block_wise=True):
        if momentum == 0:
            raise NotImplementedError('SGD without momentum is not supported!')
        super(SGD8bit, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise)

class SGD32bit(Optimizer1State):
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, args=None,
                 min_8bit_size=4096, percentile_clipping=100, block_wise=True):
        if momentum == 0:
            raise NotImplementedError('SGD without momentum is not supported!')
        super(SGD32bit, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise)
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <float.h>
#include <ops.cuh>
#ifndef kernels
#define kernels
template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);
__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, T *out, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
float* state1, float* state2, float *unorm,
const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(T* g, T* p,
float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
float* state1, float *unorm,
const float beta1, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(T* g, T* p,
float* state1, float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1,
float *unorm,
const float beta1,
const float eps, const int step,
float* __restrict__ const quantiles1,
float* max1, float* new_max1,
const float weight_decay,
const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
const float *unorm, const float max_unorm, const float param_norm,
const float beta1,
const float eps, const int step, const float lr,
float* __restrict__ const quantiles1,
float* max1, float* new_max1,
float weight_decay, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1, unsigned char* __restrict__ const state2,
float *unorm,
const float beta1, const float beta2,
const float eps, const int step,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
const float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2,
const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
const float beta1, const float beta2, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
T* p, T* __restrict__ const g, unsigned char* state1,
const float beta1, const float beta2,
const float eps, const int step, const float lr,
float* __restrict__ const quantiles1,
float* absmax1,
float weight_decay,
const float gnorm_scale, const int n);
template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);
__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);
#endif
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <ops.cuh>
#include <kernels.cuh>
#include <cub/device/device_scan.cuh>
#include <pthread.h>
#include <limits>
#include <BinSearch.h>
using namespace BinSearch;
using std::cout;
using std::endl;
#define BLOCK_SIZE 4096
struct quantize_block_args
{
BinAlgo<Scalar, float, Direct2> *bin_searcher;
float *code;
float *A;
float *absmax;
unsigned char *out;
int block_end;
int block_idx;
int threadidx;
};
void *quantize_block(void *arguments)
{
// 1. find absmax in block
// 2. divide input value by absmax to normalize into [-1.0, 1.0]
// 3. do binary search to find the closest value
// 4. check minimal distance
// 5. store index
struct quantize_block_args *args = (quantize_block_args*)arguments;
// 1. find absmax in block
float absmax_block = -FLT_MAX;
for (int i = args->block_idx; i < args->block_end; i++)
absmax_block = fmax(absmax_block, fabs(args->A[i]));
args->absmax[args->block_idx/BLOCK_SIZE] = absmax_block;
for (int i = args->block_idx; i < args->block_end; i++)
{
// 2. divide input value by absmax to normalize into [-1.0, 1.0]
// 3. do binary search to find the closest value
float normed_value = args->A[i]/absmax_block;
int idx = args->bin_searcher->scalar(normed_value);
// 4. check minimal distance
// The binary search always returns the value to the left, which might not be the closest value
if(idx < 255)
{
float dist_left = fabs(normed_value-(args->code[idx]));
float dist_right = fabs(normed_value-(args->code[idx+1]));
if(dist_right < dist_left){ idx+=1; }
}
// 5. store index
args->out[i] = (unsigned char)idx;
}
return NULL;
}
void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n)
{
// the default code has range [-0.993, 1.0], which can cause an error in the binary search algorithm used below
code[0] = -1.0f;
int num_blocks = n/BLOCK_SIZE;
num_blocks += n % BLOCK_SIZE == 0 ? 0 : 1;
pthread_t *threads = (pthread_t*)malloc(sizeof(pthread_t)*num_blocks);
struct quantize_block_args **args = (quantize_block_args**)malloc(num_blocks*sizeof(quantize_block_args*));
for(int i = 0; i < num_blocks; i++)
args[i] = (quantize_block_args*)malloc(sizeof(quantize_block_args));
const uint32 elements_code = 256;
BinAlgo<Scalar, float, Direct2> bin_searcher(code, elements_code);
for(int block_idx = 0; block_idx < n; block_idx+=BLOCK_SIZE)
{
int valid_items = n-block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
int block_end = block_idx + valid_items;
struct quantize_block_args *arg = args[block_idx/BLOCK_SIZE];
arg->bin_searcher = &bin_searcher;
arg->code = code;
arg->A = A;
arg->absmax = absmax;
arg->out = out;
arg->block_end = block_end;
arg->block_idx = block_idx;
arg->threadidx = block_idx/BLOCK_SIZE;
pthread_create(&threads[block_idx/BLOCK_SIZE], NULL, &quantize_block, (void *)arg);
}
for(int i = 0; i < num_blocks; i++)
pthread_join(threads[i], NULL);
free(threads);
for(int i = 0; i < num_blocks; i++)
free(args[i]);
free(args);
}
void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n)
{
for(int block_idx = 0; block_idx < n; block_idx+=BLOCK_SIZE)
{
int valid_items = n-block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
int block_end = block_idx + valid_items;
for (int i = block_idx; i < block_end; i++)
out[i] = code[A[i]]*absmax[block_idx/BLOCK_SIZE];
}
}
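To make the five steps above concrete, here is a NumPy sketch of the same blockwise round trip (a simplified model of quantize_cpu/dequantize_cpu, assuming a sorted 256-entry code and substituting argmin for the binary search plus distance check):

import numpy as np

def quantize_blockwise_ref(code, A, block_size=4096):
    out = np.empty(A.size, dtype=np.uint8)
    absmax = np.empty((A.size + block_size - 1) // block_size, dtype=np.float32)
    for b, start in enumerate(range(0, A.size, block_size)):
        block = A[start:start + block_size]
        absmax[b] = np.abs(block).max()                  # 1. absmax per block
        normed = block / absmax[b]                       # 2. normalize into [-1, 1]
        # 3.-5. nearest code value (argmin replaces the binary search + distance check)
        out[start:start + block_size] = np.abs(code[None, :] - normed[:, None]).argmin(axis=1)
    return out, absmax

def dequantize_blockwise_ref(code, out, absmax, block_size=4096):
    return code[out] * np.repeat(absmax, block_size)[:out.size]

With A = np.random.randn(8192).astype(np.float32) and any sorted 256-entry code with code[0] == -1.0, dequantize_blockwise_ref(code, *quantize_blockwise_ref(code, A)) recovers A to within one code step per block.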
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
{
int threads = 512;
int blocks = n/threads;
blocks = n % threads == 0 ? blocks : blocks + 1;
kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
void quantize(float *code, float *A, unsigned char *out, int n)
{
int blocks = n/1024;
blocks = n % 1024 == 0 ? blocks : blocks + 1;
kQuantize<<<blocks, 1024>>>(code, A, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
void dequantize(float *code, unsigned char *A, float *out, int n)
{
int blocks = n/1024;
blocks = n % 1024 == 0 ? blocks : blocks + 1;
kDequantize<<<blocks, 1024>>>(code, A, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n)
{
int blocks = n/blocksize;
blocks = n % blocksize == 0 ? blocks : blocks + 1;
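// Note: only block sizes 4096 and 2048 have kernel instantiations below;
// any other blocksize falls through without launching a kernel.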
if(blocksize == 4096)
kDequantizeBlockwise<T, 4096, 1024, 4><<<blocks, 4096/4>>>(code, A, absmax, out, n);
else if(blocksize == 2048)
kDequantizeBlockwise<T, 2048, 512, 4><<<blocks, 2048/4>>>(code, A, absmax, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
switch(OPTIMIZER)
{
case ADAM:
if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float)));
kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
kOptimizer32bit2State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
case MOMENTUM:
case RMSPROP:
if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float)));
kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, unorm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
kOptimizer32bit1State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, unorm, max_unorm, param_norm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
}
}
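For reference, the ADAM branch above drives the standard two-state Adam recurrence. A NumPy sketch of that update follows; kernel-level details such as the max_unorm update clipping and gnorm_scale rescaling are omitted, so treat this as a conceptual reference rather than a port of the kernels:

import numpy as np

def adam_step_ref(p, g, m, v, step, lr, beta1=0.9, beta2=0.999, eps=1e-8):
    m[:] = beta1 * m + (1 - beta1) * g          # state1: first moment
    v[:] = beta2 * v + (1 - beta2) * g * g      # state2: second moment
    m_hat = m / (1 - beta1 ** step)             # bias correction
    v_hat = v / (1 - beta2 ** step)
    p -= lr * m_hat / (np.sqrt(v_hat) + eps)    # parameter update
    return p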
template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
unsigned char* state1, unsigned char* state2,
float *unorm, float max_unorm, float param_norm,
float beta1, float beta2,
float eps, int step, float lr,
float* quantiles1, float* quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay,
const float gnorm_scale, int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
if(max_unorm > 0.0f){ CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float))); }
switch(OPTIMIZER)
{
case ADAM:
CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1*sizeof(float)));
CUDA_CHECK_RETURN(cudaMemset(new_max2, 0, 1*sizeof(float)));
kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
kOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
case MOMENTUM:
case RMSPROP:
CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1*sizeof(float)));
kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, unorm, beta1, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
kOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, unorm, max_unorm, param_norm, beta1, eps, step, lr,
quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
default:
break;
}
}
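The 8-bit path keeps each optimizer state as uint8 indices into a 256-entry quantile code, rescaled by a running absolute maximum (max1/max2, re-estimated into new_max1/new_max2 by the precondition kernels). A conceptual NumPy sketch of that dequantize-update-requantize cycle for a single momentum-style state; the names and the argmin-based requantization are illustrative, not a port of the kernels:

import numpy as np

def update_8bit_state(idx, g, quantiles, max_old, beta1):
    s = quantiles[idx] * max_old               # dequantize the stored state
    s = beta1 * s + g                          # e.g. momentum accumulation
    new_max = np.abs(s).max()                  # scale for the next step
    normed = s / new_max
    idx_new = np.abs(quantiles[None, :] - normed[:, None]).argmin(axis=1)
    return idx_new.astype(np.uint8), new_max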
#define BLOCKSIZE_2STATE 2048
#define NUM_2STATE 8
#define BLOCKSIZE_1STATE 2048
#define NUM_1STATE 8
template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n)
{
int blocks = 0;
switch(OPTIMIZER)
{
case ADAM:
blocks = n/BLOCKSIZE_2STATE;
blocks = n % BLOCKSIZE_2STATE == 0 ? blocks : blocks + 1;
kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE><<<blocks, BLOCKSIZE_2STATE/NUM_2STATE>>>(p, g, state1, state2, beta1, beta2, eps, step, lr,
quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
case MOMENTUM:
case RMSPROP:
blocks = n/BLOCKSIZE_1STATE;
blocks = n % BLOCKSIZE_1STATE == 0 ? blocks : blocks + 1;
kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE><<<blocks, BLOCKSIZE_1STATE/NUM_1STATE>>>(p, g, state1, beta1, beta2, eps, step, lr,
quantiles1, absmax1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
}
}
template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n)
{
int blocks = n/2048;
blocks = n % 2048 == 0 ? blocks : blocks + 1;
CUDA_CHECK_RETURN(cudaMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
kPercentileClipping<T, 2048, 4><<<blocks, 512>>>(g, gnorm_vec, step, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
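percentileClipping records gradient norms into a 100-slot ring buffer (gnorm_vec[step % 100]). The gnorm_scale passed to the optimizer launchers can then be derived from this history; one plausible host-side rule, shown only as an illustration since the actual scaling logic lives outside this file:

import numpy as np

def compute_gnorm_scale(gnorm_vec, current_gnorm, percentile=100):
    history = gnorm_vec[gnorm_vec > 0]          # assumes at least one recorded norm
    clip_at = np.percentile(history, percentile)
    # shrink the update when the current norm spikes above the chosen percentile
    return min(1.0, float(clip_at) / max(current_gnorm, 1e-12))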
//==============================================================
// TEMPLATE DEFINITIONS
//==============================================================
template void estimateQuantiles(half *A, float *code, float offset, int n);
template void estimateQuantiles(float *A, float *code, float offset, int n);
template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void dequantizeBlockwise<half>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
template void dequantizeBlockwise<float>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
#define MAKE_optimizer32bit(name, gtype) \
template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
float* state1, float* state2, float* unorm, float max_unorm, float param_norm, \
const float beta1, const float beta2, const float eps, const float weight_decay, \
const int step, const float lr, const float gnorm_scale, const int n);
MAKE_optimizer32bit(ADAM, half)
MAKE_optimizer32bit(ADAM, float)
MAKE_optimizer32bit(MOMENTUM, half)
MAKE_optimizer32bit(MOMENTUM, float)
MAKE_optimizer32bit(RMSPROP, half)
MAKE_optimizer32bit(RMSPROP, float)
#define MAKE_optimizerStatic8bit(name, gtype) \
template void optimizerStatic8bit<gtype, name>(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \
float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, \
const float gnorm_scale, int n);
MAKE_optimizerStatic8bit(ADAM, half)
MAKE_optimizerStatic8bit(ADAM, float)
MAKE_optimizerStatic8bit(MOMENTUM, half)
MAKE_optimizerStatic8bit(MOMENTUM, float)
MAKE_optimizerStatic8bit(RMSPROP, half)
MAKE_optimizerStatic8bit(RMSPROP, float)
#define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name) \
template void optimizerStatic8bitBlockwise<gtype, optim_name>(gtype* p, gtype* g, \
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n);
MAKE_optimizerStatic8bitBlockwise(half, ADAM);
MAKE_optimizerStatic8bitBlockwise(float, ADAM);
MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ops_H
#define ops_H
#include <stdio.h>
#include <iostream>
#include <unistd.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#define CUDA_CHECK_RETURN(value) { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
fprintf(stderr, "Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(1); \
} }
#define THREADS_PER_BLOCKS (512)
typedef enum Operations_t
{
ksmul = 0,
} Operations_t;
typedef enum Optimizer_t
{
ADAM = 0,
MOMENTUM = 1,
RMSPROP = 2,
LARS = 3,
} Optimizer_t;
template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n);
void quantize(float *code, float *A, unsigned char *out, int n);
void dequantize(float *code, unsigned char *A, float *out, int n);
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
float beta1, float beta2, float eps, float weight_decay,
int step, float lr, const float gnorm_scale, int n);
template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
float *unorm, float max_unorm, float param_norm,
float beta1, float beta2,
float eps, int step, float lr,
float* quantiles1, float* quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay,
const float gnorm_scale, int n);
template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n);
template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);
void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n);
void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n);
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);
#endif
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <ops.cuh>
// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
// We use macro functions to expand all the different optimizers. It looks ugly, and is ugly, but it's better
// than maintaining all that boilerplate by hand.
//===================================================================================
// UNMANGLED CALLS
//===================================================================================
void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); }
void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); }
#define MAKE_FUNC32(fname, oname, gtype, gbits) \
void fname##32bit_g##gbits(gtype *g, gtype *p, \
float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
const float beta1, const float beta2, const float eps, const float weight_decay, \
const int step, const float lr, float gnorm_scale, const int n) \
{ optimizer32bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n); }
MAKE_FUNC32(momentum, MOMENTUM, float, 32)
MAKE_FUNC32(momentum, MOMENTUM, half, 16)
MAKE_FUNC32(adam, ADAM, float, 32)
MAKE_FUNC32(adam, ADAM, half, 16)
MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
#define MAKE_FUNC8(fname, oname, gtype, gbits) \
void fname##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \
float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, float gnorm_scale, int n) \
{ \
optimizerStatic8bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
}
MAKE_FUNC8(adam, ADAM, float, 32)
MAKE_FUNC8(adam, ADAM, half, 16)
MAKE_FUNC8(momentum, MOMENTUM, float, 32)
MAKE_FUNC8(momentum, MOMENTUM, half, 16)
MAKE_FUNC8(rmsprop, RMSPROP, float, 32)
MAKE_FUNC8(rmsprop, RMSPROP, half, 16)
#define MAKE_BLOCKWISE8(fname, optim_name, gtype, gbits) \
void fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n)\
{ optimizerStatic8bitBlockwise<gtype, optim_name>(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, n); }
MAKE_BLOCKWISE8(adam, ADAM, half, 16)
MAKE_BLOCKWISE8(adam, ADAM, float, 32)
MAKE_BLOCKWISE8(momentum, MOMENTUM, half, 16)
MAKE_BLOCKWISE8(momentum, MOMENTUM, float, 32)
MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, 16)
MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, 32)
void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<half, 0>(code, A, absmax, out, NULL, 0, n); }
void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<float, 0>(code, A, absmax, out, NULL, 0, n); }
void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<half, 1>(code, A, absmax, out, rand, rand_offset, n); }
void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<float, 1>(code, A, absmax, out, rand, rand_offset, n); }
void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half>(code, A, absmax, out, blocksize, n); }
void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float>(code, A, absmax, out, blocksize, n); }
extern "C"
{
void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, n); }
void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, n); }
void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); }
void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); }
void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
#define MAKE_CFUNC32(name, gtype, gbits) \
void c##name##32bit_g##gbits(gtype *g, gtype *p, \
float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
const float beta1, const float beta2, const float eps, const float weight_decay, \
const int step, const float lr, const float gnorm_scale, const int n) \
{ name##32bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n); }
MAKE_CFUNC32(adam, float, 32)
MAKE_CFUNC32(adam, half, 16)
MAKE_CFUNC32(momentum, float, 32)
MAKE_CFUNC32(momentum, half, 16)
MAKE_CFUNC32(rmsprop, float, 32)
MAKE_CFUNC32(rmsprop, half, 16)
#define MAKE_CFUNC8(name, gtype, gbits) \
void c##name##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \
float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, float gnorm_scale, int n) \
{ \
name##_static_8bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
}
MAKE_CFUNC8(adam, float, 32)
MAKE_CFUNC8(adam, half, 16)
MAKE_CFUNC8(momentum, float, 32)
MAKE_CFUNC8(momentum, half, 16)
MAKE_CFUNC8(rmsprop, float, 32)
MAKE_CFUNC8(rmsprop, half, 16)
#define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \
void c##fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n) \
{ fname##_8bit_blockwise_fp##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, n); }
MAKE_CBLOCKWISE8(adam, ADAM, half, 16)
MAKE_CBLOCKWISE8(adam, ADAM, float, 32)
MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, 16)
MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, 32)
MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, 16)
MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, 32)
void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, const int n){ quantize_cpu(code, A, absmax, out, n); }
void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, const int n){ dequantize_cpu(code, A, absmax, out, n); }
void chistogram_scatter_add_2d(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n){ histogramScatterAdd2D(histogram, index1, index2, src, maxidx1, n); }
}
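As a usage sketch, the extern "C" entry points above can be driven from Python via ctypes. The shared-library filename below is an assumption for illustration:

import ctypes
import numpy as np

lib = ctypes.CDLL('libbitsandbytes.so')   # assumed build artifact name

def quantize_blockwise_cpu(code, A, block_size=4096):
    n = A.size
    absmax = np.empty((n + block_size - 1) // block_size, dtype=np.float32)
    out = np.empty(n, dtype=np.uint8)
    fptr = lambda a: a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    lib.cquantize_blockwise_cpu_fp32(fptr(code), fptr(A), fptr(absmax),
                                     out.ctypes.data_as(ctypes.POINTER(ctypes.c_ubyte)),
                                     ctypes.c_int(n))
    return out, absmax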
#!/bin/bash
rm -rf dist build
make clean
CUDA_HOME=/usr/local/cuda-10.2 make
CUDA_VERSION=102 python -m build
python -m twine upload --repository testpypi dist/* --verbose
rm -rf dist build
make clean
CUDA_HOME=/usr/local/cuda-11.1 make
CUDA_VERSION=111 python -m build
python -m twine upload --repository testpypi dist/* --verbose
#!/bin/bash
module unload cuda
module unload gcc
rm -rf dist build
make clean
make cleaneggs
module load cuda/9.2
module load gcc/7.3.0
CUDA_HOME=/public/apps/cuda/9.2 make
CUDA_VERSION=92 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/10.0
CUDA_HOME=/public/apps/cuda/10.0 make cuda10x
CUDA_VERSION=100 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
module unload gcc
module load gcc/8.4
rm -rf dist build
make clean
make cleaneggs
module load cuda/10.1
CUDA_HOME=/public/apps/cuda/10.1 make cuda10x
CUDA_VERSION=101 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/10.2
CUDA_HOME=/public/apps/cuda/10.2/ make cuda10x
CUDA_VERSION=102 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/11.0
CUDA_HOME=/public/apps/cuda/11.0 make cuda110
CUDA_VERSION=110 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/11.1
CUDA_HOME=/public/apps/cuda/11.1 make cuda11x
CUDA_VERSION=111 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/11.2
CUDA_HOME=/public/apps/cuda/11.2 make cuda11x
CUDA_VERSION=112 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
CUDA_HOME=/private/home/timdettmers/git/autoswap/local/cuda-11.3 make cuda11x
CUDA_VERSION=113 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
#pragma once
#include "Portable.h"
namespace BinSearch {
namespace Details {
template <typename T>
bool isAligned(const T *p, size_t A)
{
return (reinterpret_cast<size_t>(p) % A) == 0;
}
template <class T, size_t A=64>
struct AlignedVec
{
AlignedVec()
: m_storage(0)
, m_data(0)
, m_sz(0)
{
}
static size_t nBytes(size_t sz)
{
return sz * sizeof(T) + A;
}
static size_t shiftAmt(char *p)
{
return A>1? (A - (reinterpret_cast<size_t>(p) % A)) % A: 0;
}
void setPtr(char *p, size_t sz)
{
m_sz = sz;
m_data = reinterpret_cast<T *>(p + shiftAmt(p));
}
//void setPtr(T *p, size_t sz)
//{
// m_sz = sz;
// if (A>1)
// myassert(((reinterpret_cast<size_t>(p) % A) == 0), "bad alignment");
// m_data = p;
//}
// internal allocation
void resize(size_t sz)
{
m_storage = new char[nBytes(sz)];
setPtr(m_storage, sz);
}
// external allocation
void set(char *storage, size_t sz)
{
setPtr(storage, sz);
}
~AlignedVec()
{
if (m_storage)
delete [] m_storage;
}
size_t size() const { return m_sz; }
T& operator[](size_t i) { return m_data[i]; }
const T& operator[](size_t i) const { return m_data[i]; }
T* begin() { return m_data; }
T* end() { return m_data+m_sz; }
const T* begin() const { return m_data; }
const T* end() const { return m_data+m_sz; }
T& front() { return m_data[0]; }
T& back() { return m_data[m_sz-1]; }
const T& front() const { return m_data[0]; }
const T& back() const { return m_data[m_sz - 1]; }
private:
char *m_storage;
T *m_data;
size_t m_sz;
};
} // namespace Details
} // namespace BinSearch
#pragma once
#include <algorithm>
#include <limits>
#include <type_traits>
#include "AAlloc.h"
namespace BinSearch {
namespace Details {
namespace DirectAux {
#define SAFETY_MULTI_PASS true
template <typename T>
struct HResults
{
HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {}
T H;
double hRatio;
size_t nInc;
};
#ifdef USE_FMA
template <Algos A> struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); };
template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); };
template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); };
#else
template <Algos A> struct IsDirect { static const bool value = (A == Direct); };
template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2); };
template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache); };
#endif
// general definition
template <Algos A, typename T, typename Enable = void>
struct BucketElem
{
FORCE_INLINE void set( uint32 b, const T *)
{
m_b = b;
}
FORCE_INLINE uint32 index() const { return m_b; }
private:
uint32 m_b;
};
// specialization for DirectCache methods
template <typename T> struct MatchingIntType;
template <> struct MatchingIntType<double> { typedef uint64 type; };
template <> struct MatchingIntType<float> { typedef uint32 type; };
template <Algos A, typename T>
struct BucketElem<A, T, typename std::enable_if< IsDirectCache<A>::value >::type >
{
typedef typename MatchingIntType<T>::type I;
void set(uint32 b, const T *xi)
{
u.u.x = xi[b];
u.u.b = b;
}
FORCE_INLINE I index() const { return u.u.b; }
FORCE_INLINE T x() const { return u.u.x; }
private:
union {
double dummy;
struct
{
T x;
I b;
} u;
} u;
};
template <bool UseFMA, unsigned char Gap, typename T>
struct DirectTraits
{
static void checkH(T scaler, T x0, T xN)
{
T Dn = xN - x0;
T ifmax = Dn * scaler;
myassert((ifmax < std::numeric_limits<uint32>::max() - (Gap - 1)),
"Problem unfeasible: index size exceeds uint32 capacity:"
<< " D[N] =" << Dn
<< ", H =" << scaler
<< ", H D[n] =" << ifmax << "\n"
);
}
FORCE_INLINE static uint32 f(T scaler, T x0, T z)
{
T tmp = scaler * (z - x0);
#ifdef USE_SSE2
return ftoi(FVec1<SSE,T>(tmp));
#else
return static_cast<uint32>(tmp);
#endif
}
template <InstrSet I>
FORCE_INLINE static typename FTOITraits<I, T>::vec_t f(const FVec<I, T>& scaler, const FVec<I, T>& x0, const FVec<I, T>& z)
{
return ftoi(scaler*(z-x0));
}
static T cst0(T scaler, T x0)
{
return x0;
}
};
#ifdef USE_FMA
template <unsigned char Gap, typename T>
struct DirectTraits<true,Gap,T>
{
typedef FVec1<SSE, T> fVec1;
static void checkH(T scaler, T H_Times_x0, T xN)
{
union {
typename FVec1<SSE, T>::vec_t v;
T s;
} ifmax;
ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0));
myassert((ifmax.s < std::numeric_limits<uint32>::max() - (Gap - 1)),
"Problem unfeasible: index size exceeds uint32 capacity:"
<< " H X[0] =" << H_Times_x0
<< ", H =" << scaler
<< ", X[N] =" << xN
<< ", H X[N] - H X[0] =" << ifmax.s << "\n"
);
}
FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi)
{
return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0)));
}
template <InstrSet I>
FORCE_INLINE static typename FTOITraits<I,T>::vec_t f(const FVec<I,T>& scaler, const FVec<I, T>& H_Times_X0, const FVec<I, T>& z)
{
return ftoi(mulSub(scaler, z, H_Times_X0));
}
static T cst0(T scaler, T x0)
{
return scaler*x0;
}
};
#endif
template <unsigned char Gap, typename T, Algos A>
struct DirectInfo
{
static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA);
typedef DirectTraits<UseFMA, Gap, T> fun_t;
typedef BucketElem<A,T> bucket_t;
typedef AlignedVec<bucket_t> bucketvec_t;
struct Data {
Data() : buckets(0), xi(0), scaler(0), cst0(0) {}
Data( const T *x // for Direct must persist if xws=NULL
, uint32 n
, T H
, bucket_t *bws // assumed to have size nb, as computed below
, T *xws = NULL // assumed to have size (n+Gap-1). Optional for Direct, unused for DirectCache, required for DirectGap
)
: buckets(bws)
, scaler(H)
, cst0(fun_t::cst0(H, x[0]))
{
myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned");
uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]);
const uint32 npad = Gap-1;
const uint32 n_sz = n + npad; // size of padded vector
if (xws) {
myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned");
std::fill_n(xws, npad, x[0]); // pad in front with x[0]
std::copy(x, x+n, xws + npad);
xi = xws;
}
else {
myassert(Gap==1, "if Gap>1 then X workspace must be provided");
xi = x;
}
populateIndex(bws, nb, xi, n_sz, scaler, cst0);
}
const bucket_t *buckets;
const T *xi;
T scaler;
T cst0; // could be x0 or (scaler*x0), depending if we are using FMA or not
} data;
static T growStep(T H)
{
T step;
T P = next(H);
while ((step = P - H) == 0)
P = next(P);
return step;
}
static HResults<T> computeH(const T *px, uint32 nx)
{
myassert((nx > Gap), "Array X too small");
myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap");
const T x0 = px[0];
const T xN = px[nx-1];
const T range = xN - x0;
myassert((range < std::numeric_limits<T>::max()), "range too large");
// check that D_i are strictly increasing and compute minimum value D_{i+Offset}-D_i
T deltaDMin = range;
for (uint32 i = Gap; i < nx; ++i) {
T Dnew = px[i] - x0;
T Dold = px[i - Gap] - x0;
myassert((Dnew > Dold),
"Problem unfeasible: D_i sequence not strictly increasing"
<< " X[" << 0 << "]=" << x0
<< " X[" << i - Gap << "]=" << px[i - Gap]
<< " X[" << i << "]=" << px[i]
<< "\n"
);
T deltaD = Dnew - Dold;
if (deltaD < deltaDMin)
deltaDMin = deltaD;
}
// initial guess for H
const T H0 = T(1.0) / deltaDMin;
T H = H0;
T cst0 = fun_t::cst0(H, x0);
fun_t::checkH(H, cst0, xN);
// adjust H by trial and error until succeed
size_t nInc = 0;
bool modified = false;
size_t npasses = 0;
T step = growStep(H);
uint32 seg_already_checked_from = nx;
do {
myassert((npasses++ < 2), "verification failed\n");
// if there has been an increase, then check only up to that point
uint32 last_seg_to_be_checked = seg_already_checked_from - 1;
modified = false;
for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) {
uint32 iold = fun_t::f(H, cst0, px[i-Gap]);
uint32 inew = fun_t::f(H, cst0, px[i]);
while (inew == iold) {
seg_already_checked_from = i;
last_seg_to_be_checked = nx-1; // everything needs to be checked
modified = true;
H = H + step;
step *= 2;
// recalculate all constants and indices
cst0 = fun_t::cst0(H, x0);
fun_t::checkH(H, cst0, xN);
iold = fun_t::f(H, cst0, px[i - Gap]);
inew = fun_t::f(H, cst0, px[i]);
}
}
} while (SAFETY_MULTI_PASS && modified);
return HResults<T>(H, (((double)H) / H0) - 1.0, nInc);
}
static void populateIndex(BucketElem<A, T> *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0)
{
for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) {
uint32 idx = fun_t::f(scaler, cst0, px[i]);
while (b > idx) { // on the first iteration j is still 0, but this condition is then always false
buckets[b].set( j, px );
--b;
}
if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant
j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check
buckets[b].set(j, px);
if (b-- == 0)
break;
}
}
}
DirectInfo(const Data& d)
: data(d)
{
}
DirectInfo(const T* px, const uint32 n)
{
HResults<T> res = computeH(px, n);
#ifdef PAPER_TEST
nInc = res.nInc;
hRatio = res.hRatio;
#endif
const uint32 npad = Gap-1;
const uint32 n_sz = n + npad; // size of padded vector
if (npad)
xi.resize(n_sz);
T H = res.H;
T cst0 = fun_t::cst0(H, px[0]);
const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]);
buckets.resize(maxIndex + 1);
data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL));
}
private:
bucketvec_t buckets;
AlignedVec<T,8> xi;
#ifdef PAPER_TEST
public:
double hRatio;
size_t nInc;
#endif
};
} // namespace DirectAux
} // namespace Details
} // namespace BinSearch
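The machinery above implements a "direct" bucketed search: pick H large enough that no two code values share a bucket, precompute buckets[b] once, and each lookup reduces to a multiply, a float-to-int conversion, and at most one fix-up step. A minimal Python sketch of the Gap==1 idea, ignoring the floating-point H adjustment that computeH performs:

import numpy as np

def build_direct_index(x):
    x = np.asarray(x, dtype=np.float64)       # sorted, strictly increasing
    H = 1.0 / np.diff(x).min()                # bucket width = smallest gap
    b_of = lambda z: int(H * (z - x[0]))
    buckets = np.zeros(b_of(x[-1]) + 1, dtype=np.int64)
    j = 0
    for b in range(buckets.size):             # buckets[b] = largest i with b_of(x[i]) <= b
        while j + 1 < x.size and b_of(x[j + 1]) <= b:
            j += 1
        buckets[b] = j
    return x, H, buckets

def direct_lookup(index, z):                  # assumes x[0] <= z <= x[-1]
    x, H, buckets = index
    i = buckets[int(H * (z - x[0]))]
    if x[i] > z:                              # at most one step back: each bucket
        i -= 1                                # holds at most one code value
    return i                                  # so that x[i] <= z < x[i+1]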
ALGOENUM(DirectCacheFMA, 5)
ALGOENUM(DirectFMA, 15)
ALGOENUM(Direct2FMA, 25)
ALGOENUM(DirectCache, 10)
ALGOENUM(Direct, 20)
ALGOENUM(Direct2, 30)
ALGOENUM(Nonary, 40)
ALGOENUM(Pentary, 50)
ALGOENUM(Ternary, 60)
ALGOENUM(Eytzinger, 70)
ALGOENUM(BitSet, 80)
ALGOENUM(ClassicOffset, 90)
#ifdef PAPER_TEST
ALGOENUM(MorinOffset, 100)
ALGOENUM(BitSetNoPad, 110)
ALGOENUM(ClassicMod, 120)
ALGOENUM(MorinBranchy, 130)
ALGOENUM(Classic, 140)
ALGOENUM(LowerBound, 145)
#ifdef USE_MKL
ALGOENUM(MKL, 150)
#endif
#endif
#pragma once
#include "Type.h"
#include <algorithm>
namespace BinSearch {
template <InstrSet I, typename T, Algos A, bool L=false, bool R=false>
struct BinAlgo : Details::BinAlgoBase<I,T,A>
{
typedef Details::BinAlgoBase<I,T,A> base_t;
BinAlgo(const T* px, const uint32 n) : base_t(px, n), x0(px[0]), xN(px[n-1]), N(n) {}
BinAlgo(const T* px, const uint32 n, const typename base_t::Data& d) : base_t(d), x0(px[0]), xN(px[n-1]), N(n) {}
FORCE_INLINE
uint32 scalar(T z) const
{
if (!L || z >= x0)
if (!R || z < xN)
return base_t::scalar(z);
else
return N;
else
return std::numeric_limits<uint32>::max();
}
FORCE_INLINE
void vectorial(uint32 *pr, const T *pz, uint32 n) const
{
if (!L && !R) {
Details::Loop<T,base_t>::loop(*this, pr, pz, n);
}
else {
const uint32 nElem = base_t::nElem;
const uint32 idealbufsize = 256;
const uint32 bufsize = nElem * (idealbufsize / nElem + ((idealbufsize % nElem) ? 1 : 0));
T databuf[bufsize];
uint32 resbuf[bufsize];
uint32 indexbuf[bufsize];
uint32 *prend = pr + n;
while(pr != prend) {
uint32 cnt = 0;
uint32 niter = std::min(bufsize, (uint32)std::distance(pr,prend));
for (uint32 j = 0; j < niter; ++j) {
T z = pz[j];
// FIXME: use SSE2?
if (!L || z >= x0)
if (!R || z < xN) {
databuf[cnt] = z;
indexbuf[cnt] = j;
++cnt;
}
else
pr[j] = N;
else
pr[j] = std::numeric_limits<uint32>::max();
}
// FIXME: merge these two loops
Details::Loop<T,base_t>::loop(*this, resbuf, databuf, cnt);
for (uint32 j = 0; j < cnt; ++j)
pr[indexbuf[j]] = resbuf[j];
pr += niter;
pz += niter;
}
}
}
Details::CondData<T,L> x0;
Details::CondData<T,R> xN;
Details::CondData<uint32,R> N;
};
} // namespace BinSearch
#pragma once
#include "AAlloc.h"
#include "BinAlgo.h"
#include "SIMD.h"
#include <algorithm>
#include <limits>
#include "Algo-Direct2.h"
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"