Commit 74399248 authored by Tim Dettmers

Initial commit
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from bitsandbytes.optim.optimizer import Optimizer1State

class SGD(Optimizer1State):
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, optim_bits=32, args=None,
                 min_8bit_size=4096, percentile_clipping=100, block_wise=True):
        if momentum == 0:
            raise NotImplementedError('SGD without momentum is not supported!')
        super(SGD, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise)

class SGD8bit(Optimizer1State):
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, args=None,
                 min_8bit_size=4096, percentile_clipping=100, block_wise=True):
        if momentum == 0:
            raise NotImplementedError('SGD without momentum is not supported!')
        super(SGD8bit, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise)

class SGD32bit(Optimizer1State):
    def __init__(self, params, lr, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False, args=None,
                 min_8bit_size=4096, percentile_clipping=100, block_wise=True):
        if momentum == 0:
            raise NotImplementedError('SGD without momentum is not supported!')
        super(SGD32bit, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise)
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <float.h>
#include <ops.cuh>
#ifndef kernels
#define kernels
template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);
__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);
template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ const A, float * __restrict__ const absmax, T *out, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
float* state1, float* state2, float *unorm,
const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(T* g, T* p,
float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
float* state1, float *unorm,
const float beta1, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(T* g, T* p,
float* state1, float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1,
float *unorm,
const float beta1,
const float eps, const int step,
float* __restrict__ const quantiles1,
float* max1, float* new_max1,
const float weight_decay,
const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
const float *unorm, const float max_unorm, const float param_norm,
const float beta1,
const float eps, const int step, const float lr,
float* __restrict__ const quantiles1,
float* max1, float* new_max1,
float weight_decay, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__ const state1, unsigned char* __restrict__ const state2,
float *unorm,
const float beta1, const float beta2,
const float eps, const int step,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER>
__global__ void
kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
const float *unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2,
const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
const float beta1, const float beta2, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const int n);
template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
T* p, T* __restrict__ const g, unsigned char* state1,
const float beta1, const float beta2,
const float eps, const int step, const float lr,
float* __restrict__ const quantiles1,
float* absmax1,
float weight_decay,
const float gnorm_scale, const int n);
template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);
__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);
#endif
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <ops.cuh>
#include <kernels.cuh>
#include <cub/device/device_scan.cuh>
#include <pthread.h>
#include <limits>
#include <BinSearch.h>
using namespace BinSearch;
using std::cout;
using std::endl;
#define BLOCK_SIZE 4096
struct quantize_block_args
{
BinAlgo<Scalar, float, Direct2> *bin_searcher;
float *code;
float *A;
float *absmax;
unsigned char *out;
int block_end;
int block_idx;
int threadidx;
};
void *quantize_block(void *arguments)
{
// 1. find absmax in block
// 2. divide input value by absmax to normalize into [-1.0, 1.0]
// 3. do binary search to find the closest value
// 4. check minimal distance
// 5. store index
struct quantize_block_args *args = (quantize_block_args*)arguments;
// 1. find absmax in block
float absmax_block = -FLT_MAX;
for (int i = args->block_idx; i < args->block_end; i++)
absmax_block = fmax(absmax_block, fabs(args->A[i]));
args->absmax[args->block_idx/BLOCK_SIZE] = absmax_block;
for (int i = args->block_idx; i < args->block_end; i++)
{
// 2. divide input value by absmax to normalize into [-1.0, 1.0]
// 3. do binary search to find the closest value
float normed_value = args->A[i]/absmax_block;
int idx = args->bin_searcher->scalar(normed_value);
// 4. check minimal distance
// The binary search always returns the value to the left, which might not be the closest value
if(idx < 255)
{
float dist_left = fabs(normed_value-(args->code[idx]));
float dist_right = fabs(normed_value-(args->code[idx+1]));
if(dist_right < dist_left){ idx+=1; }
}
// 5. store index
args->out[i] = (unsigned char)idx;
}
return NULL;
}
void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n)
{
// the default code has range [-0.993, 1.0], which can cause an error in the binary search algorithm used below
code[0] = -1.0f;
int num_blocks = n/BLOCK_SIZE;
num_blocks += n % BLOCK_SIZE == 0 ? 0 : 1;
pthread_t *threads = (pthread_t*)malloc(sizeof(pthread_t)*num_blocks);
struct quantize_block_args **args = (quantize_block_args**)malloc(num_blocks*sizeof(quantize_block_args*));
for(int i = 0; i < num_blocks; i++)
args[i] = (quantize_block_args*)malloc(sizeof(quantize_block_args));
const uint32 elements_code = 256;
BinAlgo<Scalar, float, Direct2> bin_searcher(code, elements_code);
for(int block_idx = 0; block_idx < n; block_idx+=BLOCK_SIZE)
{
int valid_items = n-block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
int block_end = block_idx + valid_items;
struct quantize_block_args *arg = args[block_idx/BLOCK_SIZE];
arg->bin_searcher = &bin_searcher;
arg->code = code;
arg->A = A;
arg->absmax = absmax;
arg->out = out;
arg->block_end = block_end;
arg->block_idx = block_idx;
arg->threadidx = block_idx/BLOCK_SIZE;
pthread_create(&threads[block_idx/BLOCK_SIZE], NULL, &quantize_block, (void *)arg);
}
for(int i = 0; i < num_blocks; i++)
pthread_join(threads[i], NULL);
free(threads);
for(int i = 0; i < num_blocks; i++)
free(args[i]);
free(args);
}
void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n)
{
for(int block_idx = 0; block_idx < n; block_idx+=BLOCK_SIZE)
{
int valid_items = n-block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
int block_end = block_idx + valid_items;
for (int i = block_idx; i < block_end; i++)
out[i] = code[A[i]]*absmax[block_idx/BLOCK_SIZE];
}
}
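To make the five steps above concrete, here is a NumPy sketch of the same blockwise round trip (a simplified model of quantize_cpu/dequantize_cpu, assuming a sorted 256-entry code and substituting argmin for the binary search plus distance check):

import numpy as np

def quantize_blockwise_ref(code, A, block_size=4096):
    out = np.empty(A.size, dtype=np.uint8)
    absmax = np.empty((A.size + block_size - 1) // block_size, dtype=np.float32)
    for b, start in enumerate(range(0, A.size, block_size)):
        block = A[start:start + block_size]
        absmax[b] = np.abs(block).max()                  # 1. absmax per block
        normed = block / absmax[b]                       # 2. normalize into [-1, 1]
        # 3.-5. nearest code value (argmin replaces the binary search + distance check)
        out[start:start + block_size] = np.abs(code[None, :] - normed[:, None]).argmin(axis=1)
    return out, absmax

def dequantize_blockwise_ref(code, out, absmax, block_size=4096):
    return code[out] * np.repeat(absmax, block_size)[:out.size]

With A = np.random.randn(8192).astype(np.float32) and any sorted 256-entry code with code[0] == -1.0, dequantize_blockwise_ref(code, *quantize_blockwise_ref(code, A)) recovers A to within one code step per block.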
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
{
int threads = 512;
int blocks = n/threads;
blocks = n % threads == 0 ? blocks : blocks + 1;
kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
void quantize(float *code, float *A, unsigned char *out, int n)
{
int blocks = n/1024;
blocks = n % 1024 == 0 ? blocks : blocks + 1;
kQuantize<<<blocks, 1024>>>(code, A, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
void dequantize(float *code, unsigned char *A, float *out, int n)
{
int blocks = n/1024;
blocks = n % 1024 == 0 ? blocks : blocks + 1;
kDequantize<<<blocks, 1024>>>(code, A, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n)
{
int blocks = n/blocksize;
blocks = n % blocksize == 0 ? blocks : blocks + 1;
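// Note: only block sizes 4096 and 2048 have kernel instantiations below;
// any other blocksize falls through without launching a kernel.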
if(blocksize == 4096)
kDequantizeBlockwise<T, 4096, 1024, 4><<<blocks, 4096/4>>>(code, A, absmax, out, n);
else if(blocksize == 2048)
kDequantizeBlockwise<T, 2048, 512, 4><<<blocks, 2048/4>>>(code, A, absmax, out, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
const float beta1, const float beta2, const float eps, const float weight_decay,
const int step, const float lr, const float gnorm_scale, const int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
switch(OPTIMIZER)
{
case ADAM:
if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float)));
kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
kOptimizer32bit2State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
case MOMENTUM:
case RMSPROP:
if(max_unorm > 0.0f)
{
CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float)));
kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, unorm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
kOptimizer32bit1State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, unorm, max_unorm, param_norm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
}
}
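For reference, the ADAM branch above drives the standard two-state Adam recurrence. A NumPy sketch of that update follows; kernel-level details such as the max_unorm update clipping and gnorm_scale rescaling are omitted, so treat this as a conceptual reference rather than a port of the kernels:

import numpy as np

def adam_step_ref(p, g, m, v, step, lr, beta1=0.9, beta2=0.999, eps=1e-8):
    m[:] = beta1 * m + (1 - beta1) * g          # state1: first moment
    v[:] = beta2 * v + (1 - beta2) * g * g      # state2: second moment
    m_hat = m / (1 - beta1 ** step)             # bias correction
    v_hat = v / (1 - beta2 ** step)
    p -= lr * m_hat / (np.sqrt(v_hat) + eps)    # parameter update
    return p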
template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
unsigned char* state1, unsigned char* state2,
float *unorm, float max_unorm, float param_norm,
float beta1, float beta2,
float eps, int step, float lr,
float* quantiles1, float* quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay,
const float gnorm_scale, int n)
{
int blocks = n/4096;
blocks = n % 4096 == 0 ? blocks : blocks + 1;
if(max_unorm > 0.0f){ CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float))); }
switch(OPTIMIZER)
{
case ADAM:
CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1*sizeof(float)));
CUDA_CHECK_RETURN(cudaMemset(new_max2, 0, 1*sizeof(float)));
kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
kOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
case MOMENTUM:
case RMSPROP:
CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1*sizeof(float)));
kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, unorm, beta1, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
kOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, unorm, max_unorm, param_norm, beta1, eps, step, lr,
quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
default:
break;
}
}
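The 8-bit path keeps each optimizer state as uint8 indices into a 256-entry quantile code, rescaled by a running absolute maximum (max1/max2, re-estimated into new_max1/new_max2 by the precondition kernels). A conceptual NumPy sketch of that dequantize-update-requantize cycle for a single momentum-style state; the names and the argmin-based requantization are illustrative, not a port of the kernels:

import numpy as np

def update_8bit_state(idx, g, quantiles, max_old, beta1):
    s = quantiles[idx] * max_old               # dequantize the stored state
    s = beta1 * s + g                          # e.g. momentum accumulation
    new_max = np.abs(s).max()                  # scale for the next step
    normed = s / new_max
    idx_new = np.abs(quantiles[None, :] - normed[:, None]).argmin(axis=1)
    return idx_new.astype(np.uint8), new_max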
#define BLOCKSIZE_2STATE 2048
#define NUM_2STATE 8
#define BLOCKSIZE_1STATE 2048
#define NUM_1STATE 8
template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n)
{
int blocks = 0;
switch(OPTIMIZER)
{
case ADAM:
blocks = n/BLOCKSIZE_2STATE;
blocks = n % BLOCKSIZE_2STATE == 0 ? blocks : blocks + 1;
kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE><<<blocks, BLOCKSIZE_2STATE/NUM_2STATE>>>(p, g, state1, state2, beta1, beta2, eps, step, lr,
quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
case MOMENTUM:
case RMSPROP:
blocks = n/BLOCKSIZE_1STATE;
blocks = n % BLOCKSIZE_1STATE == 0 ? blocks : blocks + 1;
kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE><<<blocks, BLOCKSIZE_1STATE/NUM_1STATE>>>(p, g, state1, beta1, beta2, eps, step, lr,
quantiles1, absmax1, weight_decay, gnorm_scale, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
break;
}
}
template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n)
{
int blocks = n/2048;
blocks = n % 2048 == 0 ? blocks : blocks + 1;
CUDA_CHECK_RETURN(cudaMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
kPercentileClipping<T, 2048, 4><<<blocks, 512>>>(g, gnorm_vec, step, n);
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
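percentileClipping records gradient norms into a 100-slot ring buffer (gnorm_vec[step % 100]). The gnorm_scale passed to the optimizer launchers can then be derived from this history; one plausible host-side rule, shown only as an illustration since the actual scaling logic lives outside this file:

import numpy as np

def compute_gnorm_scale(gnorm_vec, current_gnorm, percentile=100):
    history = gnorm_vec[gnorm_vec > 0]          # assumes at least one recorded norm
    clip_at = np.percentile(history, percentile)
    # shrink the update when the current norm spikes above the chosen percentile
    return min(1.0, float(clip_at) / max(current_gnorm, 1e-12))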
//==============================================================
// TEMPLATE DEFINITIONS
//==============================================================
template void estimateQuantiles(half *A, float *code, float offset, int n);
template void estimateQuantiles(float *A, float *code, float offset, int n);
template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template void dequantizeBlockwise<half>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
template void dequantizeBlockwise<float>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
#define MAKE_optimizer32bit(name, gtype) \
template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
float* state1, float* state2, float* unorm, float max_unorm, float param_norm, \
const float beta1, const float beta2, const float eps, const float weight_decay, \
const int step, const float lr, const float gnorm_scale, const int n);
MAKE_optimizer32bit(ADAM, half)
MAKE_optimizer32bit(ADAM, float)
MAKE_optimizer32bit(MOMENTUM, half)
MAKE_optimizer32bit(MOMENTUM, float)
MAKE_optimizer32bit(RMSPROP, half)
MAKE_optimizer32bit(RMSPROP, float)
#define MAKE_optimizerStatic8bit(name, gtype) \
template void optimizerStatic8bit<gtype, name>(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \
float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, \
const float gnorm_scale, int n);
MAKE_optimizerStatic8bit(ADAM, half)
MAKE_optimizerStatic8bit(ADAM, float)
MAKE_optimizerStatic8bit(MOMENTUM, half)
MAKE_optimizerStatic8bit(MOMENTUM, float)
MAKE_optimizerStatic8bit(RMSPROP, half)
MAKE_optimizerStatic8bit(RMSPROP, float)
#define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name) \
template void optimizerStatic8bitBlockwise<gtype, optim_name>(gtype* p, gtype* g, \
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n);
MAKE_optimizerStatic8bitBlockwise(half, ADAM);
MAKE_optimizerStatic8bitBlockwise(float, ADAM);
MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ops_H
#define ops_H
#include <stdio.h>
#include <iostream>
#include <unistd.h>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#define CUDA_CHECK_RETURN(value) { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
fprintf(stderr, "Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(1); \
} }
#define THREADS_PER_BLOCKS (512)
typedef enum Operations_t
{
ksmul = 0,
} Operations_t;
typedef enum Optimizer_t
{
ADAM = 0,
MOMENTUM = 1,
RMSPROP = 2,
LARS = 3,
} Optimizer_t;
template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n);
void quantize(float *code, float *A, unsigned char *out, int n);
void dequantize(float *code, unsigned char *A, float *out, int n);
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
float beta1, float beta2, float eps, float weight_decay,
int step, float lr, const float gnorm_scale, int n);
template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
float *unorm, float max_unorm, float param_norm,
float beta1, float beta2,
float eps, int step, float lr,
float* quantiles1, float* quantiles2,
float* max1, float* max2, float* new_max1, float* new_max2,
float weight_decay,
const float gnorm_scale, int n);
template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n);
template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);
void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n);
void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n);
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);
#endif
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <ops.cuh>
// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
// We use macro functions to expand all the different optimizers. It looks ugly, and is ugly, but it's better
// than maintaining all that boilerplate by hand.
//===================================================================================
// UNMANGLED CALLS
//===================================================================================
void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); }
void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); }
#define MAKE_FUNC32(fname, oname, gtype, gbits) \
void fname##32bit_g##gbits(gtype *g, gtype *p, \
float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
const float beta1, const float beta2, const float eps, const float weight_decay, \
const int step, const float lr, float gnorm_scale, const int n) \
{ optimizer32bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n); }
MAKE_FUNC32(momentum, MOMENTUM, float, 32)
MAKE_FUNC32(momentum, MOMENTUM, half, 16)
MAKE_FUNC32(adam, ADAM, float, 32)
MAKE_FUNC32(adam, ADAM, half, 16)
MAKE_FUNC32(rmsprop, RMSPROP, float, 32)
MAKE_FUNC32(rmsprop, RMSPROP, half, 16)
#define MAKE_FUNC8(fname, oname, gtype, gbits) \
void fname##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \
float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, float gnorm_scale, int n) \
{ \
optimizerStatic8bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
}
MAKE_FUNC8(adam, ADAM, float, 32)
MAKE_FUNC8(adam, ADAM, half, 16)
MAKE_FUNC8(momentum, MOMENTUM, float, 32)
MAKE_FUNC8(momentum, MOMENTUM, half, 16)
MAKE_FUNC8(rmsprop, RMSPROP, float, 32)
MAKE_FUNC8(rmsprop, RMSPROP, half, 16)
#define MAKE_BLOCKWISE8(fname, optim_name, gtype, gbits) \
void fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n)\
{ optimizerStatic8bitBlockwise<gtype, optim_name>(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, n); }
MAKE_BLOCKWISE8(adam, ADAM, half, 16)
MAKE_BLOCKWISE8(adam, ADAM, float, 32)
MAKE_BLOCKWISE8(momentum, MOMENTUM, half, 16)
MAKE_BLOCKWISE8(momentum, MOMENTUM, float, 32)
MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, 16)
MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, 32)
void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping<float>(g, gnorm_vec, step, n); }
void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping<half>(g, gnorm_vec, step, n); }
void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<half, 0>(code, A, absmax, out, NULL, 0, n); }
void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise<float, 0>(code, A, absmax, out, NULL, 0, n); }
void quantizeBlockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<half, 1>(code, A, absmax, out, rand, rand_offset, n); }
void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n){ quantizeBlockwise<float, 1>(code, A, absmax, out, rand, rand_offset, n); }
void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half>(code, A, absmax, out, blocksize, n); }
void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float>(code, A, absmax, out, blocksize, n); }
extern "C"
{
void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, n); }
void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, n); }
void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); }
void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); }
void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
#define MAKE_CFUNC32(name, gtype, gbits) \
void c##name##32bit_g##gbits(gtype *g, gtype *p, \
float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
const float beta1, const float beta2, const float eps, const float weight_decay, \
const int step, const float lr, const float gnorm_scale, const int n) \
{ name##32bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n); }
MAKE_CFUNC32(adam, float, 32)
MAKE_CFUNC32(adam, half, 16)
MAKE_CFUNC32(momentum, float, 32)
MAKE_CFUNC32(momentum, half, 16)
MAKE_CFUNC32(rmsprop, float, 32)
MAKE_CFUNC32(rmsprop, half, 16)
#define MAKE_CFUNC8(name, gtype, gbits) \
void c##name##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \
float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, float gnorm_scale, int n) \
{ \
name##_static_8bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
}
MAKE_CFUNC8(adam, float, 32)
MAKE_CFUNC8(adam, half, 16)
MAKE_CFUNC8(momentum, float, 32)
MAKE_CFUNC8(momentum, half, 16)
MAKE_CFUNC8(rmsprop, float, 32)
MAKE_CFUNC8(rmsprop, half, 16)
#define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \
void c##fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, int n) \
{ fname##_8bit_blockwise_fp##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, n); }
MAKE_CBLOCKWISE8(adam, ADAM, half, 16)
MAKE_CBLOCKWISE8(adam, ADAM, float, 32)
MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, 16)
MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, 32)
MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, 16)
MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, 32)
void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, const int n){ quantize_cpu(code, A, absmax, out, n); }
void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, const int n){ dequantize_cpu(code, A, absmax, out, n); }
void chistogram_scatter_add_2d(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n){ histogramScatterAdd2D(histogram, index1, index2, src, maxidx1, n); }
}
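As a usage sketch, the extern "C" entry points above can be driven from Python via ctypes. The shared-library filename below is an assumption for illustration:

import ctypes
import numpy as np

lib = ctypes.CDLL('libbitsandbytes.so')   # assumed build artifact name

def quantize_blockwise_cpu(code, A, block_size=4096):
    n = A.size
    absmax = np.empty((n + block_size - 1) // block_size, dtype=np.float32)
    out = np.empty(n, dtype=np.uint8)
    fptr = lambda a: a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    lib.cquantize_blockwise_cpu_fp32(fptr(code), fptr(A), fptr(absmax),
                                     out.ctypes.data_as(ctypes.POINTER(ctypes.c_ubyte)),
                                     ctypes.c_int(n))
    return out, absmax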
#!/bin/bash
rm -rf dist build
make clean
CUDA_HOME=/usr/local/cuda-10.2 make
CUDA_VERSION=102 python -m build
python -m twine upload --repository testpypi dist/* --verbose
rm -rf dist build
make clean
CUDA_HOME=/usr/local/cuda-11.1 make
CUDA_VERSION=111 python -m build
python -m twine upload --repository testpypi dist/* --verbose
#!/bin/bash
module unload cuda
module unload gcc
rm -rf dist build
make clean
make cleaneggs
module load cuda/9.2
module load gcc/7.3.0
CUDA_HOME=/public/apps/cuda/9.2 make
CUDA_VERSION=92 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/10.0
CUDA_HOME=/public/apps/cuda/10.0 make cuda10x
CUDA_VERSION=100 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
module unload gcc
module load gcc/8.4
rm -rf dist build
make clean
make cleaneggs
module load cuda/10.1
CUDA_HOME=/public/apps/cuda/10.1 make cuda10x
CUDA_VERSION=101 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/10.2
CUDA_HOME=/public/apps/cuda/10.2/ make cuda10x
CUDA_VERSION=102 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/11.0
CUDA_HOME=/public/apps/cuda/11.0 make cuda110
CUDA_VERSION=110 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/11.1
CUDA_HOME=/public/apps/cuda/11.1 make cuda11x
CUDA_VERSION=111 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
module load cuda/11.2
CUDA_HOME=/public/apps/cuda/11.2 make cuda11x
CUDA_VERSION=112 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
rm -rf dist build
make clean
make cleaneggs
CUDA_HOME=/private/home/timdettmers/git/autoswap/local/cuda-11.3 make cuda11x
CUDA_VERSION=113 python -m build
python -m twine upload --repository testpypi dist/* --verbose
module unload cuda
#pragma once
#include "Portable.h"
namespace BinSearch {
namespace Details {
template <typename T>
bool isAligned(const T *p, size_t A)
{
return (reinterpret_cast<size_t>(p) % A) == 0;
}
template <class T, size_t A=64>
struct AlignedVec
{
AlignedVec()
: m_storage(0)
, m_data(0)
, m_sz(0)
{
}
static size_t nBytes(size_t sz)
{
return sz * sizeof(T) + A;
}
static size_t shiftAmt(char *p)
{
return A>1? (A - (reinterpret_cast<size_t>(p) % A)) % A: 0;
}
void setPtr(char *p, size_t sz)
{
m_sz = sz;
m_data = reinterpret_cast<T *>(p + shiftAmt(p));
}
//void setPtr(T *p, size_t sz)
//{
// m_sz = sz;
// if (A>1)
// myassert(((reinterpret_cast<size_t>(p) % A) == 0), "bad alignment");
// m_data = p;
//}
// internal allocation
void resize(size_t sz)
{
m_storage = new char[nBytes(sz)];
setPtr(m_storage, sz);
}
// external allocation
void set(char *storage, size_t sz)
{
setPtr(storage, sz);
}
~AlignedVec()
{
if (m_storage)
delete [] m_storage;
}
size_t size() const { return m_sz; }
T& operator[](size_t i) { return m_data[i]; }
const T& operator[](size_t i) const { return m_data[i]; }
T* begin() { return m_data; }
T* end() { return m_data+m_sz; }
const T* begin() const { return m_data; }
const T* end() const { return m_data+m_sz; }
T& front() { return m_data[0]; }
T& back() { return m_data[m_sz-1]; }
const T& front() const { return m_data[0]; }
const T& back() const { return m_data[m_sz - 1]; }
private:
char *m_storage;
T *m_data;
size_t m_sz;
};
} // namespace Details
} // namespace BinSearch
#pragma once
#include <algorithm>
#include <limits>
#include <type_traits>
#include "AAlloc.h"
namespace BinSearch {
namespace Details {
namespace DirectAux {
#define SAFETY_MULTI_PASS true
template <typename T>
struct HResults
{
HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {}
T H;
double hRatio;
size_t nInc;
};
#ifdef USE_FMA
template <Algos A> struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); };
template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); };
template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); };
#else
template <Algos A> struct IsDirect { static const bool value = (A == Direct); };
template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2); };
template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache); };
#endif
// general definition
template <Algos A, typename T, typename Enable = void>
struct BucketElem
{
FORCE_INLINE void set( uint32 b, const T *)
{
m_b = b;
}
FORCE_INLINE uint32 index() const { return m_b; }
private:
uint32 m_b;
};
// specialization for DirectCache methods
template <typename T> struct MatchingIntType;
template <> struct MatchingIntType<double> { typedef uint64 type; };
template <> struct MatchingIntType<float> { typedef uint32 type; };
template <Algos A, typename T>
struct BucketElem<A, T, typename std::enable_if< IsDirectCache<A>::value >::type >
{
typedef typename MatchingIntType<T>::type I;
void set(uint32 b, const T *xi)
{
u.u.x = xi[b];
u.u.b = b;
}
FORCE_INLINE I index() const { return u.u.b; }
FORCE_INLINE T x() const { return u.u.x; }
private:
union {
double dummy;
struct
{
T x;
I b;
} u;
} u;
};
template <bool UseFMA, unsigned char Gap, typename T>
struct DirectTraits
{
static void checkH(T scaler, T x0, T xN)
{
T Dn = xN - x0;
T ifmax = Dn * scaler;
myassert((ifmax < std::numeric_limits<uint32>::max() - (Gap - 1)),
"Problem unfeasible: index size exceeds uint32 capacity:"
<< " D[N] =" << Dn
<< ", H =" << scaler
<< ", H D[n] =" << ifmax << "\n"
);
}
FORCE_INLINE static uint32 f(T scaler, T x0, T z)
{
T tmp = scaler * (z - x0);
#ifdef USE_SSE2
return ftoi(FVec1<SSE,T>(tmp));
#else
return static_cast<uint32>(tmp);
#endif
}
template <InstrSet I>
FORCE_INLINE static typename FTOITraits<I, T>::vec_t f(const FVec<I, T>& scaler, const FVec<I, T>& x0, const FVec<I, T>& z)
{
return ftoi(scaler*(z-x0));
}
static T cst0(T scaler, T x0)
{
return x0;
}
};
#ifdef USE_FMA
template <unsigned char Gap, typename T>
struct DirectTraits<true,Gap,T>
{
typedef FVec1<SSE, T> fVec1;
static void checkH(T scaler, T H_Times_x0, T xN)
{
union {
typename FVec1<SSE, T>::vec_t v;
T s;
} ifmax;
ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0));
myassert((ifmax.s < std::numeric_limits<uint32>::max() - (Gap - 1)),
"Problem unfeasible: index size exceeds uint32 capacity:"
<< " H X[0] =" << H_Times_x0
<< ", H =" << scaler
<< ", X[N] =" << xN
<< ", H X[N] - H X[0] =" << ifmax.s << "\n"
);
}
FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi)
{
return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0)));
}
template <InstrSet I>
FORCE_INLINE static typename FTOITraits<I,T>::vec_t f(const FVec<I,T>& scaler, const FVec<I, T>& H_Times_X0, const FVec<I, T>& z)
{
return ftoi(mulSub(scaler, z, H_Times_X0));
}
static T cst0(T scaler, T x0)
{
return scaler*x0;
}
};
#endif
template <unsigned char Gap, typename T, Algos A>
struct DirectInfo
{
static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA);
typedef DirectTraits<UseFMA, Gap, T> fun_t;
typedef BucketElem<A,T> bucket_t;
typedef AlignedVec<bucket_t> bucketvec_t;
struct Data {
Data() : buckets(0), xi(0), scaler(0), cst0(0) {}
Data( const T *x // for Direct must persist if xws=NULL
, uint32 n
, T H
, bucket_t *bws // assumed to have size nb, as computed below
, T *xws = NULL // assumed to have size (n+Gap-1). Optional for Direct, unused for DirectCache, required for DirectGap
)
: buckets(bws)
, scaler(H)
, cst0(fun_t::cst0(H, x[0]))
{
myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned");
uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]);
const uint32 npad = Gap-1;
const uint32 n_sz = n + npad; // size of padded vector
if (xws) {
myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned");
std::fill_n(xws, npad, x[0]); // pad in front with x[0]
std::copy(x, x+n, xws + npad);
xi = xws;
}
else {
myassert(Gap==1, "if Gap>1 then X workspace must be provided");
xi = x;
}
populateIndex(bws, nb, xi, n_sz, scaler, cst0);
}
const bucket_t *buckets;
const T *xi;
T scaler;
T cst0; // could be x0 or (scaler*x0), depending if we are using FMA or not
} data;
static T growStep(T H)
{
T step;
T P = next(H);
while ((step = P - H) == 0)
P = next(P);
return step;
}
static HResults<T> computeH(const T *px, uint32 nx)
{
myassert((nx > Gap), "Array X too small");
myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap");
const T x0 = px[0];
const T xN = px[nx-1];
const T range = xN - x0;
myassert((range < std::numeric_limits<T>::max()), "range too large");
// check that D_i are strictly increasing and compute minimum value D_{i+Offset}-D_i
T deltaDMin = range;
for (uint32 i = Gap; i < nx; ++i) {
T Dnew = px[i] - x0;
T Dold = px[i - Gap] - x0;
myassert((Dnew > Dold),
"Problem unfeasible: D_i sequence not strictly increasing"
<< " X[" << 0 << "]=" << x0
<< " X[" << i - Gap << "]=" << px[i - Gap]
<< " X[" << i << "]=" << px[i]
<< "\n"
);
T deltaD = Dnew - Dold;
if (deltaD < deltaDMin)
deltaDMin = deltaD;
}
// initial guess for H
const T H0 = T(1.0) / deltaDMin;
T H = H0;
T cst0 = fun_t::cst0(H, x0);
fun_t::checkH(H, cst0, xN);
// adjust H by trial and error until succeed
size_t nInc = 0;
bool modified = false;
size_t npasses = 0;
T step = growStep(H);
uint32 seg_already_checked_from = nx;
do {
myassert((npasses++ < 2), "verification failed\n");
// if there has been an increase, then check only up to that point
uint32 last_seg_to_be_checked = seg_already_checked_from - 1;
modified = false;
for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) {
uint32 iold = fun_t::f(H, cst0, px[i-Gap]);
uint32 inew = fun_t::f(H, cst0, px[i]);
while (inew == iold) {
seg_already_checked_from = i;
last_seg_to_be_checked = nx-1; // everything needs to be checked
modified = true;
H = H + step;
step *= 2;
// recalculate all constants and indices
cst0 = fun_t::cst0(H, x0);
fun_t::checkH(H, cst0, xN);
iold = fun_t::f(H, cst0, px[i - Gap]);
inew = fun_t::f(H, cst0, px[i]);
}
}
} while (SAFETY_MULTI_PASS && modified);
return HResults<T>(H, (((double)H) / H0) - 1.0, nInc);
}
static void populateIndex(BucketElem<A, T> *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0)
{
for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) {
uint32 idx = fun_t::f(scaler, cst0, px[i]);
while (b > idx) { // on the first iteration j is still 0, but this condition is then always false
buckets[b].set( j, px );
--b;
}
if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant
j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check
buckets[b].set(j, px);
if (b-- == 0)
break;
}
}
}
DirectInfo(const Data& d)
: data(d)
{
}
DirectInfo(const T* px, const uint32 n)
{
HResults<T> res = computeH(px, n);
#ifdef PAPER_TEST
nInc = res.nInc;
hRatio = res.hRatio;
#endif
const uint32 npad = Gap-1;
const uint32 n_sz = n + npad; // size of padded vector
if (npad)
xi.resize(n_sz);
T H = res.H;
T cst0 = fun_t::cst0(H, px[0]);
const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]);
buckets.resize(maxIndex + 1);
data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL));
}
private:
bucketvec_t buckets;
AlignedVec<T,8> xi;
#ifdef PAPER_TEST
public:
double hRatio;
size_t nInc;
#endif
};
} // namespace DirectAux
} // namespace Details
} // namespace BinSearch
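The machinery above implements a "direct" bucketed search: pick H large enough that no two code values share a bucket, precompute buckets[b] once, and each lookup reduces to a multiply, a float-to-int conversion, and at most one fix-up step. A minimal Python sketch of the Gap==1 idea, ignoring the floating-point H adjustment that computeH performs:

import numpy as np

def build_direct_index(x):
    x = np.asarray(x, dtype=np.float64)       # sorted, strictly increasing
    H = 1.0 / np.diff(x).min()                # bucket width = smallest gap
    b_of = lambda z: int(H * (z - x[0]))
    buckets = np.zeros(b_of(x[-1]) + 1, dtype=np.int64)
    j = 0
    for b in range(buckets.size):             # buckets[b] = largest i with b_of(x[i]) <= b
        while j + 1 < x.size and b_of(x[j + 1]) <= b:
            j += 1
        buckets[b] = j
    return x, H, buckets

def direct_lookup(index, z):                  # assumes x[0] <= z <= x[-1]
    x, H, buckets = index
    i = buckets[int(H * (z - x[0]))]
    if x[i] > z:                              # at most one step back: each bucket
        i -= 1                                # holds at most one code value
    return i                                  # so that x[i] <= z < x[i+1]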
ALGOENUM(DirectCacheFMA, 5)
ALGOENUM(DirectFMA, 15)
ALGOENUM(Direct2FMA, 25)
ALGOENUM(DirectCache, 10)
ALGOENUM(Direct, 20)
ALGOENUM(Direct2, 30)
ALGOENUM(Nonary, 40)
ALGOENUM(Pentary, 50)
ALGOENUM(Ternary, 60)
ALGOENUM(Eytzinger, 70)
ALGOENUM(BitSet, 80)
ALGOENUM(ClassicOffset, 90)
#ifdef PAPER_TEST
ALGOENUM(MorinOffset, 100)
ALGOENUM(BitSetNoPad, 110)
ALGOENUM(ClassicMod, 120)
ALGOENUM(MorinBranchy, 130)
ALGOENUM(Classic, 140)
ALGOENUM(LowerBound, 145)
#ifdef USE_MKL
ALGOENUM(MKL, 150)
#endif
#endif
#pragma once
#include "Type.h"
#include <algorithm>
namespace BinSearch {
template <InstrSet I, typename T, Algos A, bool L=false, bool R=false>
struct BinAlgo : Details::BinAlgoBase<I,T,A>
{
typedef Details::BinAlgoBase<I,T,A> base_t;
BinAlgo(const T* px, const uint32 n) : base_t(px, n), x0(px[0]), xN(px[n-1]), N(n) {}
BinAlgo(const T* px, const uint32 n, const typename base_t::Data& d) : base_t(d), x0(px[0]), xN(px[n-1]), N(n) {}
FORCE_INLINE
uint32 scalar(T z) const
{
if (!L || z >= x0)
if (!R || z < xN)
return base_t::scalar(z);
else
return N;
else
return std::numeric_limits<uint32>::max();
}
FORCE_INLINE
void vectorial(uint32 *pr, const T *pz, uint32 n) const
{
if (!L && !R) {
Details::Loop<T,base_t>::loop(*this, pr, pz, n);
}
else {
const uint32 nElem = base_t::nElem;
const uint32 idealbufsize = 256;
const uint32 bufsize = nElem * (idealbufsize / nElem + ((idealbufsize % nElem) ? 1 : 0));
T databuf[bufsize];
uint32 resbuf[bufsize];
uint32 indexbuf[bufsize];
uint32 *prend = pr + n;
while(pr != prend) {
uint32 cnt = 0;
uint32 niter = std::min(bufsize, (uint32)std::distance(pr,prend));
for (uint32 j = 0; j < niter; ++j) {
T z = pz[j];
// FIXME: use SSE2?
if (!L || z >= x0)
if (!R || z < xN) {
databuf[cnt] = z;
indexbuf[cnt] = j;
++cnt;
}
else
pr[j] = N;
else
pr[j] = std::numeric_limits<uint32>::max();
}
// FIXME: merge these two loops
Details::Loop<T,base_t>::loop(*this, resbuf, databuf, cnt);
for (uint32 j = 0; j < cnt; ++j)
pr[indexbuf[j]] = resbuf[j];
pr += niter;
pz += niter;
}
}
}
Details::CondData<T,L> x0;
Details::CondData<T,R> xN;
Details::CondData<uint32,R> N;
};
} // namespace BinSearch
#pragma once
#include "AAlloc.h"
#include "BinAlgo.h"
#include "SIMD.h"
#include <algorithm>
#include <limits>
#include "Algo-Direct2.h"
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"