Commit 8258b436 authored by Max Ryabinin

Add a CPU-only build option

parent 33efe4a0
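This commit makes the shared library buildable without a CUDA toolchain: a new "make cpuonly" target compiles only the C++ sources (common.cpp, cpu_ops.cpp, pythonInterface.c) into libbitsandbytes.so, the CUDA targets now pass -DBUILD_CUDA so the C interface can compile its GPU wrappers conditionally, and a new bitsandbytes/cextension.py module probes the installed library at import time and exposes the result as COMPILED_WITH_CUDA, which gates the CUDA-only Python features (8-bit optimizers and GPU quantization).

A hedged usage sketch of the resulting behavior (module layout as introduced in this commit; the printed value depends on how the library was built):

    import bitsandbytes  # a CPU-only build emits the UserWarning from cextension.py here
    from bitsandbytes.cextension import COMPILED_WITH_CUDA

    print(COMPILED_WITH_CUDA)  # False after "make cpuonly", True after e.g. "make cuda11x"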
Makefile

@@ -10,10 +10,10 @@ NVCC := $(CUDA_HOME)/bin/nvcc
 ###########################################

 CSRC := $(ROOT_DIR)/csrc
-BUILD_DIR:= $(ROOT_DIR)/cuda_build
+BUILD_DIR:= $(ROOT_DIR)/build

 FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
-FILES_CPP := $(CSRC)/pythonInterface.c
+FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c

 INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
@@ -46,27 +46,30 @@ CC_CUDA11x += -gencode arch=compute_86,code=sm_86
 all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)

 cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)

 cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)

 cuda110: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)

 cuda11x: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
+	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)

+cpuonly: $(BUILD_DIR) env
+	$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so
+
 env:
 	@echo "ENVIRONMENT"
@@ -80,7 +83,7 @@ env:
 	@echo "============================"

 $(BUILD_DIR):
-	mkdir -p cuda_build
+	mkdir -p build
 	mkdir -p dependencies

 $(ROOT_DIR)/dependencies/cub:
bitsandbytes/__init__.py

@@ -2,9 +2,14 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

-from .optim import adam
 from .nn import modules
+from .cextension import COMPILED_WITH_CUDA
+
+if COMPILED_WITH_CUDA:
+    from .optim import adam

-__pdoc__ = {'libBitsNBytes' : False,
+__pdoc__ = {'libBitsNBytes': False,
             'optim.optimizer.Optimizer8bit': False,
             'optim.optimizer.MockArgs': False
             }
bitsandbytes/cextension.py (new file)

+import ctypes as ct
+import os
+from warnings import warn
+
+lib = ct.cdll.LoadLibrary(os.path.dirname(__file__) + '/libbitsandbytes.so')
+
+try:
+    lib.cadam32bit_g32
+    COMPILED_WITH_CUDA = True
+except AttributeError:
+    warn("The installed version of bitsandbytes was compiled without GPU support. "
+         "8-bit optimizers and GPU quantization are unavailable.")
+    COMPILED_WITH_CUDA = False
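The probe above relies on ctypes resolving exported symbols lazily: accessing lib.cadam32bit_g32 only raises AttributeError if the symbol is absent, which is exactly the case for a library built without -DBUILD_CUDA. A minimal self-contained sketch of the same idiom (libc as a stand-in, assuming a POSIX platform; names are illustrative, not part of this commit):

    import ctypes as ct

    libc = ct.CDLL(None)      # handle to the current process, which links libc on POSIX
    try:
        libc.printf           # attribute access triggers the symbol lookup
        has_symbol = True
    except AttributeError:
        has_symbol = False    # how cextension.py detects a CPU-only build
    print(has_symbol)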
bitsandbytes/functional.py

@@ -3,38 +3,38 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

 import ctypes as ct
-import os
 import random
 from typing import Tuple

 import torch
 from torch import Tensor

-lib = ct.cdll.LoadLibrary(os.path.dirname(__file__) + '/libbitsandbytes.so')
+from .cextension import lib, COMPILED_WITH_CUDA

 name2qmap = {}

-''' C FUNCTIONS FOR OPTIMIZERS '''
-str2optimizer32bit = {}
-str2optimizer32bit['adam'] = (lib.cadam32bit_g32, lib.cadam32bit_g16)
-str2optimizer32bit['momentum'] = (lib.cmomentum32bit_g32, lib.cmomentum32bit_g16)
-str2optimizer32bit['rmsprop'] = (lib.crmsprop32bit_g32, lib.crmsprop32bit_g16)
-str2optimizer32bit['adagrad'] = (lib.cadagrad32bit_g32, lib.cadagrad32bit_g16)
-str2optimizer32bit['lars'] = (lib.cmomentum32bit_g32, lib.cmomentum32bit_g16)
-str2optimizer32bit['lamb'] = (lib.cadam32bit_g32, lib.cadam32bit_g16)
-
-str2optimizer8bit = {}
-str2optimizer8bit['adam'] = (lib.cadam_static_8bit_g32, lib.cadam_static_8bit_g16)
-str2optimizer8bit['momentum'] = (lib.cmomentum_static_8bit_g32, lib.cmomentum_static_8bit_g16)
-str2optimizer8bit['rmsprop'] = (lib.crmsprop_static_8bit_g32, lib.crmsprop_static_8bit_g16)
-str2optimizer8bit['lamb'] = (lib.cadam_static_8bit_g32, lib.cadam_static_8bit_g16)
-str2optimizer8bit['lars'] = (lib.cmomentum_static_8bit_g32, lib.cmomentum_static_8bit_g16)
-
-str2optimizer8bit_blockwise = {}
-str2optimizer8bit_blockwise['adam'] = (lib.cadam_8bit_blockwise_fp32, lib.cadam_8bit_blockwise_fp16)
-str2optimizer8bit_blockwise['momentum'] = (lib.cmomentum_8bit_blockwise_fp32, lib.cmomentum_8bit_blockwise_fp16)
-str2optimizer8bit_blockwise['rmsprop'] = (lib.crmsprop_8bit_blockwise_fp32, lib.crmsprop_8bit_blockwise_fp16)
-str2optimizer8bit_blockwise['adagrad'] = (lib.cadagrad_8bit_blockwise_fp32, lib.cadagrad_8bit_blockwise_fp16)
+if COMPILED_WITH_CUDA:
+    ''' C FUNCTIONS FOR OPTIMIZERS '''
+    str2optimizer32bit = {}
+    str2optimizer32bit['adam'] = (lib.cadam32bit_g32, lib.cadam32bit_g16)
+    str2optimizer32bit['momentum'] = (lib.cmomentum32bit_g32, lib.cmomentum32bit_g16)
+    str2optimizer32bit['rmsprop'] = (lib.crmsprop32bit_g32, lib.crmsprop32bit_g16)
+    str2optimizer32bit['adagrad'] = (lib.cadagrad32bit_g32, lib.cadagrad32bit_g16)
+    str2optimizer32bit['lars'] = (lib.cmomentum32bit_g32, lib.cmomentum32bit_g16)
+    str2optimizer32bit['lamb'] = (lib.cadam32bit_g32, lib.cadam32bit_g16)
+
+    str2optimizer8bit = {}
+    str2optimizer8bit['adam'] = (lib.cadam_static_8bit_g32, lib.cadam_static_8bit_g16)
+    str2optimizer8bit['momentum'] = (lib.cmomentum_static_8bit_g32, lib.cmomentum_static_8bit_g16)
+    str2optimizer8bit['rmsprop'] = (lib.crmsprop_static_8bit_g32, lib.crmsprop_static_8bit_g16)
+    str2optimizer8bit['lamb'] = (lib.cadam_static_8bit_g32, lib.cadam_static_8bit_g16)
+    str2optimizer8bit['lars'] = (lib.cmomentum_static_8bit_g32, lib.cmomentum_static_8bit_g16)
+
+    str2optimizer8bit_blockwise = {}
+    str2optimizer8bit_blockwise['adam'] = (lib.cadam_8bit_blockwise_fp32, lib.cadam_8bit_blockwise_fp16)
+    str2optimizer8bit_blockwise['momentum'] = (lib.cmomentum_8bit_blockwise_fp32, lib.cmomentum_8bit_blockwise_fp16)
+    str2optimizer8bit_blockwise['rmsprop'] = (lib.crmsprop_8bit_blockwise_fp32, lib.crmsprop_8bit_blockwise_fp16)
+    str2optimizer8bit_blockwise['adagrad'] = (lib.cadagrad_8bit_blockwise_fp32, lib.cadagrad_8bit_blockwise_fp16)

 optimal_normal = [-0.9939730167388916, -0.8727636337280273, -0.8097418546676636, -0.7660024166107178, -0.7318882346153259, -0.6793879270553589, -0.657649040222168, -0.6385974884033203, -0.6211113333702087, -0.5901028513908386, -0.5762918591499329, -0.5630806684494019, -0.5509274005889893, -0.5394591689109802, -0.5283197164535522, -0.517780065536499, -0.5074946284294128, -0.4980469048023224, -0.48867011070251465, -0.48003149032592773, -0.47125306725502014, -0.4629971981048584, -0.4547359049320221, -0.446626216173172, -0.43902668356895447, -0.43158355355262756, -0.4244747757911682, -0.4173796474933624, -0.41038978099823, -0.4055633544921875, -0.4035947024822235, -0.39701032638549805, -0.39057496190071106, -0.38439232110977173, -0.3782760500907898, -0.3721940815448761, -0.3661896586418152, -0.3604033589363098, -0.354605108499527, -0.34892538189888, -0.34320303797721863, -0.3376772701740265, -0.3323028087615967, -0.3269782066345215, -0.32166096568107605, -0.316457599401474, -0.3112771809101105, -0.3061025142669678, -0.30106794834136963, -0.2961243987083435, -0.2912728488445282, -0.28644347190856934, -0.28165507316589355, -0.2769731283187866, -0.2722635865211487, -0.26779335737228394, -0.26314786076545715, -0.2586647868156433, -0.2541804611682892, -0.2496625930070877, -0.24527113139629364, -0.24097171425819397, -0.23659978806972504, -0.23218469321727753, -0.22799566388130188, -0.22380566596984863, -0.21965542435646057, -0.2154538631439209, -0.2113603949546814, -0.20735277235507965, -0.20334717631340027, -0.19932441413402557, -0.19530178606510162, -0.19136647880077362, -0.18736697733402252, -0.18337111175060272, -0.17951400578022003, -0.1757056713104248, -0.17182783782482147, -0.1680615097284317, -0.16431649029254913, -0.16053077578544617, -0.15685945749282837, -0.15298527479171753, -0.1493264138698578, -0.14566898345947266, -0.14188314974308014, -0.13819937407970428, -0.1344561129808426, -0.1306886374950409, -0.1271020770072937, -0.12346585839986801, -0.11981867253780365, -0.11614970862865448, -0.11256207525730133, -0.10889036953449249, -0.10525048524141312, -0.1016591489315033, -0.09824034571647644, -0.09469068050384521, -0.0911419615149498, -0.08773849159479141, -0.08416644483804703, -0.08071305602788925, -0.07720902562141418, -0.07371306419372559, -0.07019119709730148, -0.06673648208379745, -0.06329209357500076, -0.059800852090120316, -0.0564190037548542, -0.05296570807695389, -0.049522045999765396, -0.04609023034572601, -0.04262964054942131, -0.039246633648872375, -0.03577171266078949, -0.03236335143446922, -0.028855687007308006, -0.02542758360505104, -0.022069433704018593, -0.018754752352833748, -0.015386369079351425, -0.01194947212934494, -0.008439815603196621, -0.004995611496269703, -0.0016682245768606663, 0.0, 0.0015510577941313386, 0.005062474869191647, 0.008417150937020779, 0.011741090565919876, 0.015184164978563786, 0.018582714721560478, 0.02204744517803192, 0.025471193715929985, 0.02889077737927437, 0.0323684960603714, 0.03579240292310715, 0.039281025528907776, 0.0427563451230526, 0.04619763046503067, 0.04968220740556717, 0.05326594039797783, 0.05679265409708023, 0.060245808213949203, 0.06372645497322083, 0.06721872836351395, 0.0706876739859581, 0.0742349922657013, 0.07774098962545395, 0.08123527467250824, 0.08468879014253616, 0.08810535818338394, 0.09155989438295364, 0.09498448669910431, 0.0985206812620163, 0.10206405073404312, 0.10563778132200241, 0.10921968519687653, 0.11284469068050385, 0.11653254181146622, 0.12008969485759735, 0.12368203699588776, 0.1272617131471634, 0.13089501857757568, 0.134552001953125, 0.1382799744606018, 0.14194637537002563, 0.14563234150409698, 0.14930322766304016, 0.15303383767604828, 0.1567956507205963, 0.16050070524215698, 0.16431072354316711, 0.16813558340072632, 0.17204202711582184, 0.1758781224489212, 0.17973239719867706, 0.1836014688014984, 0.18753431737422943, 0.19138391315937042, 0.19535475969314575, 0.19931404292583466, 0.20333819091320038, 0.20738255977630615, 0.21152682602405548, 0.21568812429904938, 0.21978361904621124, 0.22393859922885895, 0.22814159095287323, 0.23241068422794342, 0.23675410449504852, 0.24123944342136383, 0.24569889903068542, 0.2500703036785126, 0.25904011726379395, 0.26349544525146484, 0.2682226300239563, 0.272907555103302, 0.2774306833744049, 0.28220856189727783, 0.2869136929512024, 0.2916390895843506, 0.29649388790130615, 0.30142995715141296, 0.3065022826194763, 0.3114383816719055, 0.31648796796798706, 0.3216581642627716, 0.32700115442276, 0.3322487473487854, 0.33778008818626404, 0.3431521952152252, 0.3487405776977539, 0.3543166518211365, 0.3601346015930176, 0.36605337262153625, 0.37217751145362854, 0.378179669380188, 0.3843980133533478, 0.3906566798686981, 0.39714935421943665, 0.40357843041419983, 0.4104187488555908, 0.4171563684940338, 0.42418959736824036, 0.43136918544769287, 0.4389212429523468, 0.44673123955726624, 0.45457619428634644, 0.4627031683921814, 0.47130417823791504, 0.4798591434955597, 0.48897242546081543, 0.4979848861694336, 0.5, 0.5076631307601929, 0.5177803635597229, 0.5282770991325378, 0.5392990112304688, 0.5506287813186646, 0.5632893443107605, 0.5764452815055847, 0.5903191566467285, 0.6051878333091736, 0.6209936141967773, 0.6382884979248047, 0.6573970913887024, 0.6795773506164551, 0.7037051916122437, 0.7327037453651428, 0.7677436470985413, 0.8111193776130676, 0.875165581703186, 1.0]
@@ -138,7 +138,7 @@ def estimate_quantiles(A: Tensor, out: Tensor=None, offset: float=1/512) -> Tens
     elif A.dtype == torch.float16:
         lib.cestimate_quantiles_fp16(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel()))
     else:
-        raise NotImplementError(f'Not supported data type {A.dtype}')
+        raise NotImplementedError(f'Not supported data type {A.dtype}')
     return out

 def quantize_blockwise(A: Tensor, code: Tensor=None, absmax: Tensor=None, rand=None, out: Tensor=None) -> Tensor:
@@ -384,7 +384,7 @@ def optimizer_update_32bit(optimizer_name:str, g: Tensor, p: Tensor, state1: Ten
     param_norm = torch.norm(p.data.float())

     if optimizer_name not in str2optimizer32bit:
-        raise NotImplementError(f'Optimizer not implemented: {optimizer_name}. Choices: {",".join(str2optimizer32bit.keys())}')
+        raise NotImplementedError(f'Optimizer not implemented: {optimizer_name}. Choices: {",".join(str2optimizer32bit.keys())}')

     if g.dtype == torch.float32 and state1.dtype == torch.float32:
         str2optimizer32bit[optimizer_name][0](get_ptr(g), get_ptr(p), get_ptr(state1), get_ptr(state2), get_ptr(unorm_vec), ct.c_float(max_unorm),
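With the dispatch tables indented under the guard, a CPU-only build simply never defines them, so callers are expected to check the flag before touching CUDA-only entry points. A hedged sketch (assumes the module layout introduced in this commit):

    from bitsandbytes import functional as F

    if F.COMPILED_WITH_CUDA:
        # each entry maps an optimizer name to its (fp32, fp16) C functions
        adam_fp32, adam_fp16 = F.str2optimizer32bit['adam']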
bitsandbytes/optim/__init__.py

@@ -2,11 +2,15 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

-from .adam import Adam, Adam8bit, Adam32bit
-from .adamw import AdamW, AdamW8bit, AdamW32bit
-from .sgd import SGD, SGD8bit, SGD32bit
-from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
-from .lamb import LAMB, LAMB8bit, LAMB32bit
-from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
-from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
-from .optimizer import GlobalOptimManager
+from bitsandbytes.cextension import COMPILED_WITH_CUDA
+
+if COMPILED_WITH_CUDA:
+    from .adam import Adam, Adam8bit, Adam32bit
+    from .adamw import AdamW, AdamW8bit, AdamW32bit
+    from .sgd import SGD, SGD8bit, SGD32bit
+    from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
+    from .lamb import LAMB, LAMB8bit, LAMB32bit
+    from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
+    from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
+    from .optimizer import GlobalOptimManager
bitsandbytes/optim/rmsprop.py

@@ -31,6 +31,6 @@ class RMSprop32bit(Optimizer1State):
         if alpha == 0:
             raise NotImplementedError(f'RMSprop with alpha==0.0 is not supported!')
         if centered:
-            raise NotImplementError(f'Centered RMSprop is not supported!')
+            raise NotImplementedError(f'Centered RMSprop is not supported!')
         super(RMSprop32bit, self).__init__('rmsprop', params, lr, (alpha, momentum), eps,
                 weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise)
csrc/common.cpp (new file)

+#include <common.h>
+#include <float.h>
+
+void *quantize_block(void *arguments) {
+    // 1. find absmax in block
+    // 2. divide input value by absmax to normalize into [-1.0, 1.0]
+    // 3. do binary search to find the closest value
+    // 4. check minimal distance
+    // 5. store index
+    struct quantize_block_args *args = (quantize_block_args *) arguments;
+
+    // 1. find absmax in block
+    float absmax_block = -FLT_MAX;
+    for (int i = args->block_idx; i < args->block_end; i++)
+        absmax_block = fmax(absmax_block, fabs(args->A[i]));
+    args->absmax[args->block_idx / BLOCK_SIZE] = absmax_block;
+
+    for (int i = args->block_idx; i < args->block_end; i++) {
+        // 2. divide input value by absmax to normalize into [-1.0, 1.0]
+        // 3. do binary search to find the closest value
+        float normed_value = args->A[i] / absmax_block;
+        int idx = args->bin_searcher->scalar(normed_value);
+
+        // 4. check minimal distance
+        // the binary search always returns the value to the left, which might not be the closest value
+        if (idx < 255) {
+            float dist_left = fabs(normed_value - (args->code[idx]));
+            float dist_right = fabs(normed_value - (args->code[idx + 1]));
+            if (dist_right < dist_left) { idx += 1; }
+        }
+
+        // 5. store index
+        args->out[i] = (unsigned char) idx;
+    }
+
+    return NULL;
+}
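For reference, the five steps above amount to per-block absmax scaling followed by a nearest-neighbour lookup in a sorted 256-entry code book. A NumPy sketch of the same logic (illustrative only, not the library API; assumes code is a sorted 256-entry float32 array, as quantize_cpu arranges, and tie-breaking may differ at the margin):

    import numpy as np

    BLOCK_SIZE = 4096

    def quantize_blockwise_ref(code, A):
        n = A.size
        absmax = np.empty((n + BLOCK_SIZE - 1) // BLOCK_SIZE, dtype=np.float32)
        out = np.empty(n, dtype=np.uint8)
        for b in range(0, n, BLOCK_SIZE):
            end = min(b + BLOCK_SIZE, n)
            absmax[b // BLOCK_SIZE] = np.abs(A[b:end]).max()       # step 1
            normed = A[b:end] / absmax[b // BLOCK_SIZE]            # step 2
            idx = np.searchsorted(code, normed)                    # step 3
            idx = np.clip(idx, 1, 255)
            # step 4: searchsorted gives an insertion point; pick the closer neighbour
            take_left = np.abs(normed - code[idx - 1]) <= np.abs(normed - code[idx])
            out[b:end] = np.where(take_left, idx - 1, idx)         # step 5
        return out, absmax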
csrc/common.h (new file)

+#include <BinSearch.h>
+
+#ifndef common
+#define common
+
+using namespace BinSearch;
+
+struct quantize_block_args {
+    BinAlgo<Scalar, float, Direct2> *bin_searcher;
+    float *code;
+    float *A;
+    float *absmax;
+    unsigned char *out;
+    int block_end;
+    int block_idx;
+    int threadidx;
+};
+
+#define BLOCK_SIZE 4096
+
+void *quantize_block(void *arguments);
+
+#endif
\ No newline at end of file
csrc/cpu_ops.cpp (new file)

+#include <BinSearch.h>
+#include <pthread.h>
+#include <common.h>
+
+using namespace BinSearch;
+
+void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n) {
+    for (int block_idx = 0; block_idx < n; block_idx += BLOCK_SIZE) {
+        int valid_items = n - block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
+        int block_end = block_idx + valid_items;
+        for (int i = block_idx; i < block_end; i++)
+            out[i] = code[A[i]] * absmax[block_idx / BLOCK_SIZE];
+    }
+}
+
+void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n) {
+    // the default code has range [-0.993, 1.0], which can cause an error in the binary search algorithm used below
+    code[0] = -1.0f;
+
+    int num_blocks = n / BLOCK_SIZE;
+    num_blocks += n % BLOCK_SIZE == 0 ? 0 : 1;
+
+    pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * num_blocks);
+    struct quantize_block_args **args = (quantize_block_args **) malloc(num_blocks * sizeof(quantize_block_args *));
+    for (int i = 0; i < num_blocks; i++)
+        args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args));
+
+    const uint32 elements_code = 256;
+    BinAlgo<Scalar, float, Direct2> bin_searcher(code, elements_code);
+
+    for (int block_idx = 0; block_idx < n; block_idx += BLOCK_SIZE) {
+        int valid_items = n - block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
+        int block_end = block_idx + valid_items;
+
+        struct quantize_block_args *arg = args[block_idx / BLOCK_SIZE];
+        arg->bin_searcher = &bin_searcher;
+        arg->code = code;
+        arg->A = A;
+        arg->absmax = absmax;
+        arg->out = out;
+        arg->block_end = block_end;
+        arg->block_idx = block_idx;
+        arg->threadidx = block_idx / BLOCK_SIZE;
+
+        pthread_create(&threads[block_idx / BLOCK_SIZE], NULL, &quantize_block, (void *) arg);
+    }
+
+    for (int i = 0; i < num_blocks; i++)
+        pthread_join(threads[i], NULL);
+
+    free(threads);
+    for (int i = 0; i < num_blocks; i++)
+        free(args[i]);
+    free(args);
+}
\ No newline at end of file
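quantize_cpu spawns one pthread per 4096-element block, so an n-element tensor uses ceil(n / 4096) worker threads, each running quantize_block independently; dequantization needs no search and stays single-threaded. A companion NumPy sketch of dequantize_cpu, following the quantize reference above (again illustrative, not the library API):

    def dequantize_blockwise_ref(code, q, absmax):
        out = np.empty(q.size, dtype=np.float32)
        for b in range(0, q.size, BLOCK_SIZE):
            end = min(b + BLOCK_SIZE, q.size)
            out[b:end] = code[q[b:end]] * absmax[b // BLOCK_SIZE]  # table lookup, then rescale
        return out

Round-tripping an array through the two sketches, dequantize_blockwise_ref(code, *quantize_blockwise_ref(code, A)), recovers A up to the resolution of the 256-entry code book.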
csrc/cpu_ops.h (new file)

+#ifndef BITSANDBYTES_CPU_OPS_H
+#define BITSANDBYTES_CPU_OPS_H
+
+void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n);
+void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n);
+
+#endif
csrc/ops.cu

@@ -8,251 +8,141 @@
 #include <cub/device/device_scan.cuh>
 #include <limits>
 #include <BinSearch.h>
+#include <common.h>

 using namespace BinSearch;
 using std::cout;
 using std::endl;

-#define BLOCK_SIZE 4096
-
-struct quantize_block_args
-{
-  BinAlgo<Scalar, float, Direct2> *bin_searcher;
-  float *code;
-  float *A;
-  float *absmax;
-  unsigned char *out;
-  int block_end;
-  int block_idx;
-  int threadidx;
-};
-
-void *quantize_block(void *arguments)
-{
-  // 1. find absmax in block
-  // 2. divide input value by absmax to normalize into [-1.0, 1.0]
-  // 3. do binary search to find the closest value
-  // 4. check minimal distance
-  // 5. store index
-
-  struct quantize_block_args *args = (quantize_block_args*)arguments;
-
-  // 1. find absmax in block
-  float absmax_block = -FLT_MAX;
-  for (int i = args->block_idx; i < args->block_end; i++)
-    absmax_block = fmax(absmax_block, fabs(args->A[i]));
-  args->absmax[args->block_idx/BLOCK_SIZE] = absmax_block;
-
-  for (int i = args->block_idx; i < args->block_end; i++)
-  {
-    // 2. divide input value by absmax to normalize into [-1.0, 1.0]
-    // 3. do binary search to find the closest value
-    float normed_value = args->A[i]/absmax_block;
-    int idx = args->bin_searcher->scalar(normed_value);
-
-    // 4. check minimal distance
-    // The binary search returns always the value to the left, which might not be the closest value
-    if(idx < 255)
-    {
-      float dist_left = fabs(normed_value-(args->code[idx]));
-      float dist_right = fabs(normed_value-(args->code[idx+1]));
-      if(dist_right < dist_left){ idx+=1; }
-    }
-
-    // 5. store index
-    args->out[i] = (unsigned char)idx;
-  }
-
-  return NULL;
-}
-
-void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n)
-{
-  // the default code is has range [-0.993, 1.0] which can cause an error in the binary search algorithm used below
-  code[0] = -1.0f;
-
-  int num_blocks = n/BLOCK_SIZE;
-  num_blocks += n % BLOCK_SIZE == 0 ? 0 : 1;
-
-  pthread_t *threads = (pthread_t*)malloc(sizeof(pthread_t)*num_blocks);
-  struct quantize_block_args **args = (quantize_block_args**)malloc(num_blocks*sizeof(quantize_block_args*));
-  for(int i = 0; i < num_blocks; i++)
-    args[i] = (quantize_block_args*)malloc(sizeof(quantize_block_args));
-
-  const uint32 elements_code = 256;
-  BinAlgo<Scalar, float, Direct2> bin_searcher(code, elements_code);
-
-  for(int block_idx = 0; block_idx < n; block_idx+=BLOCK_SIZE)
-  {
-    int valid_items = n-block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
-    int block_end = block_idx + valid_items;
-
-    struct quantize_block_args *arg = args[block_idx/BLOCK_SIZE];
-    arg->bin_searcher = &bin_searcher;
-    arg->code = code;
-    arg->A = A;
-    arg->absmax = absmax;
-    arg->out = out;
-    arg->block_end = block_end;
-    arg->block_idx = block_idx;
-    arg->threadidx = block_idx/BLOCK_SIZE;
-
-    pthread_create(&threads[block_idx/BLOCK_SIZE], NULL, &quantize_block, (void *)arg);
-  }
-
-  for(int i = 0; i < num_blocks; i++)
-    int err = pthread_join(threads[i], NULL);
-
-  free(threads);
-  for(int i = 0; i < num_blocks; i++)
-    free(args[i]);
-  free(args);
-}
-
-void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n)
-{
-  for(int block_idx = 0; block_idx < n; block_idx+=BLOCK_SIZE)
-  {
-    int valid_items = n-block_idx >= BLOCK_SIZE ? BLOCK_SIZE : n - block_idx;
-    int block_end = block_idx + valid_items;
-    for (int i = block_idx; i < block_end; i++)
-      out[i] = code[A[i]]*absmax[block_idx/BLOCK_SIZE];
-  }
-}
-
-void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
-{
-  int threads = 512;
-  int blocks = n/threads;
-  blocks = n % threads == 0 ? blocks : blocks + 1;
-  kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *src, int maxidx1, int n) {
+    int threads = 512;
+    int blocks = n / threads;
+    blocks = n % threads == 0 ? blocks : blocks + 1;
+    kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}

-template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
-{
-  int blocks = n/4096;
-  blocks = n % 4096 == 0 ? blocks : blocks + 1;
-  CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
-  kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+template<typename T>
+void estimateQuantiles(T *A, float *code, float offset, int n) {
+    int blocks = n / 4096;
+    blocks = n % 4096 == 0 ? blocks : blocks + 1;
+    CUDA_CHECK_RETURN(cudaMemset(code, 0, 256 * sizeof(float)));
+    kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}

-void quantize(float *code, float *A, unsigned char *out, int n)
-{
-  int blocks = n/1024;
-  blocks = n % 1024 == 0 ? blocks : blocks + 1;
-  kQuantize<<<blocks, 1024>>>(code, A, out, n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+void quantize(float *code, float *A, unsigned char *out, int n) {
+    int blocks = n / 1024;
+    blocks = n % 1024 == 0 ? blocks : blocks + 1;
+    kQuantize<<<blocks, 1024>>>(code, A, out, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}

-void dequantize(float *code, unsigned char *A, float *out, int n)
-{
-  int blocks = n/1024;
-  blocks = n % 1024 == 0 ? blocks : blocks + 1;
-  kDequantize<<<blocks, 1024>>>(code, A, out, n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+void dequantize(float *code, unsigned char *A, float *out, int n) {
+    int blocks = n / 1024;
+    blocks = n % 1024 == 0 ? blocks : blocks + 1;
+    kDequantize<<<blocks, 1024>>>(code, A, out, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}

-template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n)
-{
-  int blocks = n/4096;
-  blocks = n % 4096 == 0 ? blocks : blocks + 1;
-  kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+template<typename T, int STOCHASTIC>
+void quantizeBlockwise(float *code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n) {
+    int blocks = n / 4096;
+    blocks = n % 4096 == 0 ? blocks : blocks + 1;
+    kQuantizeBlockwise<T, 4096, 4, STOCHASTIC><<<blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}

-template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n)
-{
-  int blocks = n/blocksize;
-  blocks = n % blocksize == 0 ? blocks : blocks + 1;
-  if(blocksize == 4096)
-    kDequantizeBlockwise<T, 4096, 1024, 4><<<blocks, 4096/4>>>(code, A, absmax, out, n);
-  else if(blocksize == 2048)
-    kDequantizeBlockwise<T, 2048, 512, 4><<<blocks, 2048/4>>>(code, A, absmax, out, n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+template<typename T>
+void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n) {
+    int blocks = n / blocksize;
+    blocks = n % blocksize == 0 ? blocks : blocks + 1;
+    if (blocksize == 4096)
+        kDequantizeBlockwise<T, 4096, 1024, 4><<<blocks, 4096 / 4>>>(code, A, absmax, out, n);
+    else if (blocksize == 2048)
+        kDequantizeBlockwise<T, 2048, 512, 4><<<blocks, 2048 / 4>>>(code, A, absmax, out, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}

-template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
-                float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
-                const float beta1, const float beta2, const float eps, const float weight_decay,
-                const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n)
-{
-  int blocks = n/4096;
-  blocks = n % 4096 == 0 ? blocks : blocks + 1;
-  switch(OPTIMIZER)
-  {
-    case ADAM:
-      if(max_unorm > 0.0f)
-      {
-        CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float)));
-        kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
-        CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      }
-      kOptimizer32bit2State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      break;
-    case MOMENTUM:
-    case RMSPROP:
-    case ADAGRAD:
-      if(max_unorm > 0.0f)
-      {
-        CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float)));
-        kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, unorm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
-        CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      }
-      kOptimizer32bit1State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, unorm, max_unorm, param_norm, beta1, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      break;
-  }
-}
+template<typename T, int OPTIMIZER>
+void optimizer32bit(T *g, T *p,
+                    float *state1, float *state2, float *unorm, float max_unorm, float param_norm,
+                    const float beta1, const float beta2, const float eps, const float weight_decay,
+                    const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) {
+    int blocks = n / 4096;
+    blocks = n % 4096 == 0 ? blocks : blocks + 1;
+    switch (OPTIMIZER) {
+        case ADAM:
+            if (max_unorm > 0.0f) {
+                CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1 * sizeof(float)));
+                kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
+                CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            }
+            kOptimizer32bit2State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            break;
+        case MOMENTUM:
+        case RMSPROP:
+        case ADAGRAD:
+            if (max_unorm > 0.0f) {
+                CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1 * sizeof(float)));
+                kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8><<<blocks, 512>>>(g, p, state1, unorm, beta1, eps, weight_decay, step, lr, gnorm_scale, n);
+                CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            }
+            kOptimizer32bit1State<T, OPTIMIZER><<<blocks, 1024>>>(g, p, state1, unorm, max_unorm, param_norm, beta1, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            break;
+    }
+}

-template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
-                unsigned char* state1, unsigned char* state2,
-                float *unorm, float max_unorm, float param_norm,
-                float beta1, float beta2,
-                float eps, int step, float lr,
-                float* quantiles1, float* quantiles2,
-                float* max1, float* max2, float* new_max1, float* new_max2,
-                float weight_decay,
-                const float gnorm_scale, int n)
-{
-  int blocks = n/4096;
-  blocks = n % 4096 == 0 ? blocks : blocks + 1;
-
-  if(max_unorm > 0.0f){ CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1*sizeof(float))); }
-
-  switch(OPTIMIZER)
-  {
-    case ADAM:
-      CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1*sizeof(float)));
-      CUDA_CHECK_RETURN(cudaMemset(new_max2, 0, 1*sizeof(float)));
-      kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      kOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
-          quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      break;
-    case MOMENTUM:
-    case RMSPROP:
-    case ADAGRAD:
-      CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1*sizeof(float)));
-      kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, unorm, beta1, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      kOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, unorm, max_unorm, param_norm, beta1, eps, step, lr,
-          quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      break;
-    default:
-      break;
-  }
-}
+template<typename T, int OPTIMIZER>
+void optimizerStatic8bit(T *p, T *g,
+                         unsigned char *state1, unsigned char *state2,
+                         float *unorm, float max_unorm, float param_norm,
+                         float beta1, float beta2,
+                         float eps, int step, float lr,
+                         float *quantiles1, float *quantiles2,
+                         float *max1, float *max2, float *new_max1, float *new_max2,
+                         float weight_decay,
+                         const float gnorm_scale, int n) {
+    int blocks = n / 4096;
+    blocks = n % 4096 == 0 ? blocks : blocks + 1;
+
+    if (max_unorm > 0.0f) { CUDA_CHECK_RETURN(cudaMemset(unorm, 0, 1 * sizeof(float))); }
+
+    switch (OPTIMIZER) {
+        case ADAM:
+            CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1 * sizeof(float)));
+            CUDA_CHECK_RETURN(cudaMemset(new_max2, 0, 1 * sizeof(float)));
+            kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            kOptimizerStatic8bit2State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
+                                                                       quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            break;
+        case MOMENTUM:
+        case RMSPROP:
+        case ADAGRAD:
+            CUDA_CHECK_RETURN(cudaMemset(new_max1, 0, 1 * sizeof(float)));
+            kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 256>>>(p, g, state1, unorm, beta1, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            kOptimizerStatic8bit1State<T, OPTIMIZER><<<blocks, 1024>>>(p, g, state1, unorm, max_unorm, param_norm, beta1, eps, step, lr,
+                                                                       quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            break;
+        default:
+            break;
+    }
+}
@@ -260,42 +150,43 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
 #define BLOCKSIZE_2STATE 2048
 #define NUM_2STATE 8
 #define BLOCKSIZE_1STATE 2048
 #define NUM_1STATE 8

-template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
-                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)
-{
-  int blocks = 0;
-  switch(OPTIMIZER)
-  {
-    case ADAM:
-      blocks = n/BLOCKSIZE_2STATE;
-      blocks = n % BLOCKSIZE_2STATE == 0 ? blocks : blocks + 1;
-      kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE><<<blocks, BLOCKSIZE_2STATE/NUM_2STATE>>>(p, g, state1, state2, beta1, beta2, eps, step, lr,
-          quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      break;
-    case MOMENTUM:
-    case RMSPROP:
-    case ADAGRAD:
-      blocks = n/BLOCKSIZE_1STATE;
-      blocks = n % BLOCKSIZE_1STATE == 0 ? blocks : blocks + 1;
-      kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE><<<blocks, BLOCKSIZE_1STATE/NUM_1STATE>>>(p, g, state1, beta1, beta2, eps, step, lr,
-          quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
-      CUDA_CHECK_RETURN(cudaPeekAtLastError());
-      break;
-  }
-}
+template<typename T, int OPTIMIZER>
+void optimizerStatic8bitBlockwise(T *p, T *g,
+                                  unsigned char *state1, unsigned char *state2, float beta1, float beta2, float eps, int step, float lr,
+                                  float *quantiles1, float *quantiles2, float *absmax1, float *absmax2, float weight_decay,
+                                  const float gnorm_scale, bool skip_zeros, int n) {
+
+    int blocks = 0;
+    switch (OPTIMIZER) {
+        case ADAM:
+            blocks = n / BLOCKSIZE_2STATE;
+            blocks = n % BLOCKSIZE_2STATE == 0 ? blocks : blocks + 1;
+            kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE><<<blocks, BLOCKSIZE_2STATE / NUM_2STATE>>>(p, g, state1, state2, beta1, beta2, eps, step, lr,
+                    quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            break;
+        case MOMENTUM:
+        case RMSPROP:
+        case ADAGRAD:
+            blocks = n / BLOCKSIZE_1STATE;
+            blocks = n % BLOCKSIZE_1STATE == 0 ? blocks : blocks + 1;
+            kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE><<<blocks, BLOCKSIZE_1STATE / NUM_1STATE>>>(p, g, state1, beta1, beta2, eps, step, lr,
+                    quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
+            CUDA_CHECK_RETURN(cudaPeekAtLastError());
+            break;
+    }
+}

-template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n)
-{
-  int blocks = n/2048;
-  blocks = n % 2048 == 0 ? blocks : blocks + 1;
-  CUDA_CHECK_RETURN(cudaMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
-  kPercentileClipping<T, 2048, 4><<<blocks, 512>>>(g, gnorm_vec, step, n);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
+template<typename T>
+void percentileClipping(T *g, float *gnorm_vec, int step, const int n) {
+    int blocks = n / 2048;
+    blocks = n % 2048 == 0 ? blocks : blocks + 1;
+    CUDA_CHECK_RETURN(cudaMemset(&gnorm_vec[step % 100], 0, 1 * sizeof(float)));
+    kPercentileClipping<T, 2048, 4><<<blocks, 512>>>(g, gnorm_vec, step, n);
+    CUDA_CHECK_RETURN(cudaPeekAtLastError());
+}
@@ -304,13 +195,23 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
 //==============================================================

 template void estimateQuantiles(half *A, float *code, float offset, int n);
 template void estimateQuantiles(float *A, float *code, float offset, int n);

-template void quantizeBlockwise<half, 0>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
-template void quantizeBlockwise<float, 0>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
-template void quantizeBlockwise<half, 1>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
-template void quantizeBlockwise<float, 1>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, const int n);
+template void
+quantizeBlockwise<half, 0>(float *code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
+
+template void
+quantizeBlockwise<float, 0>(float *code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
+
+template void
+quantizeBlockwise<half, 1>(float *code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
+
+template void
+quantizeBlockwise<float, 1>(float *code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n);
+
 template void dequantizeBlockwise<half>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n);
 template void dequantizeBlockwise<float>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n);
@@ -320,12 +221,19 @@ template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
 #define MAKE_optimizer32bit(name, gtype) \
     const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);

 MAKE_optimizer32bit(ADAM, half)
 MAKE_optimizer32bit(ADAM, float)
+
 MAKE_optimizer32bit(MOMENTUM, half)
 MAKE_optimizer32bit(MOMENTUM, float)
+
 MAKE_optimizer32bit(RMSPROP, half)
 MAKE_optimizer32bit(RMSPROP, float)
+
 MAKE_optimizer32bit(ADAGRAD, half)
 MAKE_optimizer32bit(ADAGRAD, float)
@@ -338,11 +246,17 @@ template void optimizerStatic8bit<gtype, name>(gtype* p, gtype* g, unsigned char
 #define MAKE_optimizerStatic8bit(name, gtype) \
     float weight_decay, \
     const float gnorm_scale, int n); \

 MAKE_optimizerStatic8bit(ADAM, half)
 MAKE_optimizerStatic8bit(ADAM, float)
+
 MAKE_optimizerStatic8bit(MOMENTUM, half)
 MAKE_optimizerStatic8bit(MOMENTUM, float)
+
 MAKE_optimizerStatic8bit(RMSPROP, half)
 MAKE_optimizerStatic8bit(RMSPROP, float)
@@ -350,14 +264,23 @@ template void optimizerStatic8bitBlockwise<gtype, optim_name>(gtype* p, gtype* g
 #define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name) \
     unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
     float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n); \

 MAKE_optimizerStatic8bitBlockwise(half, ADAM);
 MAKE_optimizerStatic8bitBlockwise(float, ADAM);
+
 MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
 MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
+
 MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
 MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
+
 MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
 MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);

-template void percentileClipping(float * g, float *gnorm_vec, int step, const int n);
-template void percentileClipping(half * g, float *gnorm_vec, int step, const int n);
+template void percentileClipping(float *g, float *gnorm_vec, int step, const int n);
+
+template void percentileClipping(half *g, float *gnorm_vec, int step, const int n);
csrc/ops.cuh

@@ -68,16 +68,6 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g
 template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);

-void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, int n);
-void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, int n);
-
 void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);

 #endif
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
// This source code is licensed under the MIT license found in the // This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree. // LICENSE file in the root directory of this source tree.
#if BUILD_CUDA
#include <ops.cuh> #include <ops.cuh>
#endif
#include <cpu_ops.h>
// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to // We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to
...@@ -12,6 +15,7 @@ ...@@ -12,6 +15,7 @@
// UNMANGLED CALLS // UNMANGLED CALLS
//=================================================================================== //===================================================================================
#if BUILD_CUDA
void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); } void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles<float>(A, code, offset, n); }
void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); } void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles<half>(A, code, offset, n); }
...@@ -34,15 +38,15 @@ MAKE_FUNC32(adagrad, ADAGRAD, half, 16) ...@@ -34,15 +38,15 @@ MAKE_FUNC32(adagrad, ADAGRAD, half, 16)
#define MAKE_FUNC8(fname, oname, gtype, gbits) \ #define MAKE_FUNC8(fname, oname, gtype, gbits) \
void fname##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \ void fname##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
float *unorm, float max_unorm, float param_norm, \ float *unorm, float max_unorm, float param_norm, \
float beta1, float beta2, \ float beta1, float beta2, \
float eps, int step, float lr, \ float eps, int step, float lr, \
float* quantiles1, float* quantiles2, \ float* quantiles1, float* quantiles2, \
float* max1, float* max2, float* new_max1, float* new_max2, \ float* max1, float* max2, float* new_max1, float* new_max2, \
float weight_decay, float gnorm_scale, int n) \ float weight_decay, float gnorm_scale, int n) \
{ \ { \
optimizerStatic8bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \ optimizerStatic8bit<gtype, oname>(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \ quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
} \ } \
MAKE_FUNC8(adam, ADAM, float, 32) MAKE_FUNC8(adam, ADAM, float, 32)
...@@ -78,39 +82,41 @@ void quantizeBlockwise_stochastic_fp32(float * code, float *A, float *absmax, un
void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise<half>(code, A, absmax, out, blocksize, n); } \
void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise<float>(code, A, absmax, out, blocksize, n); }
#endif
extern "C" extern "C"
{ {
void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); } #if BUILD_CUDA
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); } void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); } void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); } void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, n); } void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); }
void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, n); } void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, n); }
void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); } void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, n); }
void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); } void cquantize_blockwise_stochastic_fp16(float * code, half *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp16(code, A, absmax, out, rand, rand_offset, n); }
void cquantize_blockwise_stochastic_fp32(float * code, float *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n){ quantizeBlockwise_stochastic_fp32(code, A, absmax, out, rand, rand_offset, n); }
void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); }
void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); }
#define MAKE_CFUNC32(name, gtype, gbits) \
void c##name##32bit_g##gbits(gtype *g, gtype *p, \
               float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \
               const float beta1, const float beta2, const float eps, const float weight_decay, \
               const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) \
{ name##32bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \
	MAKE_CFUNC32(adam, float, 32)
	MAKE_CFUNC32(adam, half, 16)
	MAKE_CFUNC32(momentum, float, 32)
	MAKE_CFUNC32(momentum, half, 16)
	MAKE_CFUNC32(rmsprop, float, 32)
	MAKE_CFUNC32(rmsprop, half, 16)
	MAKE_CFUNC32(adagrad, float, 32)
	MAKE_CFUNC32(adagrad, half, 16)
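To make the macro machinery concrete, the first invocation above, MAKE_CFUNC32(adam, float, 32), expands to roughly the following (whitespace added; the adam32bit_g32 wrapper it forwards to is produced by MAKE_FUNC32 earlier in this file):

// Approximate expansion of MAKE_CFUNC32(adam, float, 32):
void cadam32bit_g32(float *g, float *p,
                    float *state1, float *state2, float *unorm, float max_unorm, float param_norm,
                    const float beta1, const float beta2, const float eps, const float weight_decay,
                    const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n)
{ adam32bit_g32(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); }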
#define MAKE_CFUNC8(name, gtype, gbits) \
void c##name##_static_8bit_g##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
               float *unorm, float max_unorm, float param_norm, \
               float beta1, float beta2, \
               float eps, int step, float lr, \
...@@ -118,40 +124,40 @@ extern "C"
               float* max1, float* max2, float* new_max1, float* new_max2, \
               float weight_decay, float gnorm_scale, int n) \
{ \
    name##_static_8bit_g##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \
                quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \
} \
	MAKE_CFUNC8(adam, float, 32)
	MAKE_CFUNC8(adam, half, 16)
	MAKE_CFUNC8(momentum, float, 32)
	MAKE_CFUNC8(momentum, half, 16)
	MAKE_CFUNC8(rmsprop, float, 32)
	MAKE_CFUNC8(rmsprop, half, 16)
#define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \
void c##fname##_8bit_blockwise_fp##gbits(gtype* p, gtype* g, \
               unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \
               float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n) \
{ fname##_8bit_blockwise_fp##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); } \
	MAKE_CBLOCKWISE8(adam, ADAM, half, 16)
	MAKE_CBLOCKWISE8(adam, ADAM, float, 32)
	MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, 16)
	MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, 32)
	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, 16)
	MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, 32)
	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, 16)
	MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, 32)
	void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); }
	void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); }
	void chistogram_scatter_add_2d(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n){ histogramScatterAdd2D(histogram, index1, index2, src, maxidx1, n); }
#endif

	void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, const int n){ quantize_cpu(code, A, absmax, out, n); }
	void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, const int n){ dequantize_cpu(code, A, absmax, out, n); }
}
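Because everything inside extern "C" is exported unmangled, the resulting libbitsandbytes.so can be loaded by symbol name alone. A hedged sketch of exercising the new CPU path via dlopen — the library path and the toy codebook are illustrative, and this is not how the bitsandbytes Python package itself binds the library:

// Illustrative loader; compile with: g++ demo.cpp -ldl
#include <dlfcn.h>
#include <cstdio>

// Matches cquantize_blockwise_cpu_fp32 declared in the extern "C" block above.
typedef void (*quantize_cpu_t)(float*, float*, float*, unsigned char*, int);

int main()
{
    // Assumed build output location; adjust as needed.
    void *lib = dlopen("./bitsandbytes/libbitsandbytes.so", RTLD_NOW);
    if (!lib) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }

    quantize_cpu_t q = (quantize_cpu_t)dlsym(lib, "cquantize_blockwise_cpu_fp32");
    if (!q) { std::fprintf(stderr, "symbol not found\n"); dlclose(lib); return 1; }

    float codebook[256], A[8], absmax[1];
    unsigned char out[8];
    for (int i = 0; i < 256; i++) codebook[i] = -1.0f + 2.0f * i / 255.0f; // toy uniform codebook
    for (int i = 0; i < 8; i++) A[i] = 0.1f * i;

    q(codebook, A, absmax, out, 8);
    std::printf("absmax=%f first code=%u\n", absmax[0], (unsigned)out[0]);

    dlclose(lib);
    return 0;
}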
...@@ -6,27 +6,27 @@ import os
from setuptools import setup, find_packages

def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()
version = os.getenv("CUDA_VERSION", "cpu")
setup(
    name="bitsandbytes",
    version=f"0.26.0+{version}",
    author="Tim Dettmers",
    author_email="dettmers@cs.washington.edu",
    description="8-bit optimizers and quantization routines.",
    license="MIT",
    keywords="gpu optimizers optimization 8-bit quantization compression",
    url="http://packages.python.org/bitsandbytes",
    packages=find_packages(),
    package_data={'': ['libbitsandbytes.so']},
    long_description=read('README.md'),
    long_description_content_type='text/markdown',
    classifiers=[
        "Development Status :: 4 - Beta",
        'Topic :: Scientific/Engineering :: Artificial Intelligence'
    ],
)
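Net effect of the setup.py change: instead of a separate package name per CUDA toolkit (the old f"bitsandbytes-cuda{...}" naming), the project keeps a single name and encodes the build variant as a PEP 440 local version label — 0.26.0+cpu when CUDA_VERSION is unset, or e.g. 0.26.0+111 when CUDA_VERSION=111 is exported at build time.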