Commit 0fb378b4 authored by Tim Dettmers's avatar Tim Dettmers
Browse files

Added compilation from source instructions; easier compilation.

parent d2f16726
...@@ -2,7 +2,11 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) ...@@ -2,7 +2,11 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
GPP:= /usr/bin/g++ GPP:= /usr/bin/g++
# Locate the CUDA toolkit root.
# A non-empty CUDA_HOME supplied by the caller (environment or command line)
# is used as-is. Otherwise it is derived from the nvcc found on PATH by
# dropping the last two path components (".../bin/nvcc" -> "...").
ifeq ($(strip $(CUDA_HOME)),)
CUDA_HOME := $(shell which nvcc 2>/dev/null | rev | cut -d'/' -f3- | rev)
# Without this check a missing nvcc leaves CUDA_HOME empty and NVCC silently
# becomes "/bin/nvcc", which only fails later with a confusing message.
ifeq ($(strip $(CUDA_HOME)),)
$(warning CUDA_HOME is not set and nvcc was not found on PATH. Set CUDA_HOME to the CUDA root directory - for example /usr/local/cuda)
endif
endif
NVCC := $(CUDA_HOME)/bin/nvcc NVCC := $(CUDA_HOME)/bin/nvcc
########################################### ###########################################
CSRC := $(ROOT_DIR)/csrc CSRC := $(ROOT_DIR)/csrc
...@@ -15,58 +19,66 @@ INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu ...@@ -15,58 +19,66 @@ INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -lcurand -lcusparse -L $(CONDA_PREFIX)/lib LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
# NVIDIA NVCC compilation flags # NVIDIA NVCC compilation flags
#COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
#COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
#COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
#COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
#COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
#COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
#COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
#COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
#COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
#
## CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not # CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
#CC_CUDA92 := -gencode arch=compute_30,code=sm_30 CC_CUDA92 := -gencode arch=compute_30,code=sm_30
#
## Later versions of CUDA support the new architectures # Later versions of CUDA support the new architectures
#CC_CUDA10x := -gencode arch=compute_30,code=sm_30 CC_CUDA10x := -gencode arch=compute_30,code=sm_30
#CC_CUDA10x += -gencode arch=compute_75,code=sm_75 CC_CUDA10x += -gencode arch=compute_75,code=sm_75
#
#CC_CUDA110 := -gencode arch=compute_75,code=sm_75 CC_CUDA110 := -gencode arch=compute_75,code=sm_75
#CC_CUDA110 += -gencode arch=compute_80,code=sm_80 CC_CUDA110 += -gencode arch=compute_80,code=sm_80
#
#CC_CUDA11x := -gencode arch=compute_75,code=sm_75 CC_CUDA11x := -gencode arch=compute_75,code=sm_75
#CC_CUDA11x += -gencode arch=compute_80,code=sm_80 CC_CUDA11x += -gencode arch=compute_80,code=sm_80
#CC_CUDA11x += -gencode arch=compute_86,code=sm_86 CC_CUDA11x += -gencode arch=compute_86,code=sm_86
COMPUTE_CAPABILITY := -gencode arch=compute_70,code=sm_70 # Volta all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB) $(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB) $(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB) $(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
# Build ./bitsandbytes/libbitsandbytes.so for CUDA 11.0.
# Uses CC_CUDA110 (sm_75, sm_80) on top of COMPUTE_CAPABILITY. CC_CUDA10x must
# not be used here because it contains compute_30, which CUDA >= 11.0 no
# longer supports.
# Steps: compile relocatable device code, device-link it, then link the
# shared library with the host compiler.
cuda110: $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
cuda11x: $(BUILD_DIR) cuda11x: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB) $(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
# Print the compiler paths and the environment variables that influence the
# build, so path conflicts can be spotted in the build log.
# Declared phony: "env" names a command, not a file, and must run even if a
# file or directory called "env" exists in the tree.
.PHONY: env
env:
	@echo "ENVIRONMENT"
	@echo "============================"
	@echo "NVCC path: $(NVCC)"
	@echo "GPP path: $(GPP)"
	@echo "CUDA_HOME: $(CUDA_HOME)"
	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
	@echo "PATH: $(PATH)"
	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
	@echo "============================"
$(BUILD_DIR): $(BUILD_DIR):
mkdir -p cuda_build mkdir -p cuda_build
mkdir -p dependencies mkdir -p dependencies
......
...@@ -84,6 +84,10 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m ...@@ -84,6 +84,10 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m
1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available) 1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available)
## Compile from source
To compile from source, please follow the [compile_from_source.md](compile_from_source.md) instructions.
## License ## License
The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license. The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license.
......
# Compiling from source
Basic steps.
1. `make cudaXXX` where `cudaXXX` is among `cuda92, cuda10x, cuda110, cuda11x`
2. `CUDA_VERSION=XXX python setup.py install`
To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive).
For your convenience, there is an install script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this you can add this to your `.bashrc` by executing these commands:
```bash
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/" >> ~/.bashrc
echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc
source ~/.bashrc
```
By default, the Makefile will look at your `CUDA_HOME` environment variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.
Either `nvcc` needs to be in your path or the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed.
If you have problems compiling the library with these instructions from source, please open an issue.
...@@ -715,7 +715,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p, ...@@ -715,7 +715,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
switch(OPTIMIZER) switch(OPTIMIZER)
{ {
case ADAM: case ADAM:
if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0)) if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
{ {
s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j])); s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j]))); s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j])));
...@@ -868,7 +868,7 @@ __global__ void kOptimizer32bit1State(T *g, T *p, ...@@ -868,7 +868,7 @@ __global__ void kOptimizer32bit1State(T *g, T *p,
# pragma unroll 4 # pragma unroll 4
for(unsigned int j = 0; j < NUM_PER_THREAD; j++) for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
{ {
if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0)) if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
{ {
switch(OPTIMIZER) switch(OPTIMIZER)
{ {
...@@ -1475,7 +1475,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char ...@@ -1475,7 +1475,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
{ {
g_val = float(g_vals[j]); g_val = float(g_vals[j]);
g_val *= gnorm_scale; g_val *= gnorm_scale;
if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0)) if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
{ {
s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE]; s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val)); s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val));
...@@ -1518,7 +1518,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char ...@@ -1518,7 +1518,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
# pragma unroll N_PER_TH # pragma unroll N_PER_TH
for(unsigned int j = 0; j < N_PER_TH; j++) for(unsigned int j = 0; j < N_PER_TH; j++)
{ {
if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0)) if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
{ {
g_vals[j] = (T)(((float)g_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps))))))); g_vals[j] = (T)(((float)g_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps)))))));
if(weight_decay > 0.0f) if(weight_decay > 0.0f)
...@@ -1635,7 +1635,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char ...@@ -1635,7 +1635,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
{ {
g_val = float(g_vals[j]); g_val = float(g_vals[j]);
g_val *= gnorm_scale; g_val *= gnorm_scale;
if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0)) if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
{ {
if(weight_decay > 0.0f) if(weight_decay > 0.0f)
g_val += ((float)p_vals[j])*weight_decay; g_val += ((float)p_vals[j])*weight_decay;
...@@ -1677,7 +1677,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char ...@@ -1677,7 +1677,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
# pragma unroll N_PER_TH # pragma unroll N_PER_TH
for(unsigned int j = 0; j < N_PER_TH; j++) for(unsigned int j = 0; j < N_PER_TH; j++)
{ {
if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0)) if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
{ {
switch(OPTIMIZER) switch(OPTIMIZER)
{ {
......
...@@ -3,3 +3,6 @@ ...@@ -3,3 +3,6 @@
This problem arises with the cuda version loaded by bitsandbytes is not supported by your GPU, or if you pytorch CUDA version mismatches. So solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME``, ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Does it match with ``nvidia-smi``? This problem arises with the cuda version loaded by bitsandbytes is not supported by your GPU, or if you pytorch CUDA version mismatches. So solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME``, ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Does it match with ``nvidia-smi``?
If you are feeling lucky, you can also try to compile the library from source. This can be still problematic if your PATH variables have multiple cuda versions. As such, it is recommended to figure out path conflicts before you proceed with compilation. If you are feeling lucky, you can also try to compile the library from source. This can be still problematic if your PATH variables have multiple cuda versions. As such, it is recommended to figure out path conflicts before you proceed with compilation.
__If you encounter any other error not listed here please create an issue. This will help resolve your problem and will help out others in the future.__
# Download the CUDA 11.1.1 runfile installer and install only the toolkit
# under $HOME/local, then make nvcc and the CUDA libraries visible to future
# shells via ~/.bashrc.
wget https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
# Use $HOME rather than "~": the shell does not expand a tilde that follows
# "=" inside an option word such as --installpath=~/local, so the installer
# would otherwise receive a literal "~" path.
bash cuda_11.1.1_455.32.00_linux.run --no-drm --no-man-page --override --installpath="$HOME/local" --librarypath="$HOME/local/lib" --toolkitpath="$HOME/local/cuda-11.1/" --toolkit --silent
# Single quotes keep the variables unexpanded in ~/.bashrc, so they are
# resolved each time the file is read instead of freezing the values that
# happen to be set while this script runs.
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/local/cuda-11.1/lib64/' >> ~/.bashrc
echo 'export PATH=$PATH:$HOME/local/cuda-11.1/bin/' >> ~/.bashrc
source ~/.bashrc
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment