OpenDAS / bitsandbytes / Commits / 6bfd7a40

Commit 6bfd7a40, authored Apr 25, 2023 by Tim Dettmers
Initial template.
Parent: 0f9d3020

Showing 5 changed files with 53 additions and 1 deletion (+53 -1)
Makefile           +12  -1
csrc/kernels.cu    +25  -0
csrc/kernels.cuh    +2  -0
csrc/ops.cu        +12  -0
csrc/ops.cuh        +2  -0
Makefile

@@ -25,6 +25,7 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include
INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include
LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib

# NVIDIA NVCC compilation flags

@@ -61,7 +62,7 @@ CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
all: $(BUILD_DIR) env
	$(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

@@ -100,6 +101,11 @@ cuda11x: $(BUILD_DIR) env
	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

cuda11x_cutlass: $(BUILD_DIR) env cutlass
	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

cuda12x: $(BUILD_DIR) env
	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o

@@ -121,6 +127,11 @@ env:
	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
	@echo "============================"

cutlass:
	if [ ! -d "$(ROOT_DIR)/dependencies/cutlass" ]; then \
		git clone https://github.com/NVIDIA/cutlass.git $(ROOT_DIR)/dependencies/cutlass ; \
	fi \

$(BUILD_DIR):
	mkdir -p build
	mkdir -p dependencies
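All of the build targets above follow the same three-stage pattern: compile the CUDA sources with relocatable device code (-dc), device-link the resulting objects (-dlink), and let the host compiler package everything into the shared library that the Python side loads. Purely as an illustration, and not part of the commit, here is a minimal sketch of that flow; the target name "sketch" and the output name "sketch_lib.so" are made up, while NVCC, GPP, BUILD_DIR and the FILES_* variables are assumed to be the ones defined elsewhere in this Makefile.

# Illustrative sketch of the three-stage flow used by the targets above (not part of the commit).
sketch: $(BUILD_DIR)
	# 1. Compile CUDA sources to relocatable device code; object files land in $(BUILD_DIR)
	$(NVCC) -Xcompiler '-fPIC' -dc $(FILES_CUDA) $(INCLUDE) --output-directory $(BUILD_DIR)
	# 2. Device-link the objects so device symbols are resolved into a single link.o
	$(NVCC) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	# 3. Build the final shared library with the host compiler, adding the C++ sources
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./sketch_lib.so $(LIB)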
csrc/kernels.cu

@@ -2919,10 +2919,35 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
  }
}

template <int QUANT_TYPE, typename INPT, typename COMPT, typename OUTT>
__global__ void kMatmul_inference_4bit(INPT *A, unsigned char *B, OUTT *out, int lda, int ldb, int rowsA, int colsA, int colsB)
{
  // element-wise kernel
  // 1. load a batch x k tile into registers
  // 2. load a k x k tile into registers
  // 3. dequantize and store in a second pair of k x k registers
  // 4. matmul
  // 5. sum with cub
  // 6. store outputs
  // TC (tensor core) kernel
  // use k warps per thread block
  // 1. the thread block uses the read-only cache to read a register tile of A into shared memory
  // 2. each warp loops over shared-memory tiles of A of size 8x16 and loads them into fragments
  // 3. each warp reads a 16x32 segment of values from B
  // 4. dequantize the registers of B into a second pair of registers
  // 5. store (4) into a fragment
  // 6. matmul, aggregating into fragment C
  // 7. aggregate tiles of C into shared-memory block C
  // 8. sum (7)
  // 9. write outputs to the matmul output matrix
}

//==============================================================
// TEMPLATE DEFINITIONS
//==============================================================

template __global__ void kMatmul_inference_4bit<NF4, half, half, half>(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB);

template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
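Note that the kernel body above is empty in this commit; only the plan in the comments exists. Purely as an illustration of the "element-wise" variant of that plan (steps 1-6), here is a hedged sketch of what a naive dequantize-and-accumulate kernel could look like. Everything in it is an assumption rather than part of the commit: the _sketch suffix on the name, the 16-entry nf4_codebook in constant memory, the indexing and leading-dimension conventions, the packing of two 4-bit codes per byte of B, and the replacement of the cub reduction in step 5 with a plain per-thread loop.

#include <cuda_fp16.h>

// Hypothetical 16-entry NF4 codebook; the real values are defined elsewhere
// in bitsandbytes and are not part of this commit.
__constant__ float nf4_codebook[16];

// Illustrative sketch only: one thread per output column, plain loops instead
// of the cub reduction and tensor-core path described in the comments above.
__global__ void kMatmul_inference_4bit_sketch(const half *A, const unsigned char *B,
                                              half *out, int lda, int ldb,
                                              int rowsA, int colsA, int colsB)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;  // output column index
  if (col >= colsB) return;

  for (int row = 0; row < rowsA; row++)
  {
    float acc = 0.0f;
    for (int k = 0; k < colsA; k++)
    {
      int elem = col * ldb + k;                     // linear index of the B element (assumed column-major, leading dim ldb)
      unsigned char packed = B[elem / 2];           // assumed layout: two 4-bit codes per byte
      unsigned char code = (elem % 2 == 0) ? (packed >> 4) : (packed & 0x0F);
      float b_val = nf4_codebook[code];             // dequantize via the codebook

      // multiply-accumulate against the matching element of A (assumed row-major, leading dim lda)
      acc += __half2float(A[row * lda + k]) * b_val;
    }
    out[row * colsB + col] = __float2half(acc);     // store the output element
  }
}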
csrc/kernels.cuh

@@ -9,6 +9,8 @@
#ifndef kernels
#define kernels

template <int QUANT_TYPE, typename INP_TYPE, typename COMP_TYPE, typename OUT_TYPE>
__global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *out, int lda, int ldb, int rowsA, int colsA, int colsB);

template<typename T> __global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);

__global__ void kQuantize(float *code, float *__restrict__ const A, unsigned char *out, const int n);
csrc/ops.cu

@@ -90,6 +90,17 @@ template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsign
  CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

void matmul4bite(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB)
{
  int num_blocks = (colsB + 32 - 1) / 32;
  kMatmul_inference_4bit<NF4, half, half, half><<<num_blocks, 256>>>(A, B, out, lda, ldb, rowsA, colsA, colsB);
  CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

template <int QUANT_TYPE, typename INP_TYPE, typename COMP_TYPE, typename OUT_TYPE>
__global__ void kMatmul_inference_4bit(INP_TYPE *A, unsigned char *B, OUT_TYPE *C, int lda, int ldb, int rowsA, int colsA, int colsB);

template<typename T, int OPTIMIZER> void optimizer32bit(T *g, T *p, float *state1, float *state2, float *unorm, float max_unorm, float param_norm, const float beta1, const float beta2, const float eps, const float weight_decay,

@@ -653,6 +664,7 @@ template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int id
  CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

//==============================================================
// TEMPLATE DEFINITIONS
//==============================================================
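The matmul4bite launcher above sizes its grid with the usual ceiling division, num_blocks = (colsB + 32 - 1) / 32; for example, colsB = 100 gives (100 + 31) / 32 = 4 blocks. Below is a minimal, hypothetical host-side sketch of calling it. The shapes, the assumption that B packs two 4-bit codes per byte, and the choice of leading dimensions are illustrative only and are not specified by this commit.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Declared in csrc/ops.cuh in this commit.
void matmul4bite(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB);

// Hypothetical usage; buffer sizes and layout assumptions are illustrative only.
void run_matmul4bite_example()
{
  const int rowsA = 1, colsA = 4096, colsB = 4096;

  half *A = nullptr, *out = nullptr;
  unsigned char *B = nullptr;
  cudaMalloc(&A, (size_t)rowsA * colsA * sizeof(half));
  cudaMalloc(&B, (size_t)colsA * colsB / 2);             // assumes two 4-bit codes per byte
  cudaMalloc(&out, (size_t)rowsA * colsB * sizeof(half));

  // leading dimensions are taken to match the logical shapes here
  matmul4bite(A, B, out, colsA, colsA, rowsA, colsA, colsB);
  cudaDeviceSynchronize();

  cudaFree(A);
  cudaFree(B);
  cudaFree(out);
}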
csrc/ops.cuh

@@ -183,4 +183,6 @@ template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count,
template <int FORMAT> void extractOutliers(char *A, int *idx, char *out, int idx_size, int rows, int cols);

void matmul4bite(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB);

#endif