Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
bitsandbytes
Commits
3aef7834
Commit
3aef7834
authored
Apr 28, 2023
by
Tim Dettmers
Browse files
Added template refactor.
parent
c1bfb210
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
20 additions
and
50 deletions
+20
-50
bitsandbytes/functional.py
bitsandbytes/functional.py
+1
-3
csrc/kernels.cu
csrc/kernels.cu
+10
-13
csrc/kernels.cuh
csrc/kernels.cuh
+1
-5
csrc/ops.cu
csrc/ops.cu
+3
-8
csrc/ops.cuh
csrc/ops.cuh
+1
-6
csrc/pythonInterface.c
csrc/pythonInterface.c
+4
-15
No files found.
bitsandbytes/functional.py
View file @
3aef7834
...
...
@@ -1464,9 +1464,7 @@ def cutlass3_gemm(
lda
=
ct
.
c_int32
(
lda
)
ldb
=
ct
.
c_int32
(
ldb
)
ldc
=
ct
.
c_int32
(
ldc
)
alpha
=
ct
.
c_float
(
1.0
)
beta
=
ct
.
c_float
(
0.0
)
lib
.
ccutlass_gemm
(
m
,
n
,
k
,
alpha
,
get_ptr
(
A
),
lda
,
get_ptr
(
B
),
ldb
,
beta
,
get_ptr
(
out
),
ldc
)
lib
.
cgemm_host_fp32
(
m
,
n
,
k
,
get_ptr
(
A
),
get_ptr
(
B
),
get_ptr
(
out
),
lda
,
ldb
,
ldc
)
return
out
...
...
csrc/kernels.cu
View file @
3aef7834
...
...
@@ -2949,22 +2949,18 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
#define ROWS 2
__global__
void
gemm_device
(
int
M
,
int
N
,
int
K
,
float
const
*
A
,
float
*
B
,
float
*
out
,
int
lda
,
int
ldb
,
int
ldc
,
float
alpha
,
float
beta
)
template
<
typename
T
>
__global__
void
gemm_device
(
int
M
,
int
N
,
int
K
,
T
const
*
A
,
T
*
B
,
T
*
out
,
int
lda
,
int
ldb
,
int
ldc
)
{
// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp
// 1. Load dataB into register
// 2. Dequantize B
// 3. Fetch data from A and multiply
typedef
cub
::
BlockLoad
<
float
,
256
,
4
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadA
;
typedef
cub
::
BlockLoad
<
T
,
256
,
4
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadA
;
//__shared__ typename LoadA::TempStorage loada;
typedef
cub
::
BlockLoad
<
float
,
256
,
4
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadB
;
typedef
cub
::
BlockLoad
<
T
,
256
,
4
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadB
;
//__shared__ typename LoadB::TempStorage loadb;
typedef
cub
::
BlockReduce
<
float
,
256
>
BlockReduce
;
typedef
cub
::
BlockReduce
<
T
,
256
>
BlockReduce
;
// Allocate shared memory for BlockReduce
//__shared__ typename BlockReduce::TempStorage reduce;
...
...
@@ -2975,16 +2971,16 @@ __global__ void gemm_device(int M, int N, int K,
}
temp_storage
;
float
dataA
[
4
];
float
local_B
[
4
];
float
local_accC
[
ROWS
];
T
dataA
[
4
];
T
local_B
[
4
];
T
local_accC
[
ROWS
];
int
valid_items
=
0
;
const
int
warp_id
=
threadIdx
.
x
/
32
;
const
int
warp_lane
=
threadIdx
.
x
%
32
;
const
int
col_offset
=
blockIdx
.
x
*
8
;
__shared__
float
tileA
[
ROWS
*
1024
];
__shared__
float
accumulatorC
[
ROWS
*
8
];
__shared__
T
tileA
[
ROWS
*
1024
];
__shared__
T
accumulatorC
[
ROWS
*
8
];
//#pragma unroll 8
//for(int i = 0; i < 8; i++)
...
...
@@ -3128,6 +3124,7 @@ __global__ void with_staging_unified(float const* global_in, float * global_out,
// TB const* B, BStride dB, BBlockLayout blockB, BThreadLayout tB,
// TC * out, CStride dC, CBlockLayout , CThreadLayout tC,
// half alpha, half beta);
template
__global__
void
gemm_device
<
float
>(
int
M
,
int
N
,
int
K
,
float
const
*
A
,
float
*
B
,
float
*
out
,
int
lda
,
int
ldb
,
int
ldc
);
//template __global__ void kMatmul_inference_4bit<NF4, half, half, half>(half *A, unsigned char *B, half *out, int lda, int ldb, int rowsA, int colsA, int colsB);
...
...
csrc/kernels.cuh
View file @
3aef7834
...
...
@@ -138,10 +138,6 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
template
<
size_t
stages_count
/* Pipeline with stages_count stages */
>
__global__
void
with_staging_unified
(
float
const
*
global_in
,
float
*
global_out
,
size_t
size
,
size_t
batch_sz
);
__global__
void
gemm_device
(
int
M
,
int
N
,
int
K
,
float
const
*
A
,
float
*
B
,
float
*
out
,
int
lda
,
int
ldb
,
int
ldc
,
float
alpha
,
float
beta
);
template
<
typename
T
>
__global__
void
gemm_device
(
int
M
,
int
N
,
int
K
,
T
const
*
A
,
T
*
B
,
T
*
out
,
int
lda
,
int
ldb
,
int
ldc
);
#endif
csrc/ops.cu
View file @
3aef7834
...
...
@@ -675,12 +675,7 @@ void pipeline_test(float *A, float *B, size_t n, size_t batch_size)
void
gemm_host
(
int
m
,
int
n
,
int
k
,
float
alpha
,
float
const
*
A
,
int
lda
,
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
)
template
<
typename
T
>
void
gemm_host
(
int
m
,
int
n
,
int
k
,
T
const
*
A
,
T
*
B
,
T
*
out
,
int
lda
,
int
ldb
,
int
ldc
)
{
dim3
dimBlock
(
256
);
...
...
@@ -699,14 +694,14 @@ void gemm_host(int m, int n, int k,
(
m
,
n
,
k
,
A
,
B
,
C
,
lda
,
ldb
,
ldc
,
alpha
,
beta
);
out
,
lda
,
ldb
,
ldc
);
}
//==============================================================
// TEMPLATE DEFINITIONS
//==============================================================
template
void
gemm_host
<
float
>(
int
m
,
int
n
,
int
k
,
float
const
*
A
,
float
*
B
,
float
*
out
,
int
lda
,
int
ldb
,
int
ldc
);
template
void
extractOutliers
<
COL_TURING
>(
char
*
A
,
int
*
idx
,
char
*
out
,
int
idx_size
,
int
rows
,
int
cols
);
template
void
extractOutliers
<
COL_AMPERE
>(
char
*
A
,
int
*
idx
,
char
*
out
,
int
idx_size
,
int
rows
,
int
cols
);
...
...
csrc/ops.cuh
View file @
3aef7834
...
...
@@ -190,12 +190,7 @@ template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int id
void
matmul4bite
(
half
*
A
,
unsigned
char
*
B
,
half
*
out
,
int
lda
,
int
ldb
,
int
rowsA
,
int
colsA
,
int
colsB
);
void
gemm_host
(
int
m
,
int
n
,
int
k
,
float
alpha
,
float
const
*
A
,
int
ldA
,
float
*
B
,
int
ldB
,
float
beta
,
float
*
C
,
int
ldC
);
template
<
typename
T
>
void
gemm_host
(
int
m
,
int
n
,
int
k
,
T
const
*
A
,
T
*
B
,
T
*
out
,
int
lda
,
int
ldb
,
int
ldc
);
void
pipeline_test
(
float
*
A
,
float
*
B
,
size_t
n
,
size_t
batch_size
);
...
...
csrc/pythonInterface.c
View file @
3aef7834
...
...
@@ -20,14 +20,8 @@ void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimat
void
estimateQuantiles_fp16
(
half
*
A
,
float
*
code
,
float
offset
,
int
n
){
estimateQuantiles
<
half
>
(
A
,
code
,
offset
,
n
);
}
void
cppgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
float
const
*
A
,
int
ldA
,
float
*
B
,
int
ldB
,
float
beta
,
float
*
C
,
int
ldC
)
{
gemm_host
(
m
,
n
,
k
,
alpha
,
A
,
ldA
,
B
,
ldB
,
beta
,
C
,
ldC
);}
void
gemm_host_fp32
(
int
M
,
int
N
,
int
K
,
float
const
*
A
,
float
*
B
,
float
*
out
,
int
lda
,
int
ldb
,
int
ldc
)
{
gemm_host
<
float
>
(
M
,
N
,
K
,
A
,
B
,
out
,
lda
,
ldb
,
ldc
);
}
#define MAKE_FUNC32(fname, oname, gtype, gbits) \
...
...
@@ -317,13 +311,8 @@ extern "C"
void
cextractOutliers_ampere
(
char
*
A
,
int
*
idx
,
char
*
out
,
int
idx_size
,
int
rows
,
int
cols
){
extractOutliers_ampere
(
A
,
idx
,
out
,
idx_size
,
rows
,
cols
);
}
void
cpipeline_test
(
float
*
A
,
float
*
B
,
size_t
n
,
size_t
batch_size
){
pipeline_test
(
A
,
B
,
n
,
batch_size
);
}
void
ccutlass_gemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
float
const
*
A
,
int
ldA
,
float
*
B
,
int
ldB
,
float
beta
,
float
*
C
,
int
ldC
)
{
cppgemm
(
m
,
n
,
k
,
alpha
,
A
,
ldA
,
B
,
ldB
,
beta
,
C
,
ldC
);}
void
cgemm_host_fp32
(
int
M
,
int
N
,
int
K
,
float
const
*
A
,
float
*
B
,
float
*
out
,
int
lda
,
int
ldb
,
int
ldc
)
{
gemm_host_fp32
(
M
,
N
,
K
,
A
,
B
,
out
,
lda
,
ldb
,
ldc
);
}
#endif
void
cquantize_blockwise_cpu_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
long
long
blocksize
,
long
long
n
){
quantize_cpu
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment