OpenDAS / bitsandbytes / Commits
".github/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "1168eaaadd69457d1e460512ab235b29bc552907"
Commit c059bd28 authored Nov 20, 2022 by Tim Dettmers
Added additional blocksizes: {64, 128, 256}.
parent eb028e6e
Showing 3 changed files with 29 additions and 5 deletions:
bitsandbytes/functional.py  +3 -3
csrc/kernels.cu  +14 -2
csrc/ops.cu  +12 -0
bitsandbytes/functional.py
@@ -503,7 +503,7 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra
        out = torch.zeros_like(A, dtype=torch.uint8)

    if A.device.type != 'cpu':
-       assert blocksize in [4096, 2048, 1024, 512]
+       assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
        cblocksize = ct.c_int32(blocksize)
        prev_device = pre_call(A.device)
        code = code.to(A.device)

@@ -586,8 +586,8 @@ def dequantize_blockwise(
    if A.device.type != 'cpu':
        device = pre_call(A.device)
        code = code.to(A.device)
-       if blocksize not in [2048, 4096, 1024, 512]:
-           raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512]")
+       if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+           raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
        is_on_gpu([A, out])
        if out.dtype == torch.float32:
            lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()))
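With this change, quantize_blockwise and dequantize_blockwise accept blocksize values of 256, 128, and 64 in addition to the existing 4096/2048/1024/512. Below is a minimal usage sketch, not taken from the commit: it assumes a CUDA device is available and that quantize_blockwise returns the uint8 tensor together with a quantization state that dequantize_blockwise accepts as its second argument; the exact return values and keyword names are assumptions about this version of the API, not something this diff shows.

import torch
import bitsandbytes.functional as F

# Hypothetical input; any CUDA float tensor works.
A = torch.randn(4096, device="cuda")

# blocksize=64 is one of the newly allowed values checked by the assert above.
q, quant_state = F.quantize_blockwise(A, blocksize=64)   # assumed return: (uint8 tensor, state)
A_restored = F.dequantize_blockwise(q, quant_state, blocksize=64)

print((A - A_restored).abs().max())  # expect a small blockwise quantization error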
csrc/kernels.cu
@@ -454,8 +454,8 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
  __shared__ float smem_code[256];
  __shared__ float smem_absmax_value[1];

- if(threadIdx.x < 256)
-   smem_code[threadIdx.x] = code[threadIdx.x];
+ for(int i = threadIdx.x; i < 256; i+=blockDim.x)
+   smem_code[i] = code[i];

  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
  {
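The shared-memory load of the 256-entry quantization code changes from a single guarded store to a strided loop because the new 64- and 128-element block sizes launch with fewer than 256 threads per block; with the old if(threadIdx.x < 256) guard, part of smem_code would be left uninitialized. A small illustrative sketch of that coverage argument in Python (not part of the commit; thread_idx and block_dim stand in for threadIdx.x and blockDim.x):

def covered_entries(block_dim: int, table_size: int = 256) -> set:
    # Simulate which table entries a block of block_dim threads loads
    # with the strided loop "for(int i = threadIdx.x; i < 256; i += blockDim.x)".
    loaded = set()
    for thread_idx in range(block_dim):
        for i in range(thread_idx, table_size, block_dim):
            loaded.add(i)
    return loaded

assert covered_entries(64) == set(range(256))   # a 64-thread block still fills all 256 entries
assert covered_entries(256) == set(range(256))  # the old 256-thread case is unchanged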
@@ -2799,6 +2799,12 @@ template __global__ void kQuantizeBlockwise<half, 1024, 4, 0>(float * code, half
template __global__ void kQuantizeBlockwise<float, 1024, 4, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template __global__ void kQuantizeBlockwise<half, 512, 2, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template __global__ void kQuantizeBlockwise<float, 512, 2, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 256, 2, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 256, 2, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 128, 2, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 128, 2, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<half, 64, 1, 0>(float * code, half * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template __global__ void kQuantizeBlockwise<float, 64, 1, 0>(float * code, float * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
template __global__ void kDequantizeBlockwise<half, 4096, 1024, 4>(float *code, unsigned char *A, float *absmax, half *out, const int n);
template __global__ void kDequantizeBlockwise<float, 4096, 1024, 4>(float *code, unsigned char *A, float *absmax, float *out, const int n);

@@ -2808,6 +2814,12 @@ template __global__ void kDequantizeBlockwise<half, 1024, 256, 4>(float *code, u
template __global__ void kDequantizeBlockwise<float, 1024, 256, 4>(float *code, unsigned char *A, float *absmax, float *out, const int n);
template __global__ void kDequantizeBlockwise<half, 512, 256, 2>(float *code, unsigned char *A, float *absmax, half *out, const int n);
template __global__ void kDequantizeBlockwise<float, 512, 256, 2>(float *code, unsigned char *A, float *absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 256, 128, 2>(float *code, unsigned char *A, float *absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 256, 128, 2>(float *code, unsigned char *A, float *absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 128, 64, 2>(float *code, unsigned char *A, float *absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 128, 64, 2>(float *code, unsigned char *A, float *absmax, float *out, const int n);
+template __global__ void kDequantizeBlockwise<half, 64, 64, 1>(float *code, unsigned char *A, float *absmax, half *out, const int n);
+template __global__ void kDequantizeBlockwise<float, 64, 64, 1>(float *code, unsigned char *A, float *absmax, float *out, const int n);
csrc/ops.cu
@@ -65,6 +65,12 @@ template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A,
    kQuantizeBlockwise<T, 1024, 4, 0><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
  else if(blocksize == 512)
    kQuantizeBlockwise<T, 512, 2, 0><<<num_blocks, 256>>>(code, A, absmax, out, rand, rand_offset, n);
+ else if(blocksize == 256)
+   kQuantizeBlockwise<T, 256, 2, 0><<<num_blocks, 128>>>(code, A, absmax, out, rand, rand_offset, n);
+ else if(blocksize == 128)
+   kQuantizeBlockwise<T, 128, 2, 0><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+ else if(blocksize == 64)
+   kQuantizeBlockwise<T, 64, 1, 0><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);

  CUDA_CHECK_RETURN(cudaPeekAtLastError());

@@ -82,6 +88,12 @@ template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, flo
    kDequantizeBlockwise<T, 1024, 256, 4><<<num_blocks, 1024/4>>>(code, A, absmax, out, n);
  else if(blocksize == 512)
    kDequantizeBlockwise<T, 512, 256, 2><<<num_blocks, 512/2>>>(code, A, absmax, out, n);
+ else if(blocksize == 256)
+   kDequantizeBlockwise<T, 256, 128, 2><<<num_blocks, 256/2>>>(code, A, absmax, out, n);
+ else if(blocksize == 128)
+   kDequantizeBlockwise<T, 128, 64, 2><<<num_blocks, 128/2>>>(code, A, absmax, out, n);
+ else if(blocksize == 64)
+   kDequantizeBlockwise<T, 64, 64, 1><<<num_blocks, 64/1>>>(code, A, absmax, out, n);

  CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
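Reading the launch configurations off the new branches, each supported block size maps to a fixed number of elements handled per thread, and the threads-per-block value in each <<<num_blocks, N>>> launch is simply blocksize divided by that count. The sketch below tabulates only the mapping visible in this diff (illustrative, not part of the commit; block sizes 4096 and 2048 are dispatched elsewhere and are omitted):

# Pairs copied from the kQuantizeBlockwise dispatch above: blocksize -> elements per thread.
QUANTIZE_ITEMS_PER_THREAD = {1024: 4, 512: 2, 256: 2, 128: 2, 64: 1}

for blocksize, items in sorted(QUANTIZE_ITEMS_PER_THREAD.items()):
    threads = blocksize // items  # matches the <<<num_blocks, N>>> value in each branch
    print(f"blocksize={blocksize:4d}  items/thread={items}  threads/block={threads:3d}")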