Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
386c53bb
Commit
386c53bb
authored
Mar 12, 2025
by
xuxzh1
🎱
Browse files
update
parent
4667452a
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
597 additions
and
601 deletions
+597
-601
llama/ggml-cuda/mmvq.cu
llama/ggml-cuda/mmvq.cu
+424
-429
llama/ggml-cuda/quantize.cu
llama/ggml-cuda/quantize.cu
+171
-170
llama/ggml-cuda/vecdotq.cuh
llama/ggml-cuda/vecdotq.cuh
+2
-2
No files found.
llama/ggml-cuda/mmvq.cu
View file @
386c53bb
This diff is collapsed.
Click to expand it.
llama/ggml-cuda/quantize.cu
View file @
386c53bb
...
@@ -24,173 +24,174 @@
...
@@ -24,173 +24,174 @@
* SOFTWARE.
* SOFTWARE.
*/
*/
#include "quantize.cuh"
#include "quantize.cuh"
#include <cstdint>
#include <cstdint>
static
__global__
__launch_bounds__
(
1024
)
void
quantize_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int64_t
kx
,
const
int64_t
kx0_padded
)
{
static
__global__
__launch_bounds__
(
1024
)
void
quantize_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int64_t
kx
,
const
int64_t
kx0_padded
)
{
const
int64_t
ix0
=
(
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
const
int64_t
ix0
=
(
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
ix0
>=
kx0_padded
)
{
if
(
ix0
>=
kx0_padded
)
{
return
;
return
;
}
}
const
int64_t
ix1
=
blockIdx
.
y
;
const
int64_t
ix1
=
blockIdx
.
y
;
const
int64_t
i_padded
=
ix1
*
kx0_padded
+
ix0
;
const
int64_t
i_padded
=
ix1
*
kx0_padded
+
ix0
;
block_q8_1
*
y
=
(
block_q8_1
*
)
vy
;
block_q8_1
*
y
=
(
block_q8_1
*
)
vy
;
const
int64_t
ib
=
i_padded
/
QK8_1
;
// block index
const
int64_t
ib
=
i_padded
/
QK8_1
;
// block index
const
int64_t
iqs
=
i_padded
%
QK8_1
;
// quant index
const
int64_t
iqs
=
i_padded
%
QK8_1
;
// quant index
const
float
xi
=
ix0
<
kx
?
x
[
ix1
*
kx
+
ix0
]
:
0.0
f
;
const
float
xi
=
ix0
<
kx
?
x
[
ix1
*
kx
+
ix0
]
:
0.0
f
;
float
amax
=
fabsf
(
xi
);
float
amax
=
fabsf
(
xi
);
float
sum
=
xi
;
float
sum
=
xi
;
amax
=
warp_reduce_max
(
amax
);
amax
=
warp_reduce_max
(
amax
);
sum
=
warp_reduce_sum
(
sum
);
sum
=
warp_reduce_sum
(
sum
);
const
float
d
=
amax
/
127
;
const
float
d
=
amax
/
127
;
const
int8_t
q
=
amax
==
0.0
f
?
0
:
roundf
(
xi
/
d
);
const
int8_t
q
=
amax
==
0.0
f
?
0
:
roundf
(
xi
/
d
);
y
[
ib
].
qs
[
iqs
]
=
q
;
y
[
ib
].
qs
[
iqs
]
=
q
;
if
(
iqs
>
0
)
{
if
(
iqs
>
0
)
{
return
;
return
;
}
}
ggml_half2
ds
=
{
d
,
sum
};
y
[
ib
].
ds
=
ds
;
y
[
ib
].
ds
=
ggml_half2
{
d
,
sum
};
//reinterpret_cast<half&>(y[ib].ds) = ds;
//reinterpret_cast<half&>(y[ib].ds) = ds;
//reinterpret_cast<half&>(y[ib].ds.y) = sum;
//reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
}
template
<
mmq_q8_1_ds_layout
ds_layout
>
template
<
mmq_q8_1_ds_layout
ds_layout
>
static
__global__
void
quantize_mmq_q8_1
(
static
__global__
void
quantize_mmq_q8_1
(
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int64_t
kx0
,
const
int64_t
kx1
,
const
int64_t
kx0_padded
)
{
const
float
*
__restrict__
x
,
void
*
__restrict__
vy
,
const
int64_t
kx0
,
const
int64_t
kx1
,
const
int64_t
kx0_padded
)
{
constexpr
int
vals_per_scale
=
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_D2S6
?
64
:
32
;
constexpr
int
vals_per_scale
=
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_D2S6
?
64
:
32
;
constexpr
int
vals_per_sum
=
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_D2S6
?
16
:
32
;
constexpr
int
vals_per_sum
=
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_D2S6
?
16
:
32
;
const
int64_t
ix0
=
((
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
)
*
4
;
const
int64_t
ix0
=
((
int64_t
)
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
)
*
4
;
if
(
ix0
>=
kx0_padded
)
{
if
(
ix0
>=
kx0_padded
)
{
return
;
return
;
}
}
const
float4
*
x4
=
(
const
float4
*
)
x
;
const
float4
*
x4
=
(
const
float4
*
)
x
;
const
int64_t
ix1
=
kx1
*
blockIdx
.
z
+
blockIdx
.
y
;
const
int64_t
ix1
=
kx1
*
blockIdx
.
z
+
blockIdx
.
y
;
block_q8_1_mmq
*
y
=
(
block_q8_1_mmq
*
)
vy
;
block_q8_1_mmq
*
y
=
(
block_q8_1_mmq
*
)
vy
;
const
int64_t
ib0
=
blockIdx
.
z
*
((
int64_t
)
gridDim
.
y
*
gridDim
.
x
*
blockDim
.
x
/
QK8_1
);
// first block of channel
const
int64_t
ib0
=
blockIdx
.
z
*
((
int64_t
)
gridDim
.
y
*
gridDim
.
x
*
blockDim
.
x
/
QK8_1
);
// first block of channel
const
int64_t
ib
=
ib0
+
(
ix0
/
(
4
*
QK8_1
))
*
kx1
+
blockIdx
.
y
;
// block index in channel
const
int64_t
ib
=
ib0
+
(
ix0
/
(
4
*
QK8_1
))
*
kx1
+
blockIdx
.
y
;
// block index in channel
const
int64_t
iqs
=
ix0
%
(
4
*
QK8_1
);
// quant index in block
const
int64_t
iqs
=
ix0
%
(
4
*
QK8_1
);
// quant index in block
// Load 4 floats per thread and calculate max. abs. value between them:
// Load 4 floats per thread and calculate max. abs. value between them:
const
float4
xi
=
ix0
<
kx0
?
x4
[(
ix1
*
kx0
+
ix0
)
/
4
]
:
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
const
float4
xi
=
ix0
<
kx0
?
x4
[(
ix1
*
kx0
+
ix0
)
/
4
]
:
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
float
amax
=
fabsf
(
xi
.
x
);
float
amax
=
fabsf
(
xi
.
x
);
amax
=
fmaxf
(
amax
,
fabsf
(
xi
.
y
));
amax
=
fmaxf
(
amax
,
fabsf
(
xi
.
y
));
amax
=
fmaxf
(
amax
,
fabsf
(
xi
.
z
));
amax
=
fmaxf
(
amax
,
fabsf
(
xi
.
z
));
amax
=
fmaxf
(
amax
,
fabsf
(
xi
.
w
));
amax
=
fmaxf
(
amax
,
fabsf
(
xi
.
w
));
// Exchange max. abs. value between vals_per_scale/4 threads.
// Exchange max. abs. value between vals_per_scale/4 threads.
#pragma unroll
#pragma unroll
for
(
int
offset
=
vals_per_scale
/
8
;
offset
>
0
;
offset
>>=
1
)
{
for
(
int
offset
=
vals_per_scale
/
8
;
offset
>
0
;
offset
>>=
1
)
{
amax
=
fmaxf
(
amax
,
__shfl_xor_sync
(
0xFFFFFFFF
,
amax
,
offset
,
WARP_SIZE
));
amax
=
fmaxf
(
amax
,
__shfl_xor_sync
(
0xFFFFFFFF
,
amax
,
offset
,
WARP_SIZE
));
}
}
float
sum
;
float
sum
;
if
(
ds_layout
!=
MMQ_Q8_1_DS_LAYOUT_D4
)
{
if
(
ds_layout
!=
MMQ_Q8_1_DS_LAYOUT_D4
)
{
sum
=
xi
.
x
+
xi
.
y
+
xi
.
z
+
xi
.
w
;
sum
=
xi
.
x
+
xi
.
y
+
xi
.
z
+
xi
.
w
;
// Exchange calculate sum across vals_per_sum/4 threads.
// Exchange calculate sum across vals_per_sum/4 threads.
#pragma unroll
#pragma unroll
for
(
int
offset
=
vals_per_sum
/
8
;
offset
>
0
;
offset
>>=
1
)
{
for
(
int
offset
=
vals_per_sum
/
8
;
offset
>
0
;
offset
>>=
1
)
{
sum
+=
__shfl_xor_sync
(
0xFFFFFFFF
,
sum
,
offset
,
WARP_SIZE
);
sum
+=
__shfl_xor_sync
(
0xFFFFFFFF
,
sum
,
offset
,
WARP_SIZE
);
}
}
}
}
const
float
d_inv
=
127.0
f
/
amax
;
const
float
d_inv
=
127.0
f
/
amax
;
char4
q
;
char4
q
;
q
.
x
=
roundf
(
xi
.
x
*
d_inv
);
q
.
x
=
roundf
(
xi
.
x
*
d_inv
);
q
.
y
=
roundf
(
xi
.
y
*
d_inv
);
q
.
y
=
roundf
(
xi
.
y
*
d_inv
);
q
.
z
=
roundf
(
xi
.
z
*
d_inv
);
q
.
z
=
roundf
(
xi
.
z
*
d_inv
);
q
.
w
=
roundf
(
xi
.
w
*
d_inv
);
q
.
w
=
roundf
(
xi
.
w
*
d_inv
);
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
char4
*
yqs4
=
(
char4
*
)
y
[
ib
].
qs
;
char4
*
yqs4
=
(
char4
*
)
y
[
ib
].
qs
;
yqs4
[
iqs
/
4
]
=
q
;
yqs4
[
iqs
/
4
]
=
q
;
if
(
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_D2S6
)
{
if
(
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_D2S6
)
{
if
(
iqs
%
16
!=
0
||
iqs
>=
96
)
{
if
(
iqs
%
16
!=
0
||
iqs
>=
96
)
{
return
;
return
;
}
}
y
[
ib
].
d2s6
[
2
+
iqs
/
16
]
=
sum
;
y
[
ib
].
d2s6
[
2
+
iqs
/
16
]
=
sum
;
if
(
iqs
%
64
!=
0
)
{
if
(
iqs
%
64
!=
0
)
{
return
;
return
;
}
}
const
float
d
=
1.0
f
/
d_inv
;
const
float
d
=
1.0
f
/
d_inv
;
y
[
ib
].
d2s6
[
iqs
/
64
]
=
d
;
y
[
ib
].
d2s6
[
iqs
/
64
]
=
d
;
return
;
return
;
}
}
if
(
iqs
%
32
!=
0
)
{
if
(
iqs
%
32
!=
0
)
{
return
;
return
;
}
}
const
float
d
=
1.0
f
/
d_inv
;
const
float
d
=
1.0
f
/
d_inv
;
if
(
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_DS4
)
{
if
(
ds_layout
==
MMQ_Q8_1_DS_LAYOUT_DS4
)
{
y
[
ib
].
ds4
[
iqs
/
32
]
=
make_half2
(
d
,
sum
);
y
[
ib
].
ds4
[
iqs
/
32
]
=
make_half2
(
d
,
sum
);
}
else
{
}
else
{
y
[
ib
].
d4
[
iqs
/
32
]
=
d
;
y
[
ib
].
d4
[
iqs
/
32
]
=
d
;
}
}
}
}
void
quantize_row_q8_1_cuda
(
void
quantize_row_q8_1_cuda
(
const
float
*
x
,
void
*
vy
,
const
int64_t
kx0
,
const
int64_t
kx1
,
const
int64_t
channels
,
const
float
*
x
,
void
*
vy
,
const
int64_t
kx0
,
const
int64_t
kx1
,
const
int64_t
channels
,
const
int64_t
kx0_padded
,
const
ggml_type
type_x
,
cudaStream_t
stream
)
{
const
int64_t
kx0_padded
,
const
ggml_type
type_x
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
kx0_padded
%
QK8_1
==
0
);
GGML_ASSERT
(
kx0_padded
%
QK8_1
==
0
);
const
int64_t
block_num_x
=
(
kx0_padded
+
CUDA_QUANTIZE_BLOCK_SIZE
-
1
)
/
CUDA_QUANTIZE_BLOCK_SIZE
;
const
int64_t
block_num_x
=
(
kx0_padded
+
CUDA_QUANTIZE_BLOCK_SIZE
-
1
)
/
CUDA_QUANTIZE_BLOCK_SIZE
;
const
dim3
num_blocks
(
block_num_x
,
kx1
*
channels
,
1
);
const
dim3
num_blocks
(
block_num_x
,
kx1
*
channels
,
1
);
const
dim3
block_size
(
CUDA_QUANTIZE_BLOCK_SIZE
,
1
,
1
);
const
dim3
block_size
(
CUDA_QUANTIZE_BLOCK_SIZE
,
1
,
1
);
quantize_q8_1
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx0_padded
);
quantize_q8_1
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx0_padded
);
GGML_UNUSED
(
type_x
);
GGML_UNUSED
(
type_x
);
}
}
void
quantize_mmq_q8_1_cuda
(
void
quantize_mmq_q8_1_cuda
(
const
float
*
x
,
void
*
vy
,
const
int64_t
kx0
,
const
int64_t
kx1
,
const
int64_t
channels
,
const
float
*
x
,
void
*
vy
,
const
int64_t
kx0
,
const
int64_t
kx1
,
const
int64_t
channels
,
const
int64_t
kx0_padded
,
const
ggml_type
type_x
,
cudaStream_t
stream
)
{
const
int64_t
kx0_padded
,
const
ggml_type
type_x
,
cudaStream_t
stream
)
{
GGML_ASSERT
(
kx0_padded
%
(
4
*
QK8_1
)
==
0
);
GGML_ASSERT
(
kx0_padded
%
(
4
*
QK8_1
)
==
0
);
const
int64_t
block_num_x
=
(
kx0_padded
+
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
-
1
)
/
(
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
);
const
int64_t
block_num_x
=
(
kx0_padded
+
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
-
1
)
/
(
4
*
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
);
const
dim3
num_blocks
(
block_num_x
,
kx1
,
channels
);
const
dim3
num_blocks
(
block_num_x
,
kx1
,
channels
);
const
dim3
block_size
(
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
,
1
,
1
);
const
dim3
block_size
(
CUDA_QUANTIZE_BLOCK_SIZE_MMQ
,
1
,
1
);
switch
(
mmq_get_q8_1_ds_layout
(
type_x
))
{
switch
(
mmq_get_q8_1_ds_layout
(
type_x
))
{
case
MMQ_Q8_1_DS_LAYOUT_D4
:
case
MMQ_Q8_1_DS_LAYOUT_D4
:
quantize_mmq_q8_1
<
MMQ_Q8_1_DS_LAYOUT_D4
>
quantize_mmq_q8_1
<
MMQ_Q8_1_DS_LAYOUT_D4
>
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx1
,
kx0_padded
);
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx1
,
kx0_padded
);
break
;
break
;
case
MMQ_Q8_1_DS_LAYOUT_DS4
:
case
MMQ_Q8_1_DS_LAYOUT_DS4
:
quantize_mmq_q8_1
<
MMQ_Q8_1_DS_LAYOUT_DS4
>
quantize_mmq_q8_1
<
MMQ_Q8_1_DS_LAYOUT_DS4
>
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx1
,
kx0_padded
);
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx1
,
kx0_padded
);
break
;
break
;
case
MMQ_Q8_1_DS_LAYOUT_D2S6
:
case
MMQ_Q8_1_DS_LAYOUT_D2S6
:
quantize_mmq_q8_1
<
MMQ_Q8_1_DS_LAYOUT_D2S6
>
quantize_mmq_q8_1
<
MMQ_Q8_1_DS_LAYOUT_D2S6
>
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx1
,
kx0_padded
);
<<<
num_blocks
,
block_size
,
0
,
stream
>>>
(
x
,
vy
,
kx0
,
kx1
,
kx0_padded
);
break
;
break
;
default:
default:
GGML_ABORT
(
"fatal error"
);
GGML_ABORT
(
"fatal error"
);
break
;
break
;
}
}
}
}
\ No newline at end of file
llama/ggml-cuda/vecdotq.cuh
View file @
386c53bb
...
@@ -505,7 +505,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
...
@@ -505,7 +505,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
float
sumf
=
0.0
f
;
float
sumf
=
0.0
f
;
#pragma unroll
#pragma unroll
QR6_K
for
(
int
i
=
0
;
i
<
QR6_K
;
++
i
)
{
for
(
int
i
=
0
;
i
<
QR6_K
;
++
i
)
{
const
int
sc
=
scales
[
4
*
i
];
const
int
sc
=
scales
[
4
*
i
];
...
@@ -803,7 +803,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
...
@@ -803,7 +803,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
int
u
[
QR6_K
];
int
u
[
QR6_K
];
float
d8
[
QR6_K
];
float
d8
[
QR6_K
];
#pragma unroll
#pragma unroll
QR6_K
for
(
int
i
=
0
;
i
<
QR6_K
;
++
i
)
{
for
(
int
i
=
0
;
i
<
QR6_K
;
++
i
)
{
u
[
i
]
=
get_int_b4
(
bq8_1
[
bq8_offset
+
2
*
i
].
qs
,
iqs
%
QI8_1
);
u
[
i
]
=
get_int_b4
(
bq8_1
[
bq8_offset
+
2
*
i
].
qs
,
iqs
%
QI8_1
);
d8
[
i
]
=
__low2float
(
bq8_1
[
bq8_offset
+
2
*
i
].
ds
);
d8
[
i
]
=
__low2float
(
bq8_1
[
bq8_offset
+
2
*
i
].
ds
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment