OpenDAS / ollama · Commits

Commit 85aeb428 (unverified)
Authored Aug 03, 2023 by Michael Yang; committed via GitHub on Aug 03, 2023

    Merge pull request #270 from jmorganca/update-llama-cpp

    update llama.cpp

Parents: f0b365a4, c5bcf328
Changes: 19 changed files with 623 additions and 298 deletions (+623 −298)
llama/ggml-alloc.c        +1   −1
llama/ggml-alloc.h        +1   −1
llama/ggml-cuda.cu      +606 −276
llama/ggml-cuda.h         +1   −1
llama/ggml-metal.h        +1   −1
llama/ggml-metal.m        +1   −1
llama/ggml-metal.metal    +1   −1
llama/ggml-mpi.c          +1   −1
llama/ggml-mpi.h          +1   −1
llama/ggml-opencl.cpp     +1   −1
llama/ggml-opencl.h       +1   −1
llama/ggml.c              +1   −1
llama/ggml.h              +1   −1
llama/k_quants.c          +1   −1
llama/k_quants.h          +1   −1
llama/llama-util.h        +1   −1
llama/llama.cpp           +1   −1
llama/llama.go            +0   −5
llama/llama.h             +1   −1
llama/ggml-alloc.c  (view file @ 85aeb428)
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...

llama/ggml-alloc.h  (view file @ 85aeb428)
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-cuda.cu  (view file @ 85aeb428)
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
@@ -188,7 +188,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 typedef void (*load_tiles_cuda_t)(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row);
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
 typedef float (*vec_dot_q_mul_mat_cuda_t)(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
...
@@ -1388,22 +1388,185 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
 }

 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

-#define VDR_q4_0_q8_1 1
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int & vi, const int & ui0, const int & ui1, const half & d4, const half2 & ds8) {
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
+    const int * v, const int * u, const float & d4, const half2 & ds8) {

 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    // subtract 8 from each quantized value
-    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
-    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi0, ui0, 0);
-    sumi     = __dp4a(vi1, ui1, sumi);
-
-    return __half2float(d4) * (sumi * __half2float(ds8.x) - (8/QI4_0) * __half2float(ds8.y));
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const half2 tmp = __hmul2(dm4, ds8);
+    const float d4d8 = __half2float(tmp.x);
+    const float m4s8 = __half2float(tmp.y);
+#else
+    const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
+    const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * __half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const half2 tmp = __hmul2(dm5, ds8);
+    const float d5d8 = __half2float(tmp.x);
+    const float m5s8 = __half2float(tmp.y);
+#else
+    const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
+    const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
+    const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+    return sumi * d8_0 * __half2float(ds8_1.x);
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const half2 tmp = __hmul2(dm8, ds8);
+    const float d8d8 = __half2float(tmp.x);
+    const float m8s8 = __half2float(tmp.y);
+#else
+    const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
+    const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
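For reference, every one of the templated *_impl functions above boils down to the same trick: four signed 8-bit quants are packed into one 32-bit int and __dp4a multiplies and accumulates them in a single instruction. The following standalone sketch is not part of the diff; the kernel and variable names are made up for illustration, and it assumes an sm_61 or newer GPU where __dp4a is available.

// dp4a_demo.cu - minimal illustration of the __dp4a accumulation pattern above.
// Build with: nvcc -arch=sm_61 dp4a_demo.cu
#include <cstdio>

__global__ void dot_packed_int8(const int * v, const int * u, int n, int * out) {
    int sumi = 0;
    for (int i = 0; i < n; ++i) {
        // each int holds 4 signed 8-bit values; __dp4a adds the 4 byte-wise
        // products to the accumulator in one instruction
        sumi = __dp4a(v[i], u[i], sumi);
    }
    *out = sumi;
}

int main() {
    const int n = 2;
    int hv[n] = {0x01010101, 0x01010101};   // four 1s per word
    int hu[n] = {0x02020202, 0x02020202};   // four 2s per word
    int *dv, *du, *dout, hout = 0;
    cudaMalloc(&dv, sizeof(hv)); cudaMalloc(&du, sizeof(hu)); cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(dv, hv, sizeof(hv), cudaMemcpyHostToDevice);
    cudaMemcpy(du, hu, sizeof(hu), cudaMemcpyHostToDevice);
    dot_packed_int8<<<1, 1>>>(dv, du, n, dout);
    cudaMemcpy(&hout, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("sumi = %d (expected 16)\n", hout);   // 2 words * 4 lanes * (1*2)
    cudaFree(dv); cudaFree(du); cudaFree(dout);
    return 0;
}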
...
@@ -1414,25 +1577,31 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(

     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

-    const int vi  = get_int_from_uint8(bq4_0->qs, iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_0);
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }

-    return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds);
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

     __shared__ int   tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
+    __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];

     *x_ql = tile_x_qs;
-    *x_dm = tile_x_d;
+    *x_dm = (half2 *) tile_x_d;
 }

-static __device__ __forceinline__ void load_tiles_q4_0(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -1444,14 +1613,20 @@ static __device__ __forceinline__ void load_tiles_q4_0(

     const block_q4_0 * bx0 = (block_q4_0 *) vx;

+    float * x_dmf = (float *) x_dm;
+
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx].x = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
     }

     // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
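The load_tiles_* helpers are now templates over a need_check flag and take an i_max bound: when the flag is set, the row index is clamped with min(i, i_max) so edge tiles never read past the last block row. The kernel that actually instantiates load_tiles_q4_0<true/false> is not part of the hunks shown here; the sketch below is a hypothetical caller, included only to illustrate how such a template is typically dispatched.

// Illustrative only: hypothetical dispatch between the checked and unchecked
// instantiations of a load_tiles-style template (names here are made up).
template <bool need_check>
__device__ void load_rows(const int * src, int * dst, int i_offset, int i_max) {
    for (int i0 = 0; i0 < 64; i0 += 8) {
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max);       // clamp instead of reading past the last row
        }
        dst[i0/8] = src[i];
    }
}

__global__ void consumer(const int * src, int * dst, int i_offset, int i_max, bool partial_tile) {
    if (partial_tile) {
        load_rows<true >(src, dst, i_offset, i_max);   // edge tiles: clamp indices
    } else {
        load_rows<false>(src, dst, i_offset, i_max);   // interior tiles: branch compiles away
    }
}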
...
@@ -1459,6 +1634,7 @@ static __device__ __forceinline__ void load_tiles_q4_0(
 // #pragma unroll
 //     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
+//         FIXME out-of-bounds
 //         const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
 //         if (i >= GGML_CUDA_MMQ_Y) {
...
@@ -1483,39 +1659,19 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     __builtin_assume(k < WARP_SIZE);

     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

-    return vec_dot_q4_0_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)],
-        x_dm[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q4_1_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int & vi, const int & ui0, const int & ui1, const half2 & dm4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
-    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi0, ui0, 0);
-    sumi     = __dp4a(vi1, ui1, sumi);
-
-#ifdef GGML_CUDA_F16
-    const half2 tmp = __hmul2(dm4, ds8);
-    const float d4d8 = __half2float(tmp.x);
-    const float m4s8 = __half2float(tmp.y);
-#else
-    const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
-    const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/QR4_1 to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / QR4_1);
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const float * x_dmf = (float *) x_dm;
+
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
...
@@ -1523,11 +1679,17 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(

     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

-    const int vi  = get_int_from_uint8_aligned(bq4_1->qs, iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_1);
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }

-    return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds);
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
...
@@ -1539,9 +1701,9 @@ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 **
     *x_dm = tile_x_dm;
 }

-static __device__ __forceinline__ void load_tiles_q4_1(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -1555,7 +1717,11 @@ static __device__ __forceinline__ void load_tiles_q4_1(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -1567,7 +1733,11 @@ static __device__ __forceinline__ void load_tiles_q4_1(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
-        const int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -1588,35 +1758,17 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

-    return vec_dot_q4_1_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)],
-        x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q5_0_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int & qs, const int & qh, const int & ui0, const int & ui1, const half & d5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-    vi0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-    vi0    |= (qh << 11) & 0x00001000; // 1 -> 12
-    vi0    |= (qh << 18) & 0x00100000; // 2 -> 20
-    vi0    |= (qh << 25) & 0x10000000; // 3 -> 28
-    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
-
-    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-    vi1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-    vi1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-    vi1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-    vi1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
-
-    return __half2float(d5) * (sumi * __half2float(ds8.x) - (16/QI5_0) * __half2float(ds8.y));
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+         y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
...
@@ -1624,28 +1776,33 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(

     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

-    const int qs  = get_int_from_uint8(bq5_0->qs, iqs);
-    const int qh  = get_int_from_uint8(bq5_0->qh, 0) >> (4 * iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0);
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }

-    return vec_dot_q5_0_q8_1_impl(qs, qh, ui0, ui1, bq5_0->d, bq8_1->ds);
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
-    __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
+    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
+    __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];

     *x_ql = tile_x_ql;
-    *x_qh = tile_x_qh;
-    *x_dm = tile_x_d;
+    *x_dm = (half2 *) tile_x_d;
 }

-static __device__ __forceinline__ void load_tiles_q5_0(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -1659,24 +1816,51 @@ static __device__ __forceinline__ void load_tiles_q5_0(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;

-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000; // 3 -> 28
+        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000; // 19 -> 28
+        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
     const int kbxd = k % blocks_per_tile_x_row;

+    float * x_dmf = (float *) x_dm;
+
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
-        const int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_qh[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = get_int_from_uint8(bxi->qh, 0);
-        x_dm[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd].x = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
     }
 }
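The load_tiles_q5_0 change above splices each block's high bits (qh) into the nibble lanes at bit offsets 4, 12, 20 and 28 and subtracts 16 once at load time, so the MMQ dot product can later treat Q5_0 data like a plain 8-bit format. The following host-side check of that splicing arithmetic is not part of the vendored sources; the constants are made up for illustration.

// Sanity check of the qh bit splicing: after the OR chain, byte lane b of qs0
// holds (low nibble) | (5th bit << 4), i.e. the full 5-bit quant value.
// Compile with g++ or nvcc; standalone illustration only.
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t ql = 0x0C030A05;  // nibbles 5, 10, 3, 12 in byte lanes 0..3
    const uint32_t qh = 0b1010;      // 5th bit set for lanes 1 and 3 (bits 0..3)

    uint32_t qs0 = (ql >> 0) & 0x0F0F0F0F;
    qs0 |= (qh <<  4) & 0x00000010;  // bit 0 -> bit  4 (lane 0)
    qs0 |= (qh << 11) & 0x00001000;  // bit 1 -> bit 12 (lane 1)
    qs0 |= (qh << 18) & 0x00100000;  // bit 2 -> bit 20 (lane 2)
    qs0 |= (qh << 25) & 0x10000000;  // bit 3 -> bit 28 (lane 3)

    for (int lane = 0; lane < 4; ++lane) {
        const int nibble = (ql >> (8*lane)) & 0x0F;
        const int fifth  = (qh >> lane) & 1;
        const int q5     = nibble | (fifth << 4);        // the 0..31 quant value
        assert((int)((qs0 >> (8*lane)) & 0xFF) == q5);   // lane now holds the 5-bit quant
    }
    return 0; // in the kernel, __vsubss4(qs0, 0x10101010) then subtracts 16 per lane
}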
...
@@ -1693,46 +1877,18 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;

-    return vec_dot_q5_0_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_0)), y_qs[j * (2*WARP_SIZE) + kyqs],
-        y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q5_1_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int & qs, const int & qh, const int & ui0, const int & ui1, const half2 & dm5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-    vi0    |= (qh << 11) & 0x00001000; // 1 -> 12
-    vi0    |= (qh << 18) & 0x00100000; // 2 -> 20
-    vi0    |= (qh << 25) & 0x10000000; // 3 -> 28
-    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
-
-    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-    vi1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-    vi1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-    vi1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
-
-#ifdef GGML_CUDA_F16
-    const half2 tmp = __hmul2(dm5, ds8);
-    const float d5d8 = __half2float(tmp.x);
-    const float m5s8 = __half2float(tmp.y);
-#else
-    const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
-    const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
-#endif // GGML_CUDA_F16
-
-    return sumi*d5d8 + m5s8/QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const float * x_dmf = (float *) x_dm;
+
+    int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
+    }
+
+    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2*k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
...
@@ -1740,28 +1896,33 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(

     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

-    const int qs  = get_int_from_uint8_aligned(bq5_1->qs, iqs);
-    const int qh  = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_1);
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]    = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }

-    return vec_dot_q5_1_q8_1_impl(qs, qh, ui0, ui1, bq5_1->dm, bq8_1->ds);
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
+    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
     __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];

     *x_ql = tile_x_ql;
-    *x_qh = tile_x_qh;
     *x_dm = tile_x_dm;
 }

-static __device__ __forceinline__ void load_tiles_q5_1(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -1775,11 +1936,32 @@ static __device__ __forceinline__ void load_tiles_q5_1(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;

-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000; // 3 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000; // 19 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
...
@@ -1787,11 +1969,14 @@ static __device__ __forceinline__ void load_tiles_q5_1(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
-        const int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_qh[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = get_int_from_uint8_aligned(bxi->qh, 0);
         x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
     }
 }
...
@@ -1810,24 +1995,16 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

-    return vec_dot_q5_1_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_1)), y_qs[j * (2*WARP_SIZE) + kyqs],
-        y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q8_0_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    // SIMD dot product of quantized values
-    const int sumi = __dp4a(vi, ui, 0);
-
-    return sumi * __half2float(d8_0) * __half2float(ds8_1.x);
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
+    }
+
+    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2*k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
...
@@ -1835,24 +2012,29 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(

     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

-    const int vi = get_int_from_int8(bq8_0->qs, iqs);
-    const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }

-    return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

     __shared__ int   tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
+    __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];

     *x_ql = tile_x_qs;
-    *x_dm = tile_x_d;
+    *x_dm = (half2 *) tile_x_d;
 }

-static __device__ __forceinline__ void load_tiles_q8_0(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -1861,17 +2043,22 @@ static __device__ __forceinline__ void load_tiles_q8_0(
     const int kbx  = k / QI8_0;
     const int kqsx = k % QI8_0;

+    float * x_dmf = (float *) x_dm;
+
     const block_q8_0 * bx0 = (block_q8_0 *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-        x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx].x = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
     }

     // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
...
@@ -1879,6 +2066,7 @@ static __device__ __forceinline__ void load_tiles_q8_0(
 // #pragma unroll
 //     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
+//         FIXME out-of-bounds
 //         const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
 // #if GGML_CUDA_MMQ_Y < 64
...
@@ -1904,9 +2092,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

-    return vec_dot_q8_0_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], y_qs[j*WARP_SIZE + k], x_dm[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0].x,
-        y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+    const float * x_dmf = (float *) x_dm;
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j*WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 }

 #define VDR_q2_K_q8_1 1
...
@@ -1973,9 +2163,9 @@ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q2_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -1989,7 +2179,11 @@ static __device__ __forceinline__ void load_tiles_q2_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2001,7 +2195,11 @@ static __device__ __forceinline__ void load_tiles_q2_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
-        const int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2010,7 +2208,11 @@ static __device__ __forceinline__ void load_tiles_q2_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
-        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
@@ -2125,9 +2327,9 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
...
@@ -2125,9 +2327,9 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
*
x_sc
=
tile_x_sc
;
*
x_sc
=
tile_x_sc
;
}
}
static
__device__
__forceinline__
void
load_tiles_q3_K
(
template
<
bool
need_check
>
static
__device__
__forceinline__
void
load_tiles_q3_K
(
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
const
void
*
__restrict__
vx
,
int
*
__restrict__
x_ql
,
half2
*
__restrict__
x_dm
,
int
*
__restrict__
x_qh
,
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
int
*
__restrict__
x_sc
,
const
int
&
i_offset
,
const
int
&
i_max
,
const
int
&
k
,
const
int
&
blocks_per_row
)
{
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
>=
0
);
__builtin_assume
(
i_offset
<
8
);
__builtin_assume
(
i_offset
<
8
);
...
@@ -2141,7 +2343,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2153,7 +2359,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
-        const int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2162,7 +2372,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
-        const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
@@ -2171,7 +2385,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(
...
@@ -2171,7 +2385,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(
#pragma unroll
#pragma unroll
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
4
)
{
for
(
int
i0
=
0
;
i0
<
GGML_CUDA_MMQ_Y
;
i0
+=
8
*
4
)
{
const
int
i
=
i0
+
i_offset
*
4
+
k
/
(
WARP_SIZE
/
4
);
int
i
=
i0
+
i_offset
*
4
+
k
/
(
WARP_SIZE
/
4
);
if
(
need_check
)
{
i
=
min
(
i
,
i_max
);
}
const
block_q3_K
*
bxi
=
bx0
+
i
*
blocks_per_row
+
(
k
%
(
WARP_SIZE
/
4
))
/
(
QI3_K
/
4
);
const
block_q3_K
*
bxi
=
bx0
+
i
*
blocks_per_row
+
(
k
%
(
WARP_SIZE
/
4
))
/
(
QI3_K
/
4
);
...
@@ -2252,15 +2470,15 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     int    u[2*QR4_K];
     float d8[QR4_K];

-    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

     // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
     // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
     // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
     // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
     v[0] = q4[0];
     v[1] = q4[4];
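The q4_K and q5_K changes above switch iqs from an index in 0..15 to an even-valued index in 0,2,...,30 (as the new comment notes) and divide it by 2 wherever it is used, so the selected offsets are unchanged. A quick check of that equivalence, not part of the diff, assuming QR4_K == 2 and QI8_1 == 8 as defined in ggml-cuda.cu:

// Verifies that (new_iqs/2) in the new formulas selects the same offsets as old_iqs.
#include <cassert>

int main() {
    const int QR4_K = 2, QI8_1 = 8;                            // values from ggml-cuda.cu
    for (int old_iqs = 0; old_iqs < 16; ++old_iqs) {
        const int new_iqs = 2 * old_iqs;                       // callers now pass even indices
        const int off_old = QR4_K * (old_iqs     / (QI8_1/2)); // bq8_offset, old formula
        const int off_new = QR4_K * ((new_iqs/2) / (QI8_1/2)); // bq8_offset, new formula
        assert(off_old == off_new);
        assert(4 * (old_iqs % 4) == 4 * ((new_iqs/2) % 4));    // q4/q8 pointer offset unchanged
    }
    return 0;
}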
...
@@ -2281,7 +2499,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = bq8i->ds.x;

-        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
...
@@ -2309,12 +2527,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float d8_1 = bq8_1[0].ds.x;
     const float d8_2 = bq8_1[1].ds.x;

-    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
-    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
-    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

-    const int * q4 = (const int *)bq4_K->qs + iqs;
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
     const int v1 = q4[0];
     const int v2 = q4[4];
...
@@ -2346,9 +2564,9 @@ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q4_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset < 8);
...
@@ -2362,7 +2580,11 @@ static __device__ __forceinline__ void load_tiles_q4_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2374,7 +2596,11 @@ static __device__ __forceinline__ void load_tiles_q4_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
-        const int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2383,7 +2609,11 @@ static __device__ __forceinline__ void load_tiles_q4_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
...
@@ -2409,11 +2639,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     int    u[2*QR4_K];
     float d8[QR4_K];

-    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * (kqsx / (QI8_1/2));
+    // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));

-    v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0];
-    v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4];
+    v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
+    v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];

     const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];

     uint16_t aux[2];
...
@@ -2429,7 +2659,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const uint8_t * m  = sc + 2;

     for (int l = 0; l < QR4_K; ++l) {
-        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2);
+        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
         u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
         u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
         d8[l] = y_ds[kqsy / QI8_1].x;
...
@@ -2484,9 +2714,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     int    u[2*QR5_K];
     float d8[QR5_K];

-    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

     vl[0] = ql[0];
     vl[1] = ql[4];
...
@@ -2511,7 +2741,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
...
@@ -2511,7 +2741,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const
block_q8_1
*
bq8i
=
bq8_1
+
bq8_offset
+
i
;
const
block_q8_1
*
bq8i
=
bq8_1
+
bq8_offset
+
i
;
d8
[
i
]
=
bq8i
->
ds
.
x
;
d8
[
i
]
=
bq8i
->
ds
.
x
;
const
int
*
q8
=
(
const
int
*
)
bq8i
->
qs
+
(
iqs
%
4
);
const
int
*
q8
=
(
const
int
*
)
bq8i
->
qs
+
(
(
iqs
/
2
)
%
4
);
u
[
2
*
i
+
0
]
=
q8
[
0
];
u
[
2
*
i
+
0
]
=
q8
[
0
];
u
[
2
*
i
+
1
]
=
q8
[
4
];
u
[
2
*
i
+
1
]
=
q8
[
4
];
}
}
...
@@ -2530,17 +2760,17 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
...
@@ -2530,17 +2760,17 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const
float
d8_1
=
bq8_1
[
0
].
ds
.
x
;
const
float
d8_1
=
bq8_1
[
0
].
ds
.
x
;
const
float
d8_2
=
bq8_1
[
1
].
ds
.
x
;
const
float
d8_2
=
bq8_1
[
1
].
ds
.
x
;
const
int
ui1
=
*
((
const
int
*
)
bq8_1
[
0
].
qs
+
iqs
);
const
int
ui1
=
*
((
const
int
*
)
bq8_1
[
0
].
qs
+
(
iqs
/
2
)
);
const
int
ui2
=
*
((
const
int
*
)
bq8_1
[
0
].
qs
+
iqs
+
4
);
const
int
ui2
=
*
((
const
int
*
)
bq8_1
[
0
].
qs
+
(
iqs
/
2
)
+
4
);
const
int
ui3
=
*
((
const
int
*
)
bq8_1
[
1
].
qs
+
iqs
);
const
int
ui3
=
*
((
const
int
*
)
bq8_1
[
1
].
qs
+
(
iqs
/
2
)
);
const
int
ui4
=
*
((
const
int
*
)
bq8_1
[
1
].
qs
+
iqs
+
4
);
const
int
ui4
=
*
((
const
int
*
)
bq8_1
[
1
].
qs
+
(
iqs
/
2
)
+
4
);
const
int
*
ql
=
(
const
int
*
)
bq5_K
->
qs
+
iqs
;
const
int
*
ql
=
(
const
int
*
)
bq5_K
->
qs
+
(
iqs
/
2
)
;
const
int
vl1
=
ql
[
0
];
const
int
vl1
=
ql
[
0
];
const
int
vl2
=
ql
[
4
];
const
int
vl2
=
ql
[
4
];
const
int
step
=
4
*
iqs
;
// 0, 4, 8, 12
const
int
step
=
4
*
(
iqs
/
2
)
;
// 0, 4, 8, 12
const
int
im
=
step
/
8
;
// = 0 for iqs = 0,
1
, = 1 for iqs =
2
,
3
const
int
im
=
step
/
8
;
// = 0 for iqs = 0,
2
, = 1 for iqs =
4
,
6
const
int
in
=
step
%
8
;
// 0, 4, 0, 4
const
int
in
=
step
%
8
;
// 0, 4, 0, 4
const
int
vh
=
(
*
((
const
int
*
)(
bq5_K
->
qh
+
in
)))
>>
im
;
const
int
vh
=
(
*
((
const
int
*
)(
bq5_K
->
qh
+
in
)))
>>
im
;
...
@@ -2574,9 +2804,9 @@ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 **

     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q5_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);

...
@@ -2590,7 +2820,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;

...
@@ -2602,7 +2836,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8*QI5_K) {
-        const int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

...
@@ -2611,7 +2849,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8*4) {
-        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);

...
@@ -2620,7 +2862,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8*8) {
-        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

...
@@ -2647,13 +2893,13 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     int   u[2*QR4_K];
     float d8[QR4_K];

-    const int bq8_offset = QR5_K * (kqsx / (QI8_1/2));
+    const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));

-    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4*bq8_offset + kqsx % 4 + 0];
+    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4*bq8_offset + (kqsx/2) % 4 + 0];
-    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4*bq8_offset + kqsx % 4 + 4];
+    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4*bq8_offset + (kqsx/2) % 4 + 4];

-    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 0] >> bq8_offset;
+    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
-    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 4] >> bq8_offset;
+    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;

     const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx*4];
     uint16_t aux[2];

...
@@ -2669,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     const uint8_t * m = sc + 2;

     for (int l = 0; l < QR5_K; ++l) {
-        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2);
+        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
         u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
         u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
         d8[l] = y_ds[kqsy / QI8_1].x;
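Note on the hunks above: each load_tiles_q*_K loader gains a template <bool need_check> parameter and an i_max argument, and clamps the tile row with min(i, i_max) before dereferencing bxi, so the last, partially filled tile re-reads the final valid row instead of reading past the end of the weight buffer. The following is a minimal sketch of that clamp pattern in isolation, not the actual ggml loader; TILE_ROWS, load_tile_rows and the float layout are hypothetical stand-ins.

// Sketch only: clamp-to-last-row bounds handling in the style of the load_tiles_* loaders.
#define TILE_ROWS 64

template <bool need_check>
__device__ __forceinline__ void load_tile_rows(const float * src, float * tile,
                                               int row0, int i_max, int ncols) {
    for (int i0 = 0; i0 < TILE_ROWS; i0 += (int) blockDim.y) {
        int i = i0 + threadIdx.y;        // row within the tile

        if (need_check) {
            i = min(i, i_max);           // last tile: clamp to the final valid row
        }

        for (int k = threadIdx.x; k < ncols; k += blockDim.x) {
            // rows past i_max become re-reads of row i_max; their results are never consumed
            tile[i*ncols + k] = src[(row0 + i)*ncols + k];
        }
    }
}

When need_check is false the compiler removes the branch entirely, which is why the callers below select the specialization at launch time.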
...
@@ -2743,9 +2989,9 @@ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 **

     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q6_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);

...
@@ -2759,7 +3005,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;

...
@@ -2771,7 +3021,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8*QI6_K) {
-        const int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;

...
@@ -2780,7 +3034,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8*2) {
-        const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);

...
@@ -2789,7 +3047,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8*8) {
-        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
...
@@ -2875,7 +3137,7 @@ static __global__ void mul_mat_q(

     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   tid_y, tid_x, blocks_per_row_x);
+                   tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);

         for (int ir = 0; ir < qr; ++ir) {
             const int kqs = ir*WARP_SIZE + tid_x;

...
@@ -2899,10 +3161,10 @@ static __global__ void mul_mat_q(

             __syncthreads();

-#if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards
+#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
 #pragma unroll
 #endif // __CUDA_ARCH__ >= 700
-            for (int k = 0; k < WARP_SIZE/vdr; ++k) {
+            for (int k = 0; k < WARP_SIZE; k += vdr) {
 #pragma unroll
                 for (int j = 0; j < WARP_SIZE; j += 8) {
 #pragma unroll
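Two things change in mul_mat_q here: the tile loader now receives nrows_x-row_x_0-1 as the clamp limit (the last valid row expressed relative to the tile origin), and the inner dot-product loop steps k by vdr across the full WARP_SIZE range instead of counting WARP_SIZE/vdr iterations, so k is already scaled for indexing quant ints. A small host-side illustration of how the clamp limit relates to the tile origin follows; nrows_x, tile_rows and the printed values are made up for the example.

// Illustrative only: the i_max value handed to load_tiles() for each tile of rows.
#include <algorithm>
#include <cstdio>

int main() {
    const int nrows_x   = 100;   // hypothetical row count, not a multiple of the tile height
    const int tile_rows = 64;    // stand-in for GGML_CUDA_MMQ_Y

    for (int row_x_0 = 0; row_x_0 < nrows_x; row_x_0 += tile_rows) {
        const int i_max = nrows_x - row_x_0 - 1;                 // value passed to load_tiles()
        const int last  = row_x_0 + std::min(tile_rows - 1, i_max);
        printf("tile at row %3d: i_max = %2d, last row read = %3d\n", row_x_0, i_max, last);
    }
    return 0;
}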
...
@@ -2954,9 +3216,9 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *

     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index

-        const int iby = (i + threadIdx.x / (qi/vdr)) * qk/QK8_1; // y block index that aligns with ibx
+        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx

-        const int iqs = threadIdx.x % (qi/vdr); // x block quant index when casting the quants to int
+        const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int

         tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
     }
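In mul_mat_vec_q each lane now covers vdr consecutive quant ints: iqs is pre-multiplied by vdr, and the y-block scaling (qk/QK8_1) is parenthesized so integer division happens before the multiply. A host-side sketch of the resulting lane-to-index mapping follows; qi, qk and vdr are stand-ins for the per-quantization constants (e.g. QI4_0, QK4_0), and the values are illustrative only.

// Illustrative index mapping for a few warp lanes after the change.
#include <cstdio>

int main() {
    const int qi = 8, qk = 32, QK8_1 = 32, vdr = 2;

    for (int lane = 0; lane < 8; ++lane) {
        const int block_off = lane / (qi/vdr);          // which x block this lane works on
        const int iby       = block_off * (qk/QK8_1);   // matching y block offset
        const int iqs       = vdr * (lane % (qi/vdr));  // first quant int handled by the lane
        printf("lane %d -> x block +%d, y block +%d, iqs %d\n", lane, block_off, iby, iqs);
    }
    return 0;
}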
...
@@ -3499,7 +3761,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *

     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_q4_0_q8_1, vec_dot_q4_0_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

...
@@ -3508,7 +3770,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *

     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_q4_1_q8_1, vec_dot_q4_1_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

...
@@ -3517,7 +3779,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *

     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_q5_0_q8_1, vec_dot_q5_0_q8_1>
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

...
@@ -3526,7 +3788,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *

     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_q5_1_q8_1, vec_dot_q5_1_q8_1>
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

...
@@ -3535,7 +3797,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *

     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_q8_0_q8_1, vec_dot_q8_0_q8_1>
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
...
@@ -3635,8 +3897,14 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0, VDR_q4_0_q8_1, vec_dot_q4_0_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q4_1_q8_1_cuda(

...
@@ -3647,8 +3915,14 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1, VDR_q4_1_q8_1, vec_dot_q4_1_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q5_0_q8_1_cuda(

...
@@ -3659,8 +3933,14 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0, VDR_q5_0_q8_1, vec_dot_q5_0_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q5_1_q8_1_cuda(

...
@@ -3671,8 +3951,14 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1, VDR_q5_1_q8_1, vec_dot_q5_1_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q8_0_q8_1_cuda(

...
@@ -3683,8 +3969,14 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0, VDR_q8_0_q8_1, vec_dot_q8_0_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q2_K_q8_1_cuda(
...
@@ -3695,8 +3987,14 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q3_K_q8_1_cuda(

...
@@ -3707,8 +4005,14 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q4_K_q8_1_cuda(

...
@@ -3719,8 +4023,14 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q5_K_q8_1_cuda(

...
@@ -3731,8 +4041,14 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q6_K_q8_1_cuda(

...
@@ -3743,8 +4059,14 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(

     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_p021_f16_f32_cuda(
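Every ggml_mul_mat_q*_q8_1_cuda launcher above now selects between the load_tiles_*<false> and load_tiles_*<true> instantiations at runtime: bounds checking is only compiled into the kernel when nrows_x is not a multiple of GGML_CUDA_MMQ_Y, so the common aligned case pays nothing for it. A reduced sketch of the same dispatch shape follows; my_kernel, launch_my_kernel and TILE_Y are hypothetical stand-ins, not the ggml kernels themselves.

// Sketch of the dispatch pattern: pick the bounds-checked specialization only when needed.
#define TILE_Y 64

template <bool need_check>
__global__ void my_kernel(const float * x, float * dst, int nrows) {
    // kernel body elided; need_check would guard row-index clamping as in the hunks above
}

static void launch_my_kernel(const float * x, float * dst, int nrows, cudaStream_t stream) {
    const dim3 block_nums((nrows + TILE_Y - 1) / TILE_Y);
    const dim3 block_dims(32, TILE_Y/8, 1);

    if (nrows % TILE_Y == 0) {
        // fully aligned: no per-row bounds checks inside the kernel
        my_kernel<false><<<block_nums, block_dims, 0, stream>>>(x, dst, nrows);
    } else {
        // ragged last tile: clamp row indices inside the kernel
        my_kernel<true><<<block_nums, block_dims, 0, stream>>>(x, dst, nrows);
    }
}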
...
@@ -4690,8 +5012,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
             row_low -= row_low % GGML_CUDA_MMQ_Y;
-            row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % GGML_CUDA_MMQ_Y;
+            }
         } else {
             row_low  = 0;
             row_high = nrows0*i02_divisor;

...
@@ -5171,8 +5497,12 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {

         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
         row_low -= row_low % GGML_CUDA_MMQ_Y;
-        row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
-        row_high -= row_high % GGML_CUDA_MMQ_Y;
+
+        if (id == g_device_count - 1) {
+            row_high = nrows;
+        } else {
+            row_high = nrows*g_tensor_split[id + 1];
+            row_high -= row_high % GGML_CUDA_MMQ_Y;
+        }
     } else {
         GGML_ASSERT(false);
     }

...
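The multi-GPU row split in ggml_cuda_op and ggml_cuda_transform_tensor still rounds each device's starting row down to a multiple of GGML_CUDA_MMQ_Y, but the last device now takes nrows exactly instead of a rounded-down value, so trailing rows are no longer dropped (the bounds-checked kernels handle the ragged remainder). A small host-side sketch of the boundary computation follows; the split fractions, row count and tile size are made up for illustration.

// Illustrative recomputation of per-device row boundaries after the change.
#include <cstdio>

int main() {
    const int   mmq_y = 64;                              // stand-in for GGML_CUDA_MMQ_Y
    const int   nrows = 1000;
    const float tensor_split[3] = {0.0f, 0.4f, 0.75f};   // hypothetical split points
    const int   device_count = 3;

    for (int id = 0; id < device_count; ++id) {
        int row_low = id == 0 ? 0 : (int)(nrows*tensor_split[id]);
        row_low -= row_low % mmq_y;                      // keep tile starts aligned

        int row_high;
        if (id == device_count - 1) {
            row_high = nrows;                            // last device keeps every remaining row
        } else {
            row_high = (int)(nrows*tensor_split[id + 1]);
            row_high -= row_high % mmq_y;
        }
        printf("device %d: rows [%d, %d)\n", id, row_low, row_high);
    }
    return 0;
}

Because consecutive devices apply the same rounding to the shared boundary, the ranges stay contiguous and only the final range can be a partial tile.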
llama/ggml-cuda.h View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-metal.h View file @ 85aeb428

 //go:build darwin
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-metal.m View file @ 85aeb428

 //go:build darwin
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-metal.metal View file @ 85aeb428

 //go:build darwin
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-mpi.c View file @ 85aeb428

 //go:build mpi
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-mpi.h View file @ 85aeb428

 //go:build mpi
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-opencl.cpp View file @ 85aeb428

 //go:build opencl
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-opencl.h View file @ 85aeb428

 //go:build opencl
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml.c View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml.h View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/k_quants.c View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/k_quants.h View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/llama-util.h View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/llama.cpp View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/llama.go View file @ 85aeb428

...
@@ -128,11 +128,6 @@ func New(model string, opts api.Options) (*LLM, error) {

 	C.llama_backend_init(C.bool(llm.UseNUMA))

-	// TODO: GQA == 8 suggests 70B model which doesn't support metal
-	if llm.NumGQA == 8 {
-		llm.NumGPU = 0
-	}

 	params := C.llama_context_default_params()
 	params.seed = C.uint(llm.Seed)
 	params.n_ctx = C.int(llm.NumCtx)
...
llama/llama.h View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...