orangecat / ollama · Commits · 85aeb428
Unverified commit 85aeb428, authored Aug 03, 2023 by Michael Yang, committed by GitHub on Aug 03, 2023

Merge pull request #270 from jmorganca/update-llama-cpp

update llama.cpp

Parents: f0b365a4, c5bcf328

Showing 19 changed files with 623 additions and 298 deletions (+623 -298)
llama/ggml-alloc.c      +1   -1
llama/ggml-alloc.h      +1   -1
llama/ggml-cuda.cu      +606 -276
llama/ggml-cuda.h       +1   -1
llama/ggml-metal.h      +1   -1
llama/ggml-metal.m      +1   -1
llama/ggml-metal.metal  +1   -1
llama/ggml-mpi.c        +1   -1
llama/ggml-mpi.h        +1   -1
llama/ggml-opencl.cpp   +1   -1
llama/ggml-opencl.h     +1   -1
llama/ggml.c            +1   -1
llama/ggml.h            +1   -1
llama/k_quants.c        +1   -1
llama/k_quants.h        +1   -1
llama/llama-util.h      +1   -1
llama/llama.cpp         +1   -1
llama/llama.go          +0   -5
llama/llama.h           +1   -1
llama/ggml-alloc.c  View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-alloc.h  View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
llama/ggml-cuda.cu  View file @ 85aeb428

 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
  *
  * MIT License
  *
...
@@ -188,7 +188,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 typedef void (*load_tiles_cuda_t)(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row);
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
 typedef float (*vec_dot_q_mul_mat_cuda_t)(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
...
...
@@ -1388,22 +1388,185 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
 }

 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

-#define VDR_q4_0_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int & vi, const int & ui0, const int & ui1, const half & d4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    // subtract 8 from each quantized value
-    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
-    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi0, ui0, 0);
-    sumi     = __dp4a(vi1, ui1, sumi);
-
-    return __half2float(d4) * (sumi * __half2float(ds8.x) - (8/QI4_0) * __half2float(ds8.y));
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
+    const int * v, const int * u, const float & d4, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const half2 tmp = __hmul2(dm4, ds8);
+    const float d4d8 = __half2float(tmp.x);
+    const float m4s8 = __half2float(tmp.y);
+#else
+    const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
+    const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * __half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const half2 tmp = __hmul2(dm5, ds8);
+    const float d5d8 = __half2float(tmp.x);
+    const float m5s8 = __half2float(tmp.y);
+#else
+    const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
+    const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi * d5d8 + m5s8 / (QI5_1 / vdr);
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
+    const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+    return sumi * d8_0 * __half2float(ds8_1.x);
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const half2 tmp = __hmul2(dm8, ds8);
+    const float d8d8 = __half2float(tmp.x);
+    const float m8s8 = __half2float(tmp.y);
+#else
+    const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
+    const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi * d8d8 + m8s8 / (QI8_1 / vdr);
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
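Note: the templated vec_dot_*_impl helpers above all lean on the same primitive, __dp4a, which multiplies the four signed bytes packed in each 32-bit operand and adds the results to an accumulator. A rough host-side reference of that intrinsic, illustrative only and not part of this commit (dp4a_ref is a hypothetical name):

```cpp
#include <cstdint>

// Host-side emulation of CUDA's __dp4a(a, b, c): per-byte signed products, summed into c.
int dp4a_ref(int a, int b, int c) {
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)(a >> (8 * i));
        const int8_t bi = (int8_t)(b >> (8 * i));
        c += ai * bi;
    }
    return c;
}
```

The nibble masks `(v[i] >> 0) & 0x0F0F0F0F` and `(v[i] >> 4) & 0x0F0F0F0F` simply split the packed 4-bit quants into two such byte-packed operands before handing them to the intrinsic.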
...
...
@@ -1414,25 +1577,31 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(

     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

-    const int vi  = get_int_from_uint8(bq4_0->qs, iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_0);
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }

-    return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds);
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

     __shared__ int   tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_d [GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
+    __shared__ float tile_x_d [GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];

     *x_ql = tile_x_qs;
-    *x_dm = tile_x_d;
+    *x_dm = (half2 *) tile_x_d;
 }

-static __device__ __forceinline__ void load_tiles_q4_0(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -1444,14 +1613,20 @@ static __device__ __forceinline__ void load_tiles_q4_0(

     const block_q4_0 * bx0 = (block_q4_0 *) vx;

+    float * x_dmf = (float *) x_dm;
+
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx].x = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
     }

 //     const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
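Note: the new `need_check` template parameter and `i_max` argument exist so that the last tile of a matrix whose row count is not a multiple of GGML_CUDA_MMQ_Y does not read past the end of the tensor. A reduced sketch of the pattern with hypothetical names (TILE_ROWS stands in for GGML_CUDA_MMQ_Y); out-of-range rows are clamped to the last valid row, presumably because results for the duplicated rows are never written back:

```cpp
#define TILE_ROWS 64

template <bool need_check>
__device__ void load_tile_rows(const float * src, float * tile, int row_stride,
                               int i_offset, int i_max) {
    for (int i0 = 0; i0 < TILE_ROWS; i0 += 8) {
        int i = i0 + i_offset;     // each of 8 cooperating warps takes every 8th row
        if (need_check) {
            i = min(i, i_max);     // clamp instead of branching around the load
        }
        tile[i0 + i_offset] = src[i * row_stride];
    }
}
```

When `need_check` is false the `min` disappears at compile time, so tensors with a tile-aligned row count pay nothing for the bounds handling.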
...
...
@@ -1459,6 +1634,7 @@ static __device__ __forceinline__ void load_tiles_q4_0(
// #pragma unroll
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
// FIXME out-of-bounds
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
// if (i >= GGML_CUDA_MMQ_Y) {
...
...
@@ -1483,39 +1659,19 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     __builtin_assume(k < WARP_SIZE);

     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const float * x_dmf = (float *) x_dm;

-    return vec_dot_q4_0_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)],
-        x_dm[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q4_1_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int & vi, const int & ui0, const int & ui1, const half2 & dm4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
-    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi0, ui0, 0);
-    sumi     = __dp4a(vi1, ui1, sumi);
-
-#ifdef GGML_CUDA_F16
-    const half2 tmp = __hmul2(dm4, ds8);
-    const float d4d8 = __half2float(tmp.x);
-    const float m4s8 = __half2float(tmp.y);
-#else
-    const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
-    const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/QR4_1 to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / QR4_1);
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
...
@@ -1523,11 +1679,17 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(

     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

-    const int vi  = get_int_from_uint8_aligned(bq4_1->qs, iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_1);
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }

-    return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds);
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
...
@@ -1539,9 +1701,9 @@ static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 **
     *x_dm = tile_x_dm;
 }

-static __device__ __forceinline__ void load_tiles_q4_1(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -1555,7 +1717,11 @@ static __device__ __forceinline__ void load_tiles_q4_1(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -1567,7 +1733,11 @@ static __device__ __forceinline__ void load_tiles_q4_1(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
-        const int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -1588,35 +1758,17 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(

     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

-    return vec_dot_q4_1_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)],
-        x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q5_0_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int & qs, const int & qh, const int & ui0, const int & ui1, const half & d5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-    vi0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-    vi0    |= (qh << 11) & 0x00001000; // 1 -> 12
-    vi0    |= (qh << 18) & 0x00100000; // 2 -> 20
-    vi0    |= (qh << 25) & 0x10000000; // 3 -> 28
-    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
-
-    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-    vi1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-    vi1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-    vi1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-    vi1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
-
-    return __half2float(d5) * (sumi*__half2float(ds8.x) - (16/QI5_0) * __half2float(ds8.y));
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+         y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
...
@@ -1624,28 +1776,33 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(

     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

-    const int qs  = get_int_from_uint8(bq5_0->qs, iqs);
-    const int qh  = get_int_from_uint8(bq5_0->qh, 0) >> (4 * iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0);
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];

-    return vec_dot_q5_0_q8_1_impl(qs, qh, ui0, ui1, bq5_0->d, bq8_1->ds);
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
-    __shared__ half2 tile_x_d [GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
+    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE)     + GGML_CUDA_MMQ_Y];
+    __shared__ float tile_x_d [GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];

     *x_ql = tile_x_ql;
-    *x_qh = tile_x_qh;
-    *x_dm = tile_x_d;
+    *x_dm = (half2 *) tile_x_d;
 }

-static __device__ __forceinline__ void load_tiles_q5_0(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -1659,24 +1816,51 @@ static __device__ __forceinline__ void load_tiles_q5_0(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;

-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
+        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
+        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
     const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
-        const int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_qh[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd]   = get_int_from_uint8(bxi->qh, 0);
-        x_dm[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd].x = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
     }
 }
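Note: the shift-and-mask ladder above reassembles 5-bit quants in place: each of the four bytes in the packed nibble word gets its missing high bit inserted at bit 4 of that byte, i.e. at bit positions 4, 12, 20 and 28 of the 32-bit word. A standalone scalar sketch of the same scatter (illustrative only; scatter_q5_low is not a function in this file):

```cpp
// ql packs eight 4-bit quants (two per byte); qh carries one extra high bit per quant.
// For the low nibbles, qh bits 0..3 must land in bit 4 of bytes 0..3.
static inline int scatter_q5_low(int ql, int qh) {
    int qs0 = (ql >>  0) & 0x0F0F0F0F;
    qs0 |= (qh <<  4) & 0x00000010; // qh bit 0 -> bit  4
    qs0 |= (qh << 11) & 0x00001000; // qh bit 1 -> bit 12
    qs0 |= (qh << 18) & 0x00100000; // qh bit 2 -> bit 20
    qs0 |= (qh << 25) & 0x10000000; // qh bit 3 -> bit 28
    return qs0;
}
```

Doing this once at tile-load time lets the q5_0 path reuse the plain byte-wise q8_0 dot product (`vec_dot_q8_0_q8_1_impl`) in the matrix-matrix kernel instead of repeating the bit surgery per dot product.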
...
...
@@ -1693,46 +1877,18 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(

     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
+    const float * x_dmf = (float *) x_dm;

-    return vec_dot_q5_0_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_0)), y_qs[j * (2*WARP_SIZE) + kyqs],
-        y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q5_1_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int & qs, const int & qh, const int & ui0, const int & ui1, const half2 & dm5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-    vi0    |= (qh << 11) & 0x00001000; // 1 -> 12
-    vi0    |= (qh << 18) & 0x00100000; // 2 -> 20
-    vi0    |= (qh << 25) & 0x10000000; // 3 -> 28
-    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
-
-    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-    vi1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-    vi1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-    vi1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
-
-#ifdef GGML_CUDA_F16
-    const half2 tmp = __hmul2(dm5, ds8);
-    const float d5d8 = __half2float(tmp.x);
-    const float m5s8 = __half2float(tmp.y);
-#else
-    const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
-    const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
-#endif // GGML_CUDA_F16
-
-    return sumi*d5d8 + m5s8/QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
+    }
+
+    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2*k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
...
@@ -1740,28 +1896,33 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(

     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

-    const int qs  = get_int_from_uint8_aligned(bq5_1->qs, iqs);
-    const int qh  = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * iqs);
-    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
-    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_1);
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];

-    return vec_dot_q5_1_q8_1_impl(qs, qh, ui0, ui1, bq5_1->dm, bq8_1->ds);
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]    = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
+    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE)     + GGML_CUDA_MMQ_Y];
     __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];

     *x_ql = tile_x_ql;
-    *x_qh = tile_x_qh;
     *x_dm = tile_x_dm;
 }

-static __device__ __forceinline__ void load_tiles_q5_1(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -1775,11 +1936,32 @@ static __device__ __forceinline__ void load_tiles_q5_1(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;

-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+        int qs0 = (ql >>  0) & 0x0F0F0F0F;
+        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4) & 0x0F0F0F0F;
+        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
...
@@ -1787,11 +1969,14 @@ static __device__ __forceinline__ void load_tiles_q5_1(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
-        const int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_qh[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = get_int_from_uint8_aligned(bxi->qh, 0);
         x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
     }
 }
...
@@ -1810,24 +1995,16 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(

     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

-    return vec_dot_q5_1_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], x_qh[index_bx] >> (4 * (k % QI5_1)), y_qs[j * (2*WARP_SIZE) + kyqs],
-        y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
-}
-
-#define VDR_q8_0_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    // SIMD dot product of quantized values
-    const int sumi = __dp4a(vi, ui, 0);
-
-    return sumi * __half2float(d8_0) * __half2float(ds8_1.x);
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
+        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
+    }
+
+    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2*k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
...
@@ -1835,24 +2012,29 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(

     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

-    const int vi = get_int_from_int8(bq8_0->qs, iqs);
-    const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];

-    return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds);
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
 }

 static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

     __shared__ int   tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_d [GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
+    __shared__ float tile_x_d [GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];

     *x_ql = tile_x_qs;
-    *x_dm = tile_x_d;
+    *x_dm = (half2 *) tile_x_d;
 }

-static __device__ __forceinline__ void load_tiles_q8_0(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -1861,17 +2043,22 @@ static __device__ __forceinline__ void load_tiles_q8_0(

     const int kbx  = k / QI8_0;
     const int kqsx = k % QI8_0;
+    float * x_dmf = (float *) x_dm;

     const block_q8_0 * bx0 = (block_q8_0 *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-        x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx].x = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
     }

 //     const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
...
@@ -1879,6 +2066,7 @@ static __device__ __forceinline__ void load_tiles_q8_0(
// #pragma unroll
// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
// FIXME out-of-bounds
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
// #if GGML_CUDA_MMQ_Y < 64
...
...
@@ -1904,9 +2092,11 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     __builtin_assume(k >= 0);
     __builtin_assume(k <  WARP_SIZE);

-    return vec_dot_q8_0_q8_1_impl(
-        x_ql[i * (WARP_SIZE + 1) + k], y_qs[j*WARP_SIZE + k], x_dm[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0].x,
-        y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+    const float * x_dmf = (float *) x_dm;
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j*WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 }

 #define VDR_q2_K_q8_1 1
...
@@ -1973,9 +2163,9 @@ static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q2_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -1989,7 +2179,11 @@ static __device__ __forceinline__ void load_tiles_q2_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2001,7 +2195,11 @@ static __device__ __forceinline__ void load_tiles_q2_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
-        const int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2010,7 +2208,11 @@ static __device__ __forceinline__ void load_tiles_q2_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
-        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
...
@@ -2125,9 +2327,9 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q3_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -2141,7 +2343,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2153,7 +2359,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
-        const int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2162,7 +2372,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
-        const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
...
@@ -2171,7 +2385,11 @@ static __device__ __forceinline__ void load_tiles_q3_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
-        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
...
@@ -2252,15 +2470,15 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     int   u[2*QR4_K];
     float d8[QR4_K];

-    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

     // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
     // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
     // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
     // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
     v[0] = q4[0];
     v[1] = q4[4];
...
@@ -2281,7 +2499,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = bq8i->ds.x;

-        const int * q8 = (const int *) bq8i->qs + (iqs%4);
+        const int * q8 = (const int *) bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
...
@@ -2309,12 +2527,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float d8_1 = bq8_1[0].ds.x;
     const float d8_2 = bq8_1[1].ds.x;

-    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
-    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
-    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

-    const int * q4 = (const int *)bq4_K->qs + iqs;
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
     const int v1 = q4[0];
     const int v2 = q4[4];
...
@@ -2346,9 +2564,9 @@ static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q4_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -2362,7 +2580,11 @@ static __device__ __forceinline__ void load_tiles_q4_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2374,7 +2596,11 @@ static __device__ __forceinline__ void load_tiles_q4_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
-        const int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2383,7 +2609,11 @@ static __device__ __forceinline__ void load_tiles_q4_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
...
@@ -2409,11 +2639,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     int   u[2*QR4_K];
     float d8[QR4_K];

-    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * (kqsx / (QI8_1/2));
+    // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));

-    v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0];
-    v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4];
+    v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
+    v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];

     const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx*4];
     uint16_t aux[2];
...
@@ -2429,7 +2659,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const uint8_t * m  = sc + 2;

     for (int l = 0; l < QR4_K; ++l) {
-        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2);
+        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
         u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
         u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
         d8[l] = y_ds[kqsy / QI8_1].x;
...
@@ -2484,9 +2714,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     int   u[2*QR5_K];
     float d8[QR5_K];

-    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

     vl[0] = ql[0];
     vl[1] = ql[4];
...
@@ -2511,7 +2741,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = bq8i->ds.x;

-        const int * q8 = (const int *) bq8i->qs + (iqs%4);
+        const int * q8 = (const int *) bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
...
@@ -2530,17 +2760,17 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     const float d8_1 = bq8_1[0].ds.x;
     const float d8_2 = bq8_1[1].ds.x;

-    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
-    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
-    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

-    const int * ql = (const int *)bq5_K->qs + iqs;
+    const int * ql = (const int *)bq5_K->qs + (iqs/2);
     const int vl1 = ql[0];
     const int vl2 = ql[4];

-    const int step = 4 * iqs; // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+    const int step = 4 * (iqs/2); // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
     const int in = step%8; // 0, 4, 0, 4

     const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
...
@@ -2574,9 +2804,9 @@ static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q5_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -2590,7 +2820,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2602,7 +2836,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
-        const int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2611,7 +2849,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
-        const int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
...
@@ -2620,7 +2862,11 @@ static __device__ __forceinline__ void load_tiles_q5_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
...
@@ -2647,13 +2893,13 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     int   u[2*QR4_K];
     float d8[QR4_K];

-    const int bq8_offset = QR5_K * (kqsx / (QI8_1/2));
+    const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));

-    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 0];
-    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + kqsx % 4 + 4];
+    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
+    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];

-    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 0] >> bq8_offset;
-    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + kqsx % 4 + 4] >> bq8_offset;
+    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
+    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;

     const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx*4];
     uint16_t aux[2];
...
@@ -2669,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const uint8_t * m  = sc + 2;

     for (int l = 0; l < QR5_K; ++l) {
-        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + kqsx % (QI8_1/2);
+        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
         u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
         u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
         d8[l] = y_ds[kqsy / QI8_1].x;
...
@@ -2743,9 +2989,9 @@ static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-static __device__ __forceinline__ void load_tiles_q6_K(
+template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & k, const int & blocks_per_row) {
+    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
     __builtin_assume(i_offset <  8);
...
@@ -2759,7 +3005,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
-        const int i = i0 + i_offset;
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
...
@@ -2771,7 +3021,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
-        const int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
...
@@ -2780,7 +3034,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
-        const int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
...
@@ -2789,7 +3047,11 @@ static __device__ __forceinline__ void load_tiles_q6_K(
 #pragma unroll
     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        const int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }

         const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
...
@@ -2875,7 +3137,7 @@ static __global__ void mul_mat_q(
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   tid_y, tid_x, blocks_per_row_x);
+                   tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);

         for (int ir = 0; ir < qr; ++ir) {
             const int kqs = ir*WARP_SIZE + tid_x;
...
@@ -2899,10 +3161,10 @@ static __global__ void mul_mat_q(
         __syncthreads();

-#if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards
+#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
 #pragma unroll
 #endif // __CUDA_ARCH__ >= 700
-        for (int k = 0; k < WARP_SIZE/vdr; ++k) {
+        for (int k = 0; k < WARP_SIZE; k += vdr) {
 #pragma unroll
             for (int j = 0; j < WARP_SIZE; j += 8) {
 #pragma unroll
...
@@ -2954,9 +3216,9 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index

-        const int iby = (i + threadIdx.x / (qi/vdr)) * qk/QK8_1; // y block index that aligns with ibx
+        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx

-        const int iqs = threadIdx.x % (qi/vdr); // x block quant index when casting the quants to int
+        const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int

         tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
     }
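Note: with the vec-dot ratio, each thread of mul_mat_vec_q now covers vdr consecutive ints of a quant block, so its starting quant index is scaled by vdr. A small host-side check of that index math, illustrative only (the constants are assumed example values, not the real QI*/VDR macros):

```cpp
#include <cstdio>

int main() {
    const int qi = 8, vdr = 2;                        // example: 8 ints per block, 2 ints per thread
    for (int t = 0; t < 16; ++t) {
        const int block_off = t / (qi/vdr);           // which block of the row this lane works on
        const int iqs       = vdr * (t % (qi/vdr));   // first int of the vdr-wide slice inside that block
        printf("lane %2d -> block +%d, iqs %d\n", t, block_off, iqs);
    }
    return 0;
}
```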
...
...
@@ -3499,7 +3761,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_q4_0_q8_1, vec_dot_q4_0_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
...
@@ -3508,7 +3770,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_q4_1_q8_1, vec_dot_q4_1_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
...
@@ -3517,7 +3779,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_q5_0_q8_1, vec_dot_q5_0_q8_1>
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
...
@@ -3526,7 +3788,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_q5_1_q8_1, vec_dot_q5_1_q8_1>
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
...
@@ -3535,7 +3797,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_q8_0_q8_1, vec_dot_q8_0_q8_1>
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
...
@@ -3635,8 +3897,14 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0, VDR_q4_0_q8_1, vec_dot_q4_0_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q4_1_q8_1_cuda(
...
...
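This launcher (and the analogous ones for the other quantization types below) now dispatches on whether nrows_x is a multiple of GGML_CUDA_MMQ_Y and instantiates the tile loader with a compile-time bounds-check flag, so the aligned fast path carries no per-row clamp at all. A sketch of that dispatch pattern with hypothetical names (tile_rows, load_row and launch_tiled are stand-ins, not ggml symbols):

#include <algorithm>
#include <cstdio>
#include <vector>

// Sketch of the compile-time bounds-check dispatch.
constexpr int tile_rows = 64;   // plays the role of GGML_CUDA_MMQ_Y (assumed value)

template <bool need_check>
void load_row(const std::vector<float> & src, std::vector<float> & tile, int i, int i_max) {
    if (need_check) {
        i = std::min(i, i_max);  // clamp only when the row count is not tile-aligned
    }
    tile[i % tile_rows] = src[i];
}

void launch_tiled(const std::vector<float> & src, std::vector<float> & tile, int nrows) {
    const int ntiles = (nrows + tile_rows - 1) / tile_rows;
    if (nrows % tile_rows == 0) {             // fast path: no clamp is ever needed
        for (int t = 0; t < ntiles; ++t)
            for (int i = 0; i < tile_rows; ++i)
                load_row<false>(src, tile, t*tile_rows + i, nrows - 1);
    } else {                                  // last tile hangs over the end: clamp reads
        for (int t = 0; t < ntiles; ++t)
            for (int i = 0; i < tile_rows; ++i)
                load_row<true>(src, tile, t*tile_rows + i, nrows - 1);
    }
}

int main() {
    std::vector<float> src(100, 1.0f), tile(tile_rows, 0.0f);
    launch_tiled(src, tile, 100);   // 100 % 64 != 0, so the checked variant is used
    printf("tile[0] = %.1f\n", tile[0]);
    return 0;
}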
@@ -3647,8 +3915,14 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1, VDR_q4_1_q8_1, vec_dot_q4_1_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q5_0_q8_1_cuda(
...
...
@@ -3659,8 +3933,14 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0, VDR_q5_0_q8_1, vec_dot_q5_0_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q5_1_q8_1_cuda(
...
...
@@ -3671,8 +3951,14 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1, VDR_q5_1_q8_1, vec_dot_q5_1_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q8_0_q8_1_cuda(
...
...
@@ -3683,8 +3969,14 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0, VDR_q8_0_q8_1, vec_dot_q8_0_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q2_K_q8_1_cuda(
...
...
@@ -3695,8 +3987,14 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q3_K_q8_1_cuda(
...
...
@@ -3707,8 +4005,14 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q4_K_q8_1_cuda(
...
...
@@ -3719,8 +4023,14 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q5_K_q8_1_cuda(
...
...
@@ -3731,8 +4041,14 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_q6_K_q8_1_cuda(
...
...
@@ -3743,8 +4059,14 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
+            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }

 static void ggml_mul_mat_p021_f16_f32_cuda(
...
...
@@ -4690,8 +5012,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
             row_low -= row_low % GGML_CUDA_MMQ_Y;
-            row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % GGML_CUDA_MMQ_Y;
+            }
         } else {
             row_low  = 0;
             row_high = nrows0*i02_divisor;
...
...
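This hunk, like the matching one in ggml_cuda_transform_tensor further down, keeps row_low rounded down to a multiple of GGML_CUDA_MMQ_Y but only rounds row_high for devices other than the last one, so the final device always picks up the leftover rows. A host-side sketch of the resulting partition; the split fractions and the GGML_CUDA_MMQ_Y value are assumptions for illustration:

#include <cstdio>

// Sketch of the per-device row partition after this change.
constexpr int GGML_CUDA_MMQ_Y = 64;   // assumed value

int main() {
    const int   nrows0           = 1000;
    const float g_tensor_split[] = {0.0f, 0.4f, 0.7f};  // cumulative start fraction per device
    const int   g_device_count   = 3;

    for (int id = 0; id < g_device_count; ++id) {
        int row_low = id == 0 ? 0 : (int)(nrows0*g_tensor_split[id]);
        row_low -= row_low % GGML_CUDA_MMQ_Y;                 // align the start of every slice

        int row_high;
        if (id == g_device_count - 1) {
            row_high = nrows0;                                // last device takes all remaining rows
        } else {
            row_high = (int)(nrows0*g_tensor_split[id + 1]);
            row_high -= row_high % GGML_CUDA_MMQ_Y;           // keep interior boundaries aligned
        }
        printf("device %d: rows [%4d, %4d)\n", id, row_low, row_high);
    }
    return 0;
}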
@@ -5171,8 +5497,12 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
         row_low -= row_low % GGML_CUDA_MMQ_Y;
-        row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
-        row_high -= row_high % GGML_CUDA_MMQ_Y;
+
+        if (id == g_device_count - 1) {
+            row_high = nrows;
+        } else {
+            row_high = nrows*g_tensor_split[id + 1];
+            row_high -= row_high % GGML_CUDA_MMQ_Y;
+        }
     } else {
         GGML_ASSERT(false);
     }
...
...
llama/ggml-cuda.h
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-metal.h
View file @ 85aeb428
 //go:build darwin
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-metal.m
View file @ 85aeb428
 //go:build darwin
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-metal.metal
View file @ 85aeb428
 //go:build darwin
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-mpi.c
View file @ 85aeb428
 //go:build mpi
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-mpi.h
View file @ 85aeb428
 //go:build mpi
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-opencl.cpp
View file @ 85aeb428
 //go:build opencl
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml-opencl.h
View file @ 85aeb428
 //go:build opencl
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml.c
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/ggml.h
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/k_quants.c
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/k_quants.h
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/llama-util.h
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...

llama/llama.cpp
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...
llama/llama.go
View file @ 85aeb428
...
...
@@ -128,11 +128,6 @@ func New(model string, opts api.Options) (*LLM, error) {
 	C.llama_backend_init(C.bool(llm.UseNUMA))

-	// TODO: GQA == 8 suggests 70B model which doesn't support metal
-	if llm.NumGQA == 8 {
-		llm.NumGPU = 0
-	}
-
 	params := C.llama_context_default_params()
 	params.seed = C.uint(llm.Seed)
 	params.n_ctx = C.int(llm.NumCtx)
...
...
llama/llama.h
View file @ 85aeb428
 /**
- * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
 *
 * MIT License
 *
...
...