OpenDAS / ollama, commit a83eaa7a
authored Jul 19, 2023 by Michael Yang
parent 5156e48c

    update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc

Showing 12 changed files with 1708 additions and 650 deletions (+1708 / -650)
llama/ggml-cuda.cu      +563  -62
llama/ggml-cuda.h         +1   -1
llama/ggml-metal.h        +1   -1
llama/ggml-metal.m       +45  -35
llama/ggml-metal.metal  +390 -329
llama/ggml.c            +493 -159
llama/ggml.h             +49   -2
llama/k_quants.c          +1   -1
llama/k_quants.h          +9   -1
llama/llama-util.h        +4   -4
llama/llama.cpp         +120  -53
llama/llama.h            +32   -2
llama/ggml-cuda.cu

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -39,6 +39,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -100,7 +102,7 @@ typedef void (*ggml_cuda_op_t)(
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants

@@ -109,7 +111,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half    d;              // delta
     half    m;              // min

@@ -119,7 +121,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong 
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half    d;              // delta
     uint8_t qh[4];          // 5-th bit of quants

@@ -129,7 +131,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d;                 // delta
     half m;                 // min

@@ -140,7 +142,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half    d;              // delta
     int8_t  qs[QK8_0];      // quants

@@ -149,7 +151,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d;                 // delta
     half s;                 // unquantized sum
@@ -169,6 +171,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif
 
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants

@@ -177,6 +181,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8];     // quants - high bit
     uint8_t qs[QK_K/4];        // quants - low 2 bits

@@ -189,6 +195,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
 
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half    d[2];              // super-block scales/mins

@@ -206,6 +214,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
 
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d;                  // super-block scale

@@ -225,6 +235,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
 
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2];   // quants, lower 4 bits
     uint8_t qh[QK_K/4];   // quants, upper 2 bits
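For reference, a quick standalone check (not part of the commit, purely illustrative) that the new QI* expressions reduce to the literals they replace, using the block sizes defined above:

// Hypothetical host-side check: QI* = QK* / (4 * QR*) reproduces the old values.
#include <assert.h>

#define QK4_0 32
#define QR4_0 2
#define QI4_0 (QK4_0 / (4 * QR4_0))   // 32 / (4*2) = 4, the old literal

#define QK8_0 32
#define QR8_0 1
#define QI8_0 (QK8_0 / (4 * QR8_0))   // 32 / (4*1) = 8, the old literal

int main(void) {
    assert(QI4_0 == 4);
    assert(QI8_0 == 8);
    return 0;
}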
@@ -238,6 +250,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,13 +278,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
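The reworked add_f32 takes separate sizes for the two operands: kx elements of x are written, and y is reused with period ky. A minimal CPU reference of that indexing (illustrative sketch, not part of the commit):

// Broadcast add as used by the new add_f32 kernel: y repeats every ky elements.
static void add_broadcast_ref(const float * x, const float * y, float * dst,
                              const int kx, const int ky) {
    for (int i = 0; i < kx; ++i) {
        dst[i] = x[i] + y[i % ky];
    }
}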
@@ -292,6 +305,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -301,16 +327,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6;
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
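Both normalization kernels reduce per-thread partial sums across one 32-lane warp with __shfl_xor_sync. A minimal standalone device helper showing that butterfly pattern (a sketch assuming a full 32-wide active warp; the helper name is illustrative, not from the commit):

// Butterfly reduction: after 5 XOR-shuffle steps every lane holds the warp-wide sum.
__device__ __forceinline__ float warp_reduce_sum(float val) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        val += __shfl_xor_sync(0xffffffff, val, mask, 32);
    }
    return val;
}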
@@ -322,10 +378,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     const float mean = tmp / ncols;
-    const float scale = 1.0f / sqrtf(mean + eps);
+    const float scale = rsqrtf(mean + eps);
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -1254,8 +1309,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }
 
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1276,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);

@@ -1301,11 +1358,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;

@@ -1336,11 +1394,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);

@@ -1370,11 +1429,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1389,7 +1449,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
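Every one of the new vec_dot_*_q8_1 routines bottoms out in __dp4a, which multiplies four packed 8-bit lanes against four packed 8-bit lanes and accumulates into a 32-bit integer. A minimal standalone sketch of that core step (assumes __CUDA_ARCH__ >= MIN_CC_DP4A and int-aligned inputs; the helper name is illustrative, not from the commit):

// Dot product of 8 signed 8-bit quants against 8 signed 8-bit quants via two __dp4a calls.
__device__ int dot8_q8(const int8_t * a, const int8_t * b) {
    const int a0 = *(const int *) &a[0];
    const int a1 = *(const int *) &a[4];
    const int b0 = *(const int *) &b[0];
    const int b1 = *(const int *) &b[4];
    int sumi = __dp4a(a0, b0, 0);    // a[0..3] . b[0..3]
    sumi     = __dp4a(a1, b1, sumi); // + a[4..7] . b[4..7]
    return sumi;
}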
@@ -1412,7 +1685,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
@@ -1650,6 +1923,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
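Both RoPE kernels apply the same 2-D rotation to each pair of values. A host-side reference of that single step (illustrative sketch, not part of the commit):

// Rotate the pair (x0, x1) by angle theta, as rope_f32/rope_glm_f32 do per element pair.
#include <math.h>
static void rope_pair_ref(const float x0, const float x1, const float theta,
                          float * out0, float * out1) {
    const float c = cosf(theta);
    const float s = sinf(theta);
    *out0 = x0*c - x1*s;
    *out1 = x0*s + x1*c;
}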
@@ -1715,9 +2022,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1730,11 +2037,22 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
@@ -1900,7 +2218,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);

@@ -1909,7 +2227,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);

@@ -1918,7 +2236,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);

@@ -1927,7 +2245,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);

@@ -1936,7 +2254,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1944,6 +2262,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2036,6 +2399,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2263,16 +2634,19 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
-    const int64_t ne0 = src0->ne[0];
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2291,27 +2665,41 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
-    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i,
+    float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
@@ -2336,6 +2724,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }
 
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i,
+    float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i,
     float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2376,13 +2786,22 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+    bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
 
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
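The gate above compares against the compute capability encoded as 100*major + 10*minor, so MIN_CC_DP4A (610) means "6.1 or newer". A small host-side sketch of how such a value can be derived with the CUDA runtime (illustrative only; the helper name is not from the commit):

// Returns the compute capability of `device` in the same 100*major + 10*minor encoding.
#include <cuda_runtime.h>
static int compute_capability_encoded(int device) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    return 100*prop.major + 10*prop.minor; // e.g. 6.1 -> 610 >= MIN_CC_DP4A
}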
@@ -2408,6 +2827,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;
@@ -2542,13 +2976,21 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
-    GGML_ASSERT(mode == 0);
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
     const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
 
+    bool is_glm = mode & 4;
+
     // compute
-    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2951,11 +3393,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }
 
+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3111,6 +3563,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3186,7 +3643,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         }
 
-        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
@@ -3220,6 +3677,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
@@ -3228,7 +3701,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3237,8 +3710,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -3253,10 +3725,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
@@ -3269,6 +3743,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
             CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
             g_scratch_buffer = data;
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = data + g_scratch_offset;
 
         g_scratch_offset += size;
@@ -3278,6 +3753,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         void * data;
         CUDA_CHECK(cudaMalloc(&data, size));
         CUDA_CHECK(cudaMemset(data, 0, size));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device] = data;
     }
@@ -3330,6 +3807,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;
@@ -3342,12 +3825,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
+        case GGML_OP_GELU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_gelu;
+            break;
         case GGML_OP_SILU:
             if (!any_on_device) {
                 return false;
            }
            func = ggml_cuda_silu;
            break;
+        case GGML_OP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_norm;
+            break;
         case GGML_OP_RMS_NORM:
             if (!any_on_device) {
                 return false;
@@ -3372,6 +3867,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
llama/ggml-cuda.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *

llama/ggml-metal.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *

llama/ggml-metal.m

 // +build darwin

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
                             GGML_ASSERT(ne02 == 1);
                             GGML_ASSERT(ne12 == 1);
 
-                            nth0 = 4;
-                            nth1 = 16;
+                            nth0 = 2;
+                            nth1 = 32;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                         } break;
                     case GGML_TYPE_Q5_K:

@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
                             GGML_ASSERT(ne02 == 1);
                             GGML_ASSERT(ne12 == 1);
 
-                            nth0 = 4;
-                            nth1 = 16;
+                            nth0 = 2;
+                            nth1 = 32;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                         } break;
                     case GGML_TYPE_Q6_K:

@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
                             GGML_ASSERT(ne02 == 1);
                             GGML_ASSERT(ne12 == 1);
 
-                            nth0 = 4;
-                            nth1 = 16;
+                            nth0 = 2;
+                            nth1 = 32;
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                         } break;
                     default:
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
                         [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                         [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
 
-                        if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                            [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q4_K) {
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        }
+                        else if (src0t == GGML_TYPE_Q5_K) {
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        }
+                        else if (src0t == GGML_TYPE_Q6_K) {
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1) / 2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
                         else if (src0t == GGML_TYPE_Q2_K ||
-                                 src0t == GGML_TYPE_Q3_K ||
-                                 src0t == GGML_TYPE_Q4_K ||
-                                 src0t == GGML_TYPE_Q5_K ||
-                                 src0t == GGML_TYPE_Q6_K) {
+                                 src0t == GGML_TYPE_Q3_K) {
                             [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         } else {
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(
                     const float eps = 1e-6f;
 
-                    const int nth = 256;
+                    const int nth = 512;
 
                     [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];

@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
                     [encoder setBytes:&ne00 length:sizeof(int64_t) atIndex:2];
                     [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                     [encoder setBytes:&eps length:sizeof(float) atIndex:4];
-                    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                    [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 
                     const int64_t nrows = ggml_nrows(src0);
@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(
                     const int n_past = ((int32_t *)(src1->data))[0];
 
+                    float freq_base;
+                    float freq_scale;
+                    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                     [encoder setComputePipelineState:ctx->pipeline_rope];
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                     [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                     [encoder setBytes:&ne00 length:sizeof(int64_t) atIndex:2];
                     [encoder setBytes:&ne01 length:sizeof(int64_t) atIndex:3];
                     [encoder setBytes:&ne02 length:sizeof(int64_t) atIndex:4];
                     [encoder setBytes:&ne03 length:sizeof(int64_t) atIndex:5];
                     [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                     [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                     [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                     [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                     [encoder setBytes:&ne0 length:sizeof(int64_t) atIndex:10];
                     [encoder setBytes:&ne1 length:sizeof(int64_t) atIndex:11];
                     [encoder setBytes:&ne2 length:sizeof(int64_t) atIndex:12];
                     [encoder setBytes:&ne3 length:sizeof(int64_t) atIndex:13];
                     [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
                     [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
                     [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
                     [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
                     [encoder setBytes:&n_past length:sizeof(int) atIndex:18];
                     [encoder setBytes:&n_dims length:sizeof(int) atIndex:19];
                     [encoder setBytes:&mode   length:sizeof(int) atIndex:20];
+                    [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                    [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
llama/ggml-metal.metal

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -357,26 +357,33 @@ kernel void kernel_rms_norm(
         threadgroup float  * sum [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
         uint ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+    float4 sumf=0;
+    float all_sum=0;
 
     // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00] * x[i00];
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        sumf += x[i00] * x[i00];
     }
+    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
+    all_sum = simd_sum(all_sum);
+    if (tiisg == 0) {
+        sum[sgitg] = all_sum;
+    }
 
-    // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    // broadcast
+    // broadcast, simd group number is ntg / 32
+    for (int i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+    }
     if (tpitg == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
         sum[0] /= ne00;
     }
@@ -385,147 +392,127 @@ kernel void kernel_rms_norm(
const float mean = sum[0];
const float scale = 1.0f/sqrt(mean + eps);
device float * y = dst + tgpig*ne00;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
device float4 * y = (device float4 *) (dst + tgpig*ne00);
device float * y_scalar = (device float *) y;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
y[i00] = x[i00] * scale;
}
if (tpitg == 0) {
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
}
}
kernel void kernel_mul_mat_q4_0_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
threadgroup float * sum [[threadgroup(0)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
const int nb = ne00/QK4_0;
// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
float d = qb_curr->d;
float4 acc = 0.f;
device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
for (int i = 0; i < 16; i+=2) {
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
}
return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
}
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
float d = qb_curr->d;
float m = qb_curr->m;
float4 acc = 0.f;
device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
for (int i = 0; i < 16; i+=2) {
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
}
return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
}
device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
// putting them in the kernel cause a significant performance penalty
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
template<typename block_q_type>
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
uint2 tgpig, uint tiisg, uint sgitg) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
device const float * y = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
const int ix = tpitg.y/4; // 0 or 1
const int iy = tpitg.y - 4*ix; // 0...3
const int first = 4 * iy;
float sumf = 0;
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
const float d = (float)x[i].d;
device const uint8_t * xl = x[i].qs + first;
device const float * yl = y + i * QK4_0 + first;
float2 acc = {0.0f, 0.0f};
for (int j = 0; j < 4; ++j) {
acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
acc[1] += yl[j] + yl[j+16];
float4 y_curr[8]; // src1 vector cache
float sumf[N_DST]={0.f}, all_sum;
thread float * yl=(thread float *)y_curr;
// each thread in a SIMD group deals with 1 block.
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
sumf += d * (acc[0] - 8.f*acc[1]);
for (int row = 0; row < N_DST; row++) {
sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
}
}
sum[ith] = sumf;
// from now loads two rows every time and 16 blocks per row
int ir = tiisg / (N_SIMDWIDTH / 2);
int ib = tiisg % (N_SIMDWIDTH / 2);
for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
for (int row = 0; row < N_DST; row+=2) {
if (nb_start + ib < nb) {
sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
}
}
}
kernel void kernel_mul_mat_q4_1_f32(
kernel void kernel_mul_mat_q4_0_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
threadgroup float * sum [[threadgroup(0)]],
constant int64_t & ne01[[buffer(4)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
const int nb = ne00/QK4_1;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
device const float * y = (device const float *) src1 + r1*ne10;
const uint nth = tptg.x*tptg.y;
const uint ith = tptg.y*tpitg.x + tpitg.y;
const int ix = tpitg.y/4; // 0 or 1
const int iy = tpitg.y - 4*ix; // 0...3
const int first = 4 * iy;
float sumf = 0;
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
const float d = (float)x[i].d;
const float m = (float)x[i].m;
device const uint8_t * xl = x[i].qs + first;
device const float * yl = y + i * QK4_1 + first;
float2 acc = {0.0f, 0.0f};
for (int j = 0; j < 4; ++j) {
acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
}
sumf += acc[0] + acc[1];
}
sum[ith] = sumf;
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
kernel void kernel_mul_mat_q4_1_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}
kernel void kernel_mul_mat_f16_f32(
...
...
@@ -641,17 +628,19 @@ kernel void kernel_rope(
constant int & n_past,
constant int & n_dims,
constant int & mode,
constant float & freq_base,
constant float & freq_scale,
uint3 tpig[[thread_position_in_grid]]) {
const int64_t i3 = tpig[2];
const int64_t i2 = tpig[1];
const int64_t i1 = tpig[0];
const bool is_neox = mode & 2;
const float theta_scale = pow(10000.0, -2.0f/n_dims);
const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
float theta = (float)p;
float theta = freq_scale * (float)p;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
...
...
@@ -1489,6 +1478,7 @@ kernel void kernel_mul_mat_q3_K_f32(
}
#if QK_K == 256
kernel void kernel_mul_mat_q4_K_f32(
device const void * src0,
device const float * src1,
...
...
@@ -1496,131 +1486,180 @@ kernel void kernel_mul_mat_q4_K_f32(
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
threadgroup float * sum [[threadgroup(0)]],
constant int64_t & ne01[[buffer(4)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
float sumf = 0;
#if QK_K == 256
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int tid = tpitg.y; // 0...16
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 4;
const int ix = tiisg/8; // 0...3
const int it = tiisg%8; // 0...7
const int im = it/4; // 0 or 1
const int ir = it%4; // 0...3
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
device const float * y = (device const float *) src1 + r1*ne10;
float yl[16];
float yh[16];
float sumf[N_DST]={0.f}, all_sum;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
const int step = sizeof(block_q4_K) * nb / 2;
uchar2 sc1, sc2, sc3, sc4;
device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir;
for (int i = tpitg.x; i < nb; i += tptg.x) {
uint16_t sc16[4];
thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
device const uint8_t * q1 = (x + i)->qs + q_offset;
device const uint8_t * q2 = q1 + 64;
device const float * y1 = yy + i*QK_K + y_offset;
device const float * y2 = y1 + 128;
for (int ib = ix; ib < nb; ib += 4) {
const float dall = (float)((x + i)->d);
const float dmin = (float)((x + i)->dmin);
float4 sumy = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; ++i) {
yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0];
yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
}
device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im;
device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
device const half * dh = &x[ib].d;
for (int row = 0; row < N_DST; row++) {
sc16[0] = sc[0] & kmask1;
sc16[1] = sc[2] & kmask1;
sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
device const uint16_t * q2 = q1 + 32;
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
}
float4 s = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
float dall = dh[0];
float dmin = dh[1];
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
(acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
(acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
(acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
q1 += step;
sc += step;
dh += step;
}
s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
y4 += 4 * QK_K;
}
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0 + first_row + row] = all_sum;
}
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
}
}
#else
uint16_t aux16[2];
thread const uint8_t * scales = (thread const uint8_t *)aux16;
kernel void kernel_mul_mat_q4_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int il = 4*tpitg.x;
const int ix = tiisg/4; // 0...7
const int it = tiisg%4; // 0...3
for (int i = tpitg.y; i < nb; i += tptg.y) {
const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int ib_row = first_row * nb;
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
device const float * y = (device const float *) src1 + r1*ne10;
float yl[8];
float yh[8];
float sumf[N_DST]={0.f}, all_sum;
device const uint8_t * q = x[i].qs + il;
device const float * y = yy + i * QK_K + il;
const int step = sizeof(block_q4_K) * nb / 2;
const float d = (float)x[i].d[0];
const float m = (float)x[i].d[1];
device const float * y4 = y + ix * QK_K + 8 * it;
device const uint16_t * a = (device const uint16_t *)x[i].scales;
aux16[0] = a[0] & 0x0f0f;
aux16[1] = (a[0] >> 4) & 0x0f0f;
uint16_t sc16[4];
for (int l = 0; l < 4; ++l) {
sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
+ d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
for (int ib = ix; ib < nb; ib += 8) {
float2 sumy = {0.f, 0.f};
for (int i = 0; i < 8; ++i) {
yl[i] = y4[i+ 0]; sumy[0] += yl[i];
yh[i] = y4[i+32]; sumy[1] += yh[i];
}
}
#endif
sum[ith] = sumf;
device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
device const half * dh = x[ib].d;
//
// Accumulate the sum from all threads in the threadgroup
// This version is slightly faster than the commented out one below,
// which I copy-pasted from ggerganov's q4_0 dot product for metal.
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
for (int row = 0; row < N_DST; row++) {
sc16[0] = sc[0] & 0x000f;
sc16[1] = sc[0] & 0x0f00;
sc16[2] = sc[0] & 0x00f0;
sc16[3] = sc[0] & 0xf000;
float2 acc1 = {0.f, 0.f};
float2 acc2 = {0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
}
float dall = dh[0];
float dmin = dh[1];
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
(acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
qs += step;
sc += step;
dh += step;
}
//// accumulate the sum from all threads in the threadgroup
//threadgroup_barrier(mem_flags::mem_threadgroup);
//for (uint i = nth/2; i > 0; i /= 2) {
// if (ith < i) {
// sum[ith] += sum[ith + i];
// }
// threadgroup_barrier(mem_flags::mem_threadgroup);
//}
y4 += 8 * QK_K;
}
//if (ith == 0) {
// dst[r1*ne0 + r0] = sum[0];
//}
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0 + first_row + row] = all_sum;
}
}
}
#endif
kernel void kernel_mul_mat_q5_K_f32(
device const void * src0,
...
...
@@ -1629,39 +1668,39 @@ kernel void kernel_mul_mat_q5_K_f32(
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
threadgroup float * sum [[threadgroup(0)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
float sumf[2]={0.f};
float sumf = 0;
const int step = sizeof(block_q5_K) * nb;
#if QK_K == 256
#
float yl[16], yh[16];
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int tid = tpitg.y; // 0...16
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 4;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int tid = tiisg/4;
const int ix = tiisg%4;
const int im = tid/4;
const int ir = tid%4;
const int n = 8;
const int l0 = n*(2*ir + in);
const int l0 = n*ir;
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
...
...
@@ -1670,78 +1709,114 @@ kernel void kernel_mul_mat_q5_K_f32(
const uint8_t hm3 = hm1 << 4;
const uint8_t hm4 = hm2 << 4;
uchar2 sc1, sc2, sc3, sc4;
uint16_t sc16[4];
thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
for (int i = tpitg.x; i < nb; i += tptg.x) {
device const float * y1 = yy + ix*QK_K + y_offset;
for (int i = ix; i < nb; i += 4) {
device const uint8_t * q1 = x[i].qs + q_offset;
device const uint8_t * qh = x[i].qh + l0;
device const half * dh = &x[i].d;
device const uint16_t * a = (device const uint16_t *)x[i].scales + im;
device const uint8_t * q1 = (x + i)->qs + q_offset;
device const uint8_t * q2 = q1 + 64;
device const uint8_t * qh = (x + i)->qh + l0;
device const float * y1 = yy + i*QK_K + y_offset;
device const float * y2 = y1 + 128;
device const float * y2 = y1 + 128;
float4 sumy = {0.f, 0.f, 0.f, 0.f};
for (int l = 0; l < 8; ++l) {
yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
}
const float dall = (float)((x + i)->d);
const float dmin = (float)((x + i)->dmin);
for (int row = 0; row < 2; ++row) {
device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
device const uint8_t * q2 = q1 + 64;
float4 s = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
sc16[0] = a[0] & kmask1;
sc16[1] = a[2] & kmask1;
sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
s[1] += y1[l+32] * ((q1[l] >> 4) + (qh[l] & hm2 ? 16 : 0));
s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
s[3] += y2[l+32] * ((q2[l] >> 4) + (qh[l] & hm4 ? 16 : 0));
smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
float4 acc = {0.f, 0.f, 0.f, 0.f};
for (int l = 0; l < n; ++l) {
uint8_t h = qh[l];
acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
}
const float dall = dh[0];
const float dmin = dh[1];
sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
q1 += step;
qh += step;
dh += step/2;
a += step/2;
}
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
y1 += 4 * QK_K;
}
#else
const int il = 4 * tpitg.x; // 0, 4, 8, 12
const int im = il/8; // 0, 0, 1, 1
const int in = il%8; // 0, 4, 0, 4
float yl[8], yh[8];
for (int i = tpitg.y; i < nb; i += tptg.y) {
const int il = 4 * (tiisg/8); // 0, 4, 8, 12
const int ix = tiisg%8;
const int im = il/8; // 0, 0, 1, 1
const int in = il%8; // 0, 4, 0, 4
const float d = (float)x[i].d;
device const float * y = yy + ix*QK_K + il;
for (int i = ix; i < nb; i += 8) {
float4 sumy = {0.f, 0.f, 0.f, 0.f};
for (int l = 0; l < 4; ++l) {
yl[l+0] = y[l+ 0];
yl[l+4] = y[l+16];
yh[l+0] = y[l+32];
yh[l+4] = y[l+48];
}
device const half * dh = &x[i].d;
device const uint8_t * q = x[i].qs + il;
device const uint8_t * h = x[i].qh + in;
device const int8_t * s = x[i].scales;
device const float * y = yy + i*QK_K + il;
for (int l = 0; l < 4; ++l) {
const uint8_t hl = h[l] >> im;
sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
+ y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
+ y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
+ y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
for (int row = 0; row < 2; ++row) {
const float d = dh[0];
float2 acc = {0.f, 0.f};
for (int l = 0; l < 4; ++l) {
const uint8_t hl = h[l] >> im;
acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
+ yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
+ yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
}
sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
q += step;
h += step;
s += step;
dh += step/2;
}
y += 8 * QK_K;
}
#endif
sum[ith] = sumf;
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
for (int row = 0; row < 2; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0) {
dst[r1*ne0 + first_row + row] = tot;
}
}
}
...
...
@@ -1753,10 +1828,9 @@ kernel void kernel_mul_mat_q6_K_f32(
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
threadgroup float * sum [[threadgroup(0)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const uint8_t kmask1 = 0x03;
const uint8_t kmask2 = 0x0C;
...
...
@@ -1768,19 +1842,18 @@ kernel void kernel_mul_mat_q6_K_f32(
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int row = 2 * r0 + sgitg;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
float sumf = 0;
#if QK_K == 256
// Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
const int iqs = 16 * tpitg.y;
const int ip = iqs / 128; // 0 or 1
const int il = (iqs - 128*ip)/16; // 0...7
const int tid = tiisg/2;
const int ix = tiisg%2;
const int ip = tid/8; // 0 or 1
const int il = tid%8;
const int n = 4;
const int l0 = n*il;
const int is = 8*ip + l0/16;
...
...
@@ -1789,9 +1862,10 @@ kernel void kernel_mul_mat_q6_K_f32(
const int q_offset_l = 64*ip + l0;
const int q_offset_h = 32*ip + l0;
for (int i = tpitg.x; i < nb; i += tptg.x) {
for (int i = ix; i < nb; i += 2) {
device const uint8_t * ql = x[i].ql + q_offset_l;
device const uint8_t * q1 = x[i].ql + q_offset_l;
device const uint8_t * q2 = q1 + 32;
device const uint8_t * qh = x[i].qh + q_offset_h;
device const int8_t * sc = x[i].scales + is;
...
...
@@ -1801,19 +1875,21 @@ kernel void kernel_mul_mat_q6_K_f32(
float4 sums = {0.f, 0.f, 0.f, 0.f};
for (int l = 0; l < n; ++l) {
sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
sums[2] += y[l+64] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
sums[3] += y[l+96] * ((int8_t)((ql[l+32] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
}
sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
}
#else
const int il = 4*tpitg.x; // 0, 4, 8, 12
const int ix = tiisg/4;
const int il = 4*(tiisg%4);
for (int i = tpitg.y; i < nb; i += tptg.y) {
for (int i = ix; i < nb; i += 8) {
device const float * y = yy + i * QK_K + il;
device const uint8_t * ql = x[i].ql + il;
device const uint8_t * qh = x[i].qh + il;
...
...
@@ -1833,23 +1909,8 @@ kernel void kernel_mul_mat_q6_K_f32(
#endif
sum[ith] = sumf;
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
const float tot = simd_sum(sumf);
if (tiisg == 0) {
dst[r1*ne0 + row] = tot;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
}
llama/ggml.c
View file @ a83eaa7a
/**
* llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
* llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
@@ -51,16 +51,23 @@
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <signal.h>
#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
...
...
@@ -75,23 +82,23 @@
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
static void atomic_store(atomic_int* ptr, LONG val) {
static void atomic_store(atomic_int * ptr, LONG val) {
InterlockedExchange(ptr, val);
}
static LONG atomic_load(atomic_int* ptr) {
static LONG atomic_load(atomic_int * ptr) {
return InterlockedCompareExchange(ptr, 0, 0);
}
static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
return InterlockedExchangeAdd(ptr, inc);
}
static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
return atomic_fetch_add(ptr, -(dec));
}
typedef HANDLE pthread_t;
typedef DWORD thread_ret_t;
static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
(void) unused;
HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
if (handle == NULL)
...
...
@@ -103,7 +110,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void
return 0;
}
static int pthread_join(pthread_t thread, void* unused) {
static int pthread_join(pthread_t thread, void * unused) {
(void) unused;
return (int) WaitForSingleObject(thread, INFINITE);
}
...
...
@@ -116,7 +123,7 @@ static int sched_yield (void) {
#include <pthread.h>
#include <stdatomic.h>
typedef void* thread_ret_t;
typedef void * thread_ret_t;
#include <sys/types.h>
#include <sys/stat.h>
...
...
@@ -137,10 +144,6 @@ typedef void* thread_ret_t;
#endif
#endif
#ifdef __HAIKU__
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif
/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
...
...
@@ -3812,6 +3815,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CLAMP",
"CONV_1D",
"CONV_2D",
"POOL_1D",
"POOL_2D",
"FLASH_ATTN",
"FLASH_FF",
...
...
@@ -3830,7 +3835,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
...
...
@@ -3890,6 +3895,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"clamp(x)",
"conv_1d(x)",
"conv_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
"flash_attn(x)",
"flash_ff(x)",
...
...
@@ -3908,7 +3915,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
...
...
@@ -4187,10 +4196,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->ne[0] == t1->ne[0]) &&
(t0->ne[2] == t1->ne[2]) &&
(t0->ne[3] == t1->ne[3]);
return (t0->ne[0] == t1->ne[0]) &&
(t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
(t1->ne[3]%t0->ne[3] == 0);
}
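Note on the relaxed check above: ggml_mul_mat now only requires src0's 3rd/4th dimensions to divide src1's, so a tensor with fewer batch planes (e.g. shared key/value heads) can be broadcast against a larger one. A standalone restatement of the test, with invented shapes; can_mul_mat_bcast is not a ggml symbol, just an illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Illustrative re-statement of the relaxed shape test: t0's batch dims must
// divide t1's so t0 can be broadcast across t1's 3rd/4th dimensions.
static bool can_mul_mat_bcast(const int64_t t0_ne[4], const int64_t t1_ne[4]) {
    return (t0_ne[0] == t1_ne[0]) &&
           (t1_ne[2] % t0_ne[2] == 0) &&
           (t1_ne[3] % t0_ne[3] == 0);
}

int main(void) {
    // made-up example: K with 8 kv heads vs Q with 64 query heads, same head_dim
    const int64_t k_ne[4] = {64, 128,  8, 1};
    const int64_t q_ne[4] = {64,  32, 64, 1};
    printf("broadcastable: %d\n", can_mul_mat_bcast(k_ne, q_ne)); // prints 1
    return 0;
}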
static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
...
...
@@ -4430,8 +4438,8 @@ void ggml_free(struct ggml_context * ctx) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d
with %d objects
has been freed. memory used = %zu\n",
__func__, i,
ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size
);
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i,
ggml_used_mem(ctx)
);
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
...
...
@@ -4749,7 +4757,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
} break;
case GGML_TYPE_F32:
...
...
@@ -4801,7 +4809,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
}
} break;
case GGML_TYPE_F32:
...
...
@@ -5061,11 +5069,15 @@ struct ggml_tensor * ggml_add_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
bool is_node = false;
if (a->grad || b->grad) {
if (!inplace && (a->grad || b->grad)) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true;
}
...
...
@@ -6051,8 +6063,8 @@ struct ggml_tensor * ggml_mul_mat(
is_node = true;
}
const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
...
...
@@ -6970,6 +6982,8 @@ struct ggml_tensor * ggml_rope_impl(
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx,
bool inplace) {
GGML_ASSERT(n_past >= 0);
...
...
@@ -6983,12 +6997,14 @@ struct ggml_tensor * ggml_rope_impl(
ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;
memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
ggml_scratch_load(ctx);
...
...
@@ -7007,7 +7023,7 @@ struct ggml_tensor * ggml_rope(
int n_dims,
int mode,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
}
struct ggml_tensor * ggml_rope_inplace(
...
...
@@ -7017,7 +7033,19 @@ struct ggml_tensor * ggml_rope_inplace(
int n_dims,
int mode,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
}
struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
}
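ggml_rope_custom_inplace is the public entry point for the two new RoPE parameters: freq_base replaces the hard-coded 10000 in theta_scale and freq_scale multiplies the position before rotation, which is the knob context-extension schemes turn. A minimal hedged usage sketch; the helper name, the 0.5 scale and the tensor-shape comment are illustrative only, not part of the commit:

// Sketch: apply RoPE with a linear position scale of 0.5 (e.g. to run a model
// trained for 2048 tokens at 4096) while keeping the default base of 10000.
// `cur` is assumed to be the usual rotated Q or K tensor.
#include "ggml.h"

struct ggml_tensor * rope_scaled(struct ggml_context * ctx,
                                 struct ggml_tensor  * cur,
                                 int n_past, int n_rot, int n_ctx) {
    const float freq_base  = 10000.0f; // same default ggml_rope() now passes through
    const float freq_scale = 0.5f;     // positions are multiplied by this before rotating
    return ggml_rope_custom_inplace(ctx, cur, n_past, n_rot, /*mode=*/0,
                                    freq_base, freq_scale, n_ctx);
}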
// ggml_rope_back
...
...
@@ -7188,7 +7216,6 @@ struct ggml_tensor* ggml_conv_2d(
int d0,
int d1) {
GGML_ASSERT(b->ne[3] == 1);
GGML_ASSERT(a->ne[2] == b->ne[2]);
bool is_node = false;
...
...
@@ -7200,7 +7227,7 @@ struct ggml_tensor* ggml_conv_2d(
const int64_t ne[4] = {
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
a->ne[3], 1,
a->ne[3], b->ne[3],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
...
...
@@ -7235,6 +7262,98 @@ struct ggml_tensor* ggml_conv_1d_ph(
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
}
// ggml_pool_*
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
return (ins + 2 * p - ks) / s + 1;
}
// ggml_pool_2d
struct ggml_tensor* ggml_pool_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int s0,
int p0) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
a->ne[1],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
ggml_scratch_save(ctx);
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
((int32_t*)c->data)[0] = op;
((int32_t*)c->data)[1] = k0;
((int32_t*)c->data)[2] = s0;
((int32_t*)c->data)[3] = p0;
ggml_scratch_load(ctx);
result->op = GGML_OP_POOL_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = c;
return result;
}
// ggml_pool_2d
struct ggml_tensor* ggml_pool_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
int p0,
int p1) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
a->ne[2],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
ggml_scratch_save(ctx);
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
((int32_t*)c->data)[0] = op;
((int32_t*)c->data)[1] = k0;
((int32_t*)c->data)[2] = k1;
((int32_t*)c->data)[3] = s0;
((int32_t*)c->data)[4] = s1;
((int32_t*)c->data)[5] = p0;
((int32_t*)c->data)[6] = p1;
ggml_scratch_load(ctx);
result->op = GGML_OP_POOL_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = c;
return result;
}
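Both new pooling ops stash their parameters in a small I32 tensor and size each pooled axis with ggml_calc_pool_output_size, i.e. (ins + 2*p - ks)/s + 1; the forward pass additionally asserts p == 0 and k == s for now. A minimal usage sketch, assuming the usual context setup; the function name, tensor names and sizes are invented for illustration:

// Not part of the commit: using the new pooling API.
#include "ggml.h"

static void pool_example(struct ggml_context * ctx,
                         struct ggml_tensor  * feat_1d,   // e.g. [W, C]
                         struct ggml_tensor  * feat_2d) { // e.g. [W, H, C]
    // 1D average pool, k0 = s0 = 2, p0 = 0: output width = (W + 0 - 2)/2 + 1
    struct ggml_tensor * p1 = ggml_pool_1d(ctx, feat_1d, GGML_OP_POOL_AVG,
                                           /*k0=*/2, /*s0=*/2, /*p0=*/0);
    // 2D max pool with k = s = 2 on both axes (the forward pass asserts k == s, p == 0)
    struct ggml_tensor * p2 = ggml_pool_2d(ctx, feat_2d, GGML_OP_POOL_MAX,
                                           /*k0=*/2, /*k1=*/2,
                                           /*s0=*/2, /*s1=*/2,
                                           /*p0=*/0, /*p1=*/0);
    (void) p1; (void) p2; // with W = 28: (28 + 2*0 - 2)/2 + 1 = 14 output columns
}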
// ggml_flash_attn
struct ggml_tensor * ggml_flash_attn(
...
...
@@ -8323,7 +8442,7 @@ static void ggml_compute_forward_add_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
...
...
@@ -8348,23 +8467,23 @@ static void ggml_compute_forward_add_f32(
if (nb10 == sizeof(float)) {
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
#ifdef GGML_USE_ACCELERATE
vDSP_vadd(
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
ne0);
vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
#else
ggml_vec_add_f32(ne0,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
#endif
// }
// }
...
...
@@ -8372,15 +8491,20 @@ static void ggml_compute_forward_add_f32(
} else {
// src1 is not contiguous
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
for (int i0 = 0; i0 < ne0; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
}
...
...
@@ -10559,7 +10683,6 @@ static void ggml_compute_forward_rms_norm_back(
}
}
// ggml_compute_forward_mul_mat
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
...
...
@@ -10603,17 +10726,19 @@ static void ggml_compute_forward_mul_mat(
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne03 == ne13);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
const enum ggml_type type = src0->type;
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
GGML_ASSERT(nb10 == sizeof(float));
...
...
@@ -10624,16 +10749,16 @@ static void ggml_compute_forward_mul_mat(
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne3 == ne03);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
// TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
// ref: https://github.com/ggerganov/ggml/pull/224
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne03 == ne13);
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
...
...
@@ -10643,6 +10768,11 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
// TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
// ref: https://github.com/ggerganov/ggml/pull/224
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne03 == ne13);
if (params->ith != 0) {
return;
}
...
...
@@ -10663,7 +10793,7 @@ static void ggml_compute_forward_mul_mat(
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
if (type != GGML_TYPE_F32) {
float * const wdata = params->wdata;
float * const wdata = params->wdata;
ggml_to_float_t const to_float = type_traits[type].to_float;
size_t id = 0;
...
...
@@ -10712,41 +10842,52 @@ static void ggml_compute_forward_mul_mat(
return;
}
// parallelize by src0 rows using ggml_vec_dot_q
// parallelize by src0 rows
const int64_t dr = (ne01 + nth - 1)/nth;
// total rows in src0
const int nr = ne01*ne02*ne03;
const int64_t ir10 = dr*ith;
const int64_t ir11 = MIN(ir10 + dr, ne01);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// src1 rows
const int64_t nr1 = ne11*ne12*ne13;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
const int64_t i13 = (ir1/(ne12*ne11));
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i03 = ir/(ne02*ne01);
const int i02 = (ir - i03*ne02*ne01)/ne01;
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
const int64_t i03 = (ir0/(ne02));
// Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
// See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
// GG: this is likely the correct way to broadcast, though need some more thought
// therefore leaving the comments to remind us for now
const int64_t i02 = (i12 / (ne12 / ne02));
// Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
// const int64_t i02 = (ir0 - i03*ne02);
const int i13 = i03;
const int i12 = i02;
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
const int i0 = i01;
const int i2 = i02;
const int i3 = i03;
const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size));
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
for (int64_t ic = 0; ic < ne11; ++ic) {
vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
for (int64_t ir = ir10; ir < ir11; ++ir) {
vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
}
}
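The rewritten loops above iterate over src1 rows and recover the matching src0 plane from the ratio of the batch dimensions, which is what lets a smaller src0 be reused across several src1 planes (the Falcon multi-query-attention case referenced in the comment). A tiny standalone sketch of just that index mapping, with invented batch sizes; not part of the commit:

#include <stdint.h>
#include <stdio.h>

// Illustrative only: which src0 plane i02 each src1 plane i12 maps to when
// src0 has ne02 planes and src1 has ne12 planes, ne12 % ne02 == 0.
int main(void) {
    const int64_t ne02 = 2, ne12 = 6;            // made-up batch sizes
    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        const int64_t i02 = i12 / (ne12 / ne02); // same formula as the compute loop
        printf("src1 plane %lld -> src0 plane %lld\n", (long long) i12, (long long) i02);
    }
    // planes 0,1,2 reuse src0 plane 0; planes 3,4,5 reuse plane 1
    return 0;
}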
...
...
@@ -11743,7 +11884,7 @@ static void ggml_compute_forward_alibi_f32(
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
//const int ne2 = src0->ne[2]; // n_head -> this is k
const int ne2 = src0->ne[2]; // n_head -> this is k
//const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
...
...
@@ -11754,8 +11895,9 @@ static void ggml_compute_forward_alibi_f32(
const int nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
assert(nb0 == sizeof(float));
assert(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(ne1 + n_past == ne0);
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
...
...
@@ -11779,7 +11921,7 @@ static void ggml_compute_forward_alibi_f32(
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
pdst[0] = (i-ne0+1) * m_k + src[0];
pdst[0] = i * m_k + src[0];
}
}
...
...
@@ -11808,7 +11950,7 @@ static void ggml_compute_forward_alibi_f16(
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
//const int ne2 = src0->ne[2]; // n_head -> this is k
const int ne2 = src0->ne[2]; // n_head -> this is k
//const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
...
...
@@ -11819,8 +11961,9 @@ static void ggml_compute_forward_alibi_f16(
const int nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
assert(nb0 == sizeof(ggml_fp16_t));
assert(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
...
...
@@ -11845,7 +11988,7 @@ static void ggml_compute_forward_alibi_f16(
}
// we return F32
pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
...
...
@@ -11973,16 +12116,21 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 4);
GGML_ASSERT(ggml_nelements(src1) == 6);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
float freq_base;
float freq_scale;
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
assert(n_past >= 0);
...
...
@@ -12011,7 +12159,7 @@ static void ggml_compute_forward_rope_f32(
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
...
...
@@ -12023,7 +12171,7 @@ static void ggml_compute_forward_rope_f32(
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta = (float)p;
float theta = freq_scale * (float)p;
if (is_glm) {
theta = MIN(p, n_ctx - 2);
...
...
@@ -12100,16 +12248,21 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 4);
GGML_ASSERT(ggml_nelements(src1) == 6);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
float freq_base;
float freq_scale;
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
assert(n_past >= 0);
...
...
@@ -12138,7 +12291,7 @@ static void ggml_compute_forward_rope_f16(
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
...
...
@@ -12150,7 +12303,7 @@ static void ggml_compute_forward_rope_f16(
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta = (float)p;
float theta = freq_scale * (float)p;
if (is_glm) {
theta = MIN(p, n_ctx - 2);
...
...
@@ -12211,7 +12364,7 @@ static void ggml_compute_forward_rope_f16(
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
}
...
...
@@ -12893,12 +13046,13 @@ static void ggml_compute_forward_conv_1d(
};
}
// ggml_compute_forward_conv_2d_sk_p0
// ggml_compute_forward_conv_2d
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
static void ggml_compute_forward_conv_2d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
...
...
@@ -12918,11 +13072,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
// size of the convolution row - the kernel size unrolled across all channels
const int ew0 = nk0*nk1*ne02;
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
// TODO: fix this memset (wsize is overestimated)
memset(params->wdata, 0, params->wsize);
// prepare source data (src1)
...
...
@@ -12937,8 +13097,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
for (int i0 = 0; i0 < ne0; i0++) {
for (int ik1 = 0; ik1 < nk1; ik1++) {
for (int ik0 = 0; ik0 < nk0; ik0++) {
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
const int idx0 = i0*s0 + ik0*d0 - p0;
const int idx1 = i1*s1 + ik1*d1 - p1;
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
}
}
}
}
...
...
@@ -12965,32 +13130,36 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
for (int i2 = ip0; i2 < ip1; i2++) {
float * dst_data = (float *)((char *) dst->data + i2*nb2);
for (int i1 = 0; i1 < ne1; ++i1) {
for (int i0 = 0; i0 < ne0; ++i0) {
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
(ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
for (int i3 = 0; i3 < ne3; i3++) {
for (int i2 = ip0; i2 < ip1; i2++) {
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
for (int i1 = 0; i1 < ne1; ++i1) {
for (int i0 = 0; i0 < ne0; ++i0) {
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
(ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
}
}
}
}
}
static void ggml_compute_forward_conv_2d_sk_p0(
static void ggml_compute_forward_conv_2d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
} break;
case GGML_TYPE_F32:
{
//ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
//ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
GGML_ASSERT(false);
} break;
default:
...
...
@@ -13000,31 +13169,164 @@ static void ggml_compute_forward_conv_2d_sk_p0(
}
}
// ggml_compute_forward_conv_2d
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_conv_2d(
static void ggml_compute_forward_pool_1d_sk_p0(
const struct ggml_compute_params * params,
const enum ggml_op_pool op,
const struct ggml_tensor * src,
const int k,
struct ggml_tensor * dst) {
assert(src->type == GGML_TYPE_F32);
assert(params->ith == 0);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const char * cdata = (const char *)src->data;
const char * const data_end = cdata + ggml_nbytes(src);
float * drow = (float *)dst->data;
const int64_t rs = dst->ne[0];
while (cdata < data_end) {
const float * const srow = (const float *)cdata;
int j = 0;
for (int64_t i = 0; i < rs; ++i) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] = 0; break;
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
for (int ki = 0; ki < k; ++ki) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
++j;
}
switch (op) {
case GGML_OP_POOL_AVG: drow[i] /= k; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
cdata += src->nb[1];
drow += rs;
}
}
// ggml_compute_forward_pool_1d
static void ggml_compute_forward_pool_1d(
const struct ggml_compute_params* params,
const struct ggml_tensor* src0,
const struct ggml_tensor* src1,
const struct ggml_tensor* opt0,
struct ggml_tensor* dst) {
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
GGML_ASSERT(d0 == 1); // dilation not supported
GGML_ASSERT(d1 == 1);
GGML_ASSERT(opt0->ne[0] == 4);
const int* opts = (const int*)opt0->data;
enum ggml_op_pool op = opts[0];
const int k0 = opts[1];
const int s0 = opts[2];
const int p0 = opts[3];
GGML_ASSERT(p0 == 0); // padding not supported
GGML_ASSERT(p1 == 0);
GGML_ASSERT(k0 == s0);
// only s = k supported
if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
}
// ggml_compute_forward_pool_2d_sk_p0
static void ggml_compute_forward_pool_2d_sk_p0(
const struct ggml_compute_params * params,
const enum ggml_op_pool op,
const struct ggml_tensor * src,
const int k0,
const int k1,
struct ggml_tensor * dst) {
assert(src->type == GGML_TYPE_F32);
assert(params->ith == 0);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
else {
GGML_ASSERT(false); // only stride equal to kernel size is supported
};
const char * cdata = (const char*)src->data;
const char * const data_end = cdata + ggml_nbytes(src);
const int64_t px = dst->ne[0];
const int64_t py = dst->ne[1];
const int64_t pa = px * py;
float * dplane = (float *)dst->data;
const int ka = k0 * k1;
while (cdata < data_end) {
for (int oy = 0; oy < py; ++oy) {
float * const drow = dplane + oy * px;
for (int ox = 0; ox < px; ++ox) {
float * const out = drow + ox;
switch (op) {
case GGML_OP_POOL_AVG: *out = 0; break;
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
const int ix = ox * k0;
const int iy = oy * k1;
for (int ky = 0; ky < k1; ++ky) {
const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
for (int kx = 0; kx < k0; ++kx) {
int j = ix + kx;
switch (op) {
case GGML_OP_POOL_AVG: *out += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
switch (op) {
case GGML_OP_POOL_AVG: *out /= ka; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
cdata += src->nb[2];
dplane += pa;
}
}
// ggml_compute_forward_pool_2d
static void ggml_compute_forward_pool_2d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
GGML_ASSERT(opt0->ne[0] == 7);
const int* opts = (const int*)opt0->data;
enum ggml_op_pool op = opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
const int s1 = opts[4];
const int p0 = opts[5];
const int p1 = opts[6];
GGML_ASSERT(p0 == 0);
GGML_ASSERT(p1 == 0); // padding not supported
GGML_ASSERT(k0 == s0);
GGML_ASSERT(k1 == s1); // only s = k supported
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
}
...
...
@@ -14808,6 +15110,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
} break;
case GGML_OP_POOL_1D:
{
ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_POOL_2D:
{
ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
...
...
@@ -15452,7 +15762,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
// necessary for llama
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 4);
assert(ggml_nelements(src1) == 6);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
...
...
@@ -15473,7 +15783,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 4);
assert(ggml_nelements(src1) == 3);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
...
...
@@ -15508,6 +15818,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_POOL_1D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_POOL_2D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
...
...
@@ -15970,6 +16288,9 @@ struct ggml_compute_state_shared {
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
};
struct ggml_compute_state {
...
...
@@ -16001,6 +16322,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
int node_n = -1;
while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
return (thread_ret_t) GGML_EXIT_ABORTED;
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
// all other threads are finished and spinning
// do finalize and init here so we don't have synchronize again
...
...
@@ -16018,8 +16343,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = n_tasks_arr[node_n];
ggml_compute_forward(¶ms, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
ggml_graph_compute_perf_stats_node(node, state->shared);
}
// distribute new work or execute it direct if 1T
...
...
@@ -16049,11 +16374,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(¶ms, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
ggml_graph_compute_perf_stats_node(node, state->shared);
} else {
break;
}
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
break;
}
}
atomic_store(&state->shared->n_active, n_threads);
...
...
@@ -16087,7 +16417,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}
return 0;
return GGML_EXIT_SUCCESS;
}
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
...
...
@@ -16287,8 +16617,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
{
n_tasks = n_threads;
GGML_ASSERT(node->src[1]->ne[3] == 1);
const int64_t ne00 = node->src[0]->ne[0]; // W
const int64_t ne01 = node->src[0]->ne[1]; // H
const int64_t ne02 = node->src[0]->ne[2]; // C
...
...
@@ -16298,19 +16626,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
const int64_t ne11 = node->src[1]->ne[1]; // H
const int64_t ne12 = node->src[1]->ne[2]; // C
const int64_t ne0 = node->ne[0];
const int64_t ne1 = node->ne[1];
const int64_t ne2 = node->ne[2];
const int64_t nk = ne00*ne01;
const int64_t ew0 = nk * ne02;
UNUSED(ne02);
UNUSED(ne03);
UNUSED(nk);
UNUSED(ne2);
size_t cur = 0;
if (node->src[0]->type == GGML_TYPE_F16 &&
node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
} else if (node->src[0]->type == GGML_TYPE_F32 &&
node->src[1]->type == GGML_TYPE_F32) {
node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(float)* (ne10*ne11*ne12);
} else {
GGML_ASSERT(false);
...
...
@@ -16318,6 +16649,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
work_size = MAX(work_size, cur);
} break;
case GGML_OP_POOL_1D:
case GGML_OP_POOL_2D:
{
n_tasks = 1;
} break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
...
...
@@ -16427,7 +16763,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
return cplan;
}
void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
{
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
...
...
@@ -16453,6 +16789,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
...
...
@@ -16476,12 +16814,12 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
const int64_t perf_start_time_us = ggml_perf_time_us();
// this is a work thread too
ggml_graph_compute_thread(&workers[0]);
int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
// join thread pool
// join or kill thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
...
...
@@ -16505,6 +16843,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
(double) perf_time_us_cur / 1000.0,
(double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
}
return compute_status;
}
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
...
...
@@ -16578,9 +16918,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
}
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
//assert(cgraph->work == NULL);
//assert(cgraph->work_size == 0);
uint64_t size_eval = 0;
// compute size of intermediate results
...
...
@@ -17019,9 +17356,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT("=== GRAPH ===\n");
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
...
...
llama/ggml.h
View file @ a83eaa7a
/**
 * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
@@ -227,8 +227,13 @@
#define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGML_UNUSED(x) (void)(x)
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
...
...
@@ -389,6 +394,8 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,

        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
...
...
@@ -468,6 +475,10 @@ extern "C" {
        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];

        // abort ggml_graph_compute when true
        bool (*abort_callback)(void * data);
        void * abort_callback_data;
    };
// computation graph
...
...
@@ -1136,6 +1147,17 @@ extern "C" {
            int                   mode,
            int                   n_ctx);

    // custom RoPE, in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
            int                   mode,
            float                 freq_base,
            float                 freq_scale,
            int                   n_ctx);

    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
...
...
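ggml_rope_custom_inplace is the entry point that llama.cpp switches to for Kcur/Qcur later in this commit: it behaves like ggml_rope_inplace but lets the caller override the RoPE base frequency and the position scaling factor. A hedged sketch of a call, under the assumption that the defaults mirror the 10000.0f / 1.0f values added to the hparams below (tensor shape and helper name are illustrative only):

#include "ggml.h"

// Hedged sketch: apply the custom RoPE from this commit to a freshly created tensor.
// freq_base / freq_scale mirror the defaults this commit adds to llama_context_params.
static struct ggml_tensor * rope_example(struct ggml_context * ctx,
                                         int n_past, int n_rot, int n_head, int n_tokens) {
    struct ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_rot, n_head, n_tokens);
    return ggml_rope_custom_inplace(ctx, cur, n_past, n_rot, /*mode =*/ 0,
                                    /*freq_base =*/ 10000.0f, /*freq_scale =*/ 1.0f, /*n_ctx =*/ 0);
}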
@@ -1190,6 +1212,31 @@ extern "C" {
            int                   s,
            int                   d);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,
        GGML_OP_POOL_COUNT,
    };

    GGML_API struct ggml_tensor * ggml_pool_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
            int                   k0, // kernel size
            int                   s0, // stride
            int                   p0); // padding

    GGML_API struct ggml_tensor * ggml_pool_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
            int                   k0,
            int                   k1,
            int                   s0,
            int                   s1,
            int                   p0,
            int                   p1);

    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
...
...
@@ -1329,7 +1376,7 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
    GGML_API void              ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
    GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
...
...
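Taken together, the new abort_callback fields in ggml_cplan and the int return value of ggml_graph_compute let a caller stop a long graph evaluation cooperatively. A small sketch, assuming the plan is built with ggml_graph_plan as declared above (the stop flag and wrapper function are invented for illustration):

#include <stdbool.h>
#include <stdlib.h>
#include "ggml.h"

// Invented example predicate: abort once an external flag is set.
static bool should_abort(void * data) {
    const volatile bool * stop = data;
    return *stop;
}

// Returns GGML_EXIT_SUCCESS or GGML_EXIT_ABORTED, per the new #defines in ggml.h.
static int compute_with_abort(struct ggml_cgraph * gf, int n_threads, volatile bool * stop_flag) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size); // caller-provided work buffer
    }
    plan.abort_callback      = should_abort;
    plan.abort_callback_data = (void *) stop_flag;

    const int status = ggml_graph_compute(gf, &plan);

    free(plan.work_data);
    return status;
}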
llama/k_quants.c
View file @ a83eaa7a
/**
 * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
llama/k_quants.h
View file @ a83eaa7a
/**
 * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
@@ -41,6 +41,14 @@
#define K_SCALE_SIZE 12
#endif
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
//
// Super-block quantization structures
//
...
...
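The k_quants.h hunk adds a portable fallback so the block-size static_asserts still compile on compilers without C11 static_assert: with C11 the macro maps to _Static_assert, otherwise it degrades to a harmless struct declaration and the check is simply skipped. A tiny, hypothetical illustration of what a checked line expands to under each branch:

/* C11 and newer: the macro becomes a real compile-time check. */
_Static_assert(sizeof(char) == 1, "hypothetical size check");

/* Pre-C11: the same macro invocation becomes a no-op declaration, so the check is skipped. */
struct global_scope_noop_trick;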
llama/llama-util.h
View file @ a83eaa7a
/**
 * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
@@ -201,13 +201,13 @@ struct llama_mmap {
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_PRIVATE;
        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
#ifdef __linux__
        if (prefetch) { flags |= MAP_POPULATE; }
#endif
        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }
...
...
@@ -249,7 +249,7 @@ struct llama_mmap {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);
...
...
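Both branches of llama_mmap move from a writable, copy-on-write mapping (PROT_READ|PROT_WRITE with MAP_PRIVATE, FILE_MAP_COPY on Windows) to a read-only shared one (PROT_READ with MAP_SHARED, FILE_MAP_READ), so the mapped model weights are never duplicated per process. A stripped-down POSIX sketch of the resulting mapping, outside the llama_mmap class (path handling and error reporting are illustrative only):

#include <stddef.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Map a file read-only and shared, as the updated llama_mmap does on POSIX systems.
static void * map_readonly(const char * path, size_t * out_size) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { return NULL; }

    struct stat st;
    if (fstat(fd, &st) != 0) { close(fd); return NULL; }

    void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd); // the mapping remains valid after the descriptor is closed

    if (addr == MAP_FAILED) { return NULL; }
    *out_size = (size_t) st.st_size;
    return addr;
}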
llama/llama.cpp
View file @ a83eaa7a
/**
 * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
// memory sizes
//
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
{
    static std::map<e_model, size_t> k_sizes = {
        { MODEL_3B,    256ull * MB },
        { MODEL_7B,    512ull * MB },
        { MODEL_13B,   512ull * MB },
        { MODEL_30B,   512ull * MB },
        { MODEL_65B,  1024ull * MB },
        /* empirical scaling, still a guess */
        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
    };
    return k_sizes;
}
...
...
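As a worked example of the scaled table above: a 7B model at n_ctx = 2048 now reserves (2048/16 + 256) MB = 384 MB of scratch, and at n_ctx = 8192 it reserves 768 MB, whereas the old table used a flat 512 MB for 7B regardless of context length.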
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
{
    static std::map<e_model, size_t> k_sizes = {
        { MODEL_3B,   512ull * MB },
        { MODEL_7B,   768ull * MB },
        { MODEL_13B, 1024ull * MB },
        { MODEL_30B, 1280ull * MB },
        { MODEL_65B, 1536ull * MB },
        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
    };
    return k_sizes;
}
...
...
@@ -215,6 +216,10 @@ struct llama_hparams {
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;

    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

    bool operator!=(const llama_hparams & other) const {
...
...
@@ -329,7 +334,7 @@ struct llama_model {
};
struct llama_context {
    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
    ~llama_context() {
        if (ctx_metal) {
...
...
@@ -350,7 +355,6 @@ struct llama_context {
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    const llama_model & model;
    const llama_vocab & vocab;

    bool model_owner = false;
...
...
@@ -577,7 +581,9 @@ struct llama_file_loader {
        }

        // skip to the next multiple of 32 bytes
        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        }

        tensor.file_off = file.tell();
        tensor.name = name;
...
...
@@ -674,7 +680,7 @@ struct llama_model_loader {
        *ctx_size_p = *mmapped_size_p = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
        }
    }
...
...
@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers =*/ 0,
        /*.main_gpu =*/ 0,
        /*.tensor_split =*/ {0},
        /*.rope_freq_base =*/ 10000.0f,
        /*.rope_freq_scale =*/ 1.0f,
        /*.progress_callback =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram =*/ false,
...
...
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
}

int llama_max_devices() {
    return LLAMA_MAX_DEVICES;
}

bool llama_mmap_supported() {
    return llama_mmap::SUPPORTED;
}
...
...
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
...
...
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
        }

        hparams.n_ctx = n_ctx;

        hparams.rope_freq_base = rope_freq_base;
        hparams.rope_freq_scale = rope_freq_scale;
    }

    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

    {
        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
        fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
        fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
    }

    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
...
...
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
        const size_t mem_required =
            ctx_size +
            mmapped_size - vram_weights + // weights in VRAM not in memory
            MEM_REQ_SCRATCH0().at(model.type) +
            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
            MEM_REQ_SCRATCH1().at(model.type) +
            MEM_REQ_EVAL().at(model.type);
            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
...
...
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
        int n_gpu_layers,
        int main_gpu,
        float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
...
...
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
...
...
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
    const int n_rot = hparams.n_embd/hparams.n_head;
    const int n_gpu_layers = model.n_gpu_layers;

    const float freq_base = hparams.rope_freq_base;
    const float freq_scale = hparams.rope_freq_scale;

    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute = lctx.buf_compute;
...
...
@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
                offload_func_kq(tmpq);
                ggml_set_name(tmpq, "tmpq");

                struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
                struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                offload_func_kq(Kcur);
                ggml_set_name(Kcur, "Kcur");

                struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
                struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                offload_func_kq(Qcur);
                ggml_set_name(Qcur, "Qcur");
...
...
@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
    }

    // Normalize the second derivatives
    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
    for (float & value : second_derivatives) {
        value /= second_derivatives_sum;
    {
        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);

        if (second_derivatives_sum > 1e-6f) {
            for (float & value : second_derivatives) {
                value /= second_derivatives_sum;
            }
        } else {
            for (float & value : second_derivatives) {
                value = 1.0f / second_derivatives.size();
            }
        }
    }

    float cum_sum = 0.0f;
...
...
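The guard added above matters when every second derivative is (near) zero, e.g. for an essentially flat candidate distribution: instead of dividing by a sum close to zero, each normalized value is set to 1.0f / second_derivatives.size(), so the cumulative-sum loop that follows still operates on a well-formed distribution.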
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
        struct llama_context * guidance_ctx,
        float scale,
        float smooth_factor) {
    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
    int64_t t_start_sample_us = ggml_time_us();

    assert(ctx);
    auto n_vocab = llama_n_vocab(ctx);
...
...
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                params.progress_callback_user_data)) {
        delete model;
        fprintf(stderr, "%s: failed to load model\n", __func__);
        return nullptr;
...
...
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
        return nullptr;
    }

    llama_context * ctx = new llama_context(*model, model->vocab);
    llama_context * ctx = new llama_context(*model);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
...
...
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
        ctx->embedding.resize(hparams.n_embd);
    }

    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
    ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
    ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
}
...
...
@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
    return 0;
}

int llama_tokenize(struct llama_context * ctx,
int llama_tokenize_with_model(const struct llama_model * model,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos) {
    auto res = llama_tokenize(ctx->vocab, text, add_bos);
    auto res = llama_tokenize(model->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        fprintf(stderr, "%s: too many tokens\n", __func__);
...
...
@@ -3581,8 +3615,29 @@ int llama_tokenize(
    return res.size();
}

int llama_tokenize(
        struct llama_context * ctx,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos) {
    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
}

int llama_n_vocab_from_model(const struct llama_model * model) {
    return model->vocab.id_to_token.size();
}

int llama_n_ctx_from_model(const struct llama_model * model) {
    return model->hparams.n_ctx;
}

int llama_n_embd_from_model(const struct llama_model * model) {
    return model->hparams.n_embd;
}

int llama_n_vocab(const struct llama_context * ctx) {
    return ctx->vocab.id_to_token.size();
    return ctx->model.vocab.id_to_token.size();
}

int llama_n_ctx(const struct llama_context * ctx) {
...
...
@@ -3593,19 +3648,27 @@ int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
}

int llama_get_vocab(const struct llama_context * ctx,
int llama_get_vocab_from_model(const struct llama_model * model,
        const char * * strings,
        float * scores,
        int capacity) {
    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
    for (int i = 0; i < n; ++i) {
        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
        scores[i] = ctx->vocab.id_to_token[i].score;
        strings[i] = model->vocab.id_to_token[i].tok.c_str();
        scores[i] = model->vocab.id_to_token[i].score;
    }
    return n;
}

int llama_get_vocab(const struct llama_context * ctx,
        const char * * strings,
        float * scores,
        int capacity) {
    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
}

float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
}
...
...
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
}

const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
    if (token >= llama_n_vocab(ctx)) {
const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
    if (token >= llama_n_vocab_from_model(model)) {
        return nullptr;
    }

    return ctx->vocab.id_to_token[token].tok.c_str();
    return model->vocab.id_to_token[token].tok.c_str();
}

const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
    return llama_token_to_str_with_model(&ctx->model, token);
}

llama_token llama_token_bos() {
...
...
llama/llama.h
View file @ a83eaa7a
/**
 * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
*
* MIT License
*
...
...
@@ -115,6 +115,11 @@ extern "C" {
        int32_t n_gpu_layers;                    // number of layers to store in VRAM
        int32_t main_gpu;                        // the GPU that is used for scratch and small tensors
        float   tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float   rope_freq_base;  // RoPE base frequency
        float   rope_freq_scale; // RoPE frequency scaling factor

        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
        // context pointer passed to the progress callback
...
...
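A hedged sketch of how a caller might use the two new context parameters, for example to stretch the usable context window via linear position scaling (the values and model path are placeholders, not part of this commit):

#include <stddef.h>
#include "llama.h"

// Illustrative only: scale RoPE positions by 0.5 to run a 2048-token model at n_ctx = 4096.
int main(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 4096;
    cparams.rope_freq_base  = 10000.0f; // default base frequency
    cparams.rope_freq_scale = 0.5f;     // compress positions 2x

    struct llama_model * model = llama_load_model_from_file("model.ggml", cparams);
    if (model == NULL) {
        return 1;
    }
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... evaluate tokens with llama_eval() as usual ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}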
@@ -174,6 +179,8 @@ extern "C" {
        int32_t n_eval;
    };

    LLAMA_API int llama_max_devices();

    LLAMA_API struct llama_context_params llama_context_default_params();
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
...
...
@@ -296,10 +303,21 @@ extern "C" {
            int n_max_tokens,
            bool add_bos);

    LLAMA_API int llama_tokenize_with_model(
            const struct llama_model * model,
            const char * text,
            llama_token * tokens,
            int n_max_tokens,
            bool add_bos);

    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);

    // Get the vocabulary as output parameters.
    // Returns number of results.
    LLAMA_API int llama_get_vocab(
...
...
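The new *_with_model variants make it possible to tokenize text and inspect the vocabulary without ever creating a llama_context. A hedged usage sketch (prompt, buffer size and model path are placeholders):

#include <stdio.h>
#include "llama.h"

// Illustrative use of the model-level API added in this commit.
int main(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.vocab_only = true; // no weights needed just to tokenize

    struct llama_model * model = llama_load_model_from_file("model.ggml", cparams);
    if (model == NULL) {
        return 1;
    }

    llama_token tokens[64];
    const int n = llama_tokenize_with_model(model, "Hello world", tokens, 64, /*add_bos =*/ true);
    for (int i = 0; i < n; i++) {
        printf("%d -> %s\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
    }

    llama_free_model(model);
    return n < 0 ? 1 : 0;
}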
@@ -308,6 +326,12 @@ extern "C" {
            float * scores,
            int capacity);

    LLAMA_API int llama_get_vocab_from_model(
            const struct llama_model * model,
            const char * * strings,
            float * scores,
            int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
...
...
@@ -320,7 +344,13 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
    LLAMA_API const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
...
...