OpenDAS / ollama

Commit a83eaa7a, authored Jul 19, 2023 by Michael Yang

    update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc

Parent: 5156e48c

Showing 12 changed files with 1708 additions and 650 deletions (+1708 -650)
llama/ggml-cuda.cu      +563  -62
llama/ggml-cuda.h         +1   -1
llama/ggml-metal.h        +1   -1
llama/ggml-metal.m       +45  -35
llama/ggml-metal.metal  +390 -329
llama/ggml.c            +493 -159
llama/ggml.h             +49   -2
llama/k_quants.c          +1   -1
llama/k_quants.h          +9   -1
llama/llama-util.h        +4   -4
llama/llama.cpp         +120  -53
llama/llama.h            +32   -2
llama/ggml-cuda.cu

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -39,6 +39,8 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
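Background note: __dp4a is the CUDA intrinsic (compute capability 6.1, sm_61, and newer) that computes a dot product of four packed 8-bit values and adds it to a 32-bit accumulator; the new MIN_CC_DP4A constant encodes that requirement so the quantized dot-product kernels further down can guard on it. Below is a minimal, self-contained sketch of that guard pattern; the helper name dot4_example and its fallback loop are illustrative only and not part of this commit.

#include <cuda_runtime.h>
#include <stdint.h>

#define MIN_CC_DP4A 610 // same threshold as in the diff: __dp4a needs sm_61 or newer

// Illustrative helper (not from the commit): signed byte-wise dot product of two
// packed 32-bit values, accumulated into acc, with a portable fallback.
static __device__ __forceinline__ int dot4_example(int a, int b, int acc) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a, b, acc); // hardware: acc + sum of the 4 signed byte products
#else
    // Older architectures: unpack each byte and accumulate manually.
    int sum = acc;
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)(a >> (8*i));
        const int8_t bi = (int8_t)(b >> (8*i));
        sum += (int)ai * (int)bi;
    }
    return sum;
#endif
}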
@@ -100,7 +102,7 @@ typedef void (*ggml_cuda_op_t)(
 #define QK4_0 32
 #define QR4_0 2
-#define QI4_0 4
+#define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
     half    d;              // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -109,7 +111,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QK4_1 32
 #define QR4_1 2
-#define QI4_1 4
+#define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
     half d;  // delta
     half m;  // min
@@ -119,7 +121,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 #define QK5_0 32
 #define QR5_0 2
-#define QI5_0 4
+#define QI5_0 (QK5_0 / (4 * QR5_0))
 typedef struct {
     half d;         // delta
     uint8_t qh[4];  // 5-th bit of quants
@@ -129,7 +131,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QK5_1 32
 #define QR5_1 2
-#define QI5_1 4
+#define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
     half d;  // delta
     half m;  // min
@@ -140,7 +142,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 #define QK8_0 32
 #define QR8_0 1
-#define QI8_0 8
+#define QI8_0 (QK8_0 / (4 * QR8_0))
 typedef struct {
     half   d;          // delta
     int8_t qs[QK8_0];  // quants
@@ -149,7 +151,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
 #define QK8_1 32
 #define QR8_1 1
-#define QI8_1 8
+#define QI8_1 (QK8_1 / (4 * QR8_1))
 typedef struct {
     half d;  // delta
     half s;  // unquantized sum
@@ -169,6 +171,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
 #define K_SCALE_SIZE 12
 #endif
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
@@ -177,6 +181,8 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -189,6 +195,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2];  // super-block scales/mins
@@ -206,6 +214,8 @@ typedef struct {
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 #endif
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d;  // super-block scale
@@ -225,6 +235,8 @@ typedef struct {
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 #endif
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
 typedef struct {
     uint8_t ql[QK_K/2];  // quants, lower 4 bits
     uint8_t qh[QK_K/4];  // quants, upper 2 bits
@@ -238,6 +250,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
+#define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
@@ -265,13 +278,13 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
-static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (i >= kx) {
         return;
     }
-    dst[i] = x[i] + y[i];
+    dst[i] = x[i] + y[i%ky];
 }
 
 static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -292,6 +305,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] * y[i%ky];
 }
 
+static __global__ void gelu_f32(const float * x, float * dst, const int k) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+}
+
 static __global__ void silu_f32(const float * x, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
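For reference, the added gelu_f32 kernel evaluates the standard tanh approximation of GELU, which the two constants encode: GELU(x) ≈ 0.5·x·(1 + tanh(sqrt(2/π)·(x + 0.044715·x³))), with sqrt(2/π) ≈ 0.7978845608. The kernel factors the inner term as sqrt(2/π)·x·(1 + 0.044715·x²), which is algebraically the same expression.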
@@ -301,16 +327,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    const float eps = 1e-5f;
+
+    float mean = 0.0f;
+    float var = 0.0f;
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        const float xi = x[row*ncols + col];
+        mean += xi;
+        var += xi * xi;
+    }
+
+    // sum up partial sums
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
+        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    }
+
+    mean /= ncols;
+    var = var / ncols - mean * mean;
+    const float inv_var = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    }
+}
+
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6;
+    const float eps = 1e-6f;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
@@ -322,10 +378,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     const float mean = tmp / ncols;
-    const float scale = 1.0f / sqrtf(mean + eps);
+    const float scale = rsqrtf(mean + eps);
 
-    for (int i = 0; i < ncols; i += WARP_SIZE) {
-        const int col = i + tid;
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -1254,8 +1309,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
     y[iybs + iqs + y_offset] = v.y;
 }
 
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1276,11 +1332,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
     const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1301,11 +1358,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
     return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
     int qs;
@@ -1336,11 +1394,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
     const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1370,11 +1429,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric
     return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
     int vi;
@@ -1389,7 +1449,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric
     return sumi*d;
 #else
     return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq2_K->d;
+    const float dmin = bq2_K->dmin;
+
+    const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = bq2_K->scales[scale_offset + 2*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    float sumf = 0.0f;
+
+    const float d = bq3_K->d;
+
+    int vl;
+    memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
+    vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
+    vh >>= bq8_offset;
+
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = QR4_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int bq8_offset = QR5_K * (iqs / QI8_1);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float d = bq5_K->d;
+    const float dmin = bq5_K->dmin;
+
+    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+
+    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+
+    for (int i = 0; i < QR5_K; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> i) << 4) & 0x10101010;
+
+        const int vi = vil | vih;
+
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    float sumf = 0.0f;
+
+    const float d = bq6_K->d;
+
+    int vl;
+    memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
+
+    int vh;
+    memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
+
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = bq6_K->scales[scale_offset + 4*i];
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
+        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
+        const float d8i = bq8i->d;
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1412,7 +1685,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
 
-        const int iby = i + threadIdx.x / qi; // y block index
+        const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
 
         const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
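Arithmetic behind the iby change, using constants defined earlier in this file: each q8_1 block covers QK8_1 = 32 values, while a K-quant x block covers qk = QK_K values (256 in the usual build), so one x block lines up with qk/QK8_1 = 256/32 = 8 consecutive y blocks and the y index has to be scaled by that factor. For the non-K quant types qk is 32, the factor is 1, and the old behaviour is preserved.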
@@ -1650,6 +1923,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const float col_theta_scale = powf(theta_scale, col);
+
+    const float theta = p*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = block_p*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
 static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1715,9 +2022,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
 static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -1730,11 +2037,22 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 }
 
+static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
@@ -1900,7 +2218,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1909,7 +2227,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1918,7 +2236,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1927,7 +2245,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1936,7 +2254,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -1944,6 +2262,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2036,6 +2399,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
 }
 
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(nrows % 4 == 0);
+    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2263,16 +2634,19 @@ inline void ggml_cuda_op_add(
     GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
-    const int64_t ne0 = src0->ne[0];
+    const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
     // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
     } else {
         GGML_ASSERT(false);
     }
@@ -2291,27 +2665,41 @@ inline void ggml_cuda_op_mul(
     GGML_ASSERT(src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
-    for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
-        const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
-
-        float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
-        float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
-        float * dst_ddf_i01  = dst_ddf_i  + i01*ne00;
-
-        // compute
-        mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    }
+    mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
 
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
 }
 
+inline void ggml_cuda_op_gelu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_silu(
@@ -2336,6 +2724,28 @@ inline void ggml_cuda_op_silu(
     (void) i1;
 }
 
+inline void ggml_cuda_op_norm(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    // compute
+    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2376,13 +2786,22 @@ inline void ggml_cuda_op_mul_mat_vec(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+    bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
         src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 ||
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;
+#if QK_K == 256
+    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
+        src0->type == GGML_TYPE_Q2_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q5_K ||
+        src0->type == GGML_TYPE_Q6_K;
+#endif // QK_K == 256
 
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
 #endif
 
     if (use_mul_mat_vec_q) {
@@ -2408,6 +2827,21 @@ inline void ggml_cuda_op_mul_mat_vec(
             case GGML_TYPE_Q8_0:
                 mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
                 break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
             default:
                 GGML_ASSERT(false);
                 break;
@@ -2542,13 +2976,21 @@ inline void ggml_cuda_op_rope(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
-    GGML_ASSERT(mode == 0);
+    const int n_ctx  = ((int32_t *) src1->data)[3];
 
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
     const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
 
+    bool is_glm = mode & 4;
+
     // compute
-    rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    if (is_glm) {
+        const float id_p = min(p, n_ctx - 2.f);
+        const float block_p = max(p - (n_ctx - 2.f), 0.f);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else {
+        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+    }
 
     (void) dst;
     (void) src0_ddq_i;
@@ -2951,11 +3393,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
 }
 
+void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+}
+
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
 }
 
+void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+}
+
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3111,6 +3563,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     (void) dst;
 }
 
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@@ -3186,7 +3643,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         }
 
-        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
@@ -3220,6 +3677,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
@@ -3228,7 +3701,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }
@@ -3237,8 +3710,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -3253,10 +3725,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
         }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
...
@@ -3269,6 +3743,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
...
@@ -3269,6 +3743,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
CUDA_CHECK
(
cudaMalloc
(
&
data
,
g_scratch_size
));
CUDA_CHECK
(
cudaMalloc
(
&
data
,
g_scratch_size
));
g_scratch_buffer
=
data
;
g_scratch_buffer
=
data
;
}
}
extra
=
ggml_cuda_alloc_temp_tensor_extra
();
extra
->
data_device
[
g_main_device
]
=
data
+
g_scratch_offset
;
extra
->
data_device
[
g_main_device
]
=
data
+
g_scratch_offset
;
g_scratch_offset
+=
size
;
g_scratch_offset
+=
size
;
...
@@ -3278,6 +3753,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
...
@@ -3278,6 +3753,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
void
*
data
;
void
*
data
;
CUDA_CHECK
(
cudaMalloc
(
&
data
,
size
));
CUDA_CHECK
(
cudaMalloc
(
&
data
,
size
));
CUDA_CHECK
(
cudaMemset
(
data
,
0
,
size
));
CUDA_CHECK
(
cudaMemset
(
data
,
0
,
size
));
extra
=
new
ggml_tensor_extra_gpu
;
memset
(
extra
,
0
,
sizeof
(
*
extra
));
extra
->
data_device
[
g_main_device
]
=
data
;
extra
->
data_device
[
g_main_device
]
=
data
;
}
}
...
@@ -3330,6 +3807,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
...
@@ -3330,6 +3807,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
||
(
tensor
->
src
[
1
]
!=
nullptr
&&
tensor
->
src
[
1
]
->
backend
==
GGML_BACKEND_GPU
);
||
(
tensor
->
src
[
1
]
!=
nullptr
&&
tensor
->
src
[
1
]
->
backend
==
GGML_BACKEND_GPU
);
switch
(
tensor
->
op
)
{
switch
(
tensor
->
op
)
{
case
GGML_OP_DUP
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_dup
;
break
;
case
GGML_OP_ADD
:
case
GGML_OP_ADD
:
if
(
!
any_on_device
)
{
if
(
!
any_on_device
)
{
return
false
;
return
false
;
...
@@ -3342,12 +3825,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
...
@@ -3342,12 +3825,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
}
}
func
=
ggml_cuda_mul
;
func
=
ggml_cuda_mul
;
break
;
break
;
case
GGML_OP_GELU
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_gelu
;
break
;
case
GGML_OP_SILU
:
case
GGML_OP_SILU
:
if
(
!
any_on_device
)
{
if
(
!
any_on_device
)
{
return
false
;
return
false
;
}
}
func
=
ggml_cuda_silu
;
func
=
ggml_cuda_silu
;
break
;
break
;
case
GGML_OP_NORM
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_norm
;
break
;
case
GGML_OP_RMS_NORM
:
case
GGML_OP_RMS_NORM
:
if
(
!
any_on_device
)
{
if
(
!
any_on_device
)
{
return
false
;
return
false
;
...
@@ -3372,6 +3867,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
...
@@ -3372,6 +3867,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
}
}
func
=
ggml_cuda_cpy
;
func
=
ggml_cuda_cpy
;
break
;
break
;
case
GGML_OP_CONT
:
if
(
!
any_on_device
)
{
return
false
;
}
func
=
ggml_cuda_dup
;
break
;
case
GGML_OP_RESHAPE
:
case
GGML_OP_RESHAPE
:
case
GGML_OP_VIEW
:
case
GGML_OP_VIEW
:
case
GGML_OP_PERMUTE
:
case
GGML_OP_PERMUTE
:
...
...
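The view, copy and scratch branches above now take their ggml_tensor_extra_gpu from a fixed pool that is recycled round-robin instead of calling new for every node on every graph evaluation. A minimal standalone C sketch of that pattern, with simplified stand-in names and types (not the real CUDA structures):

#include <string.h>

#define POOL_SIZE 4096                      /* stand-in for GGML_MAX_NODES */

typedef struct { void * data_device; } extra_t;   /* simplified stand-in for ggml_tensor_extra_gpu */

static extra_t pool[POOL_SIZE];
static size_t  pool_index = 0;

/* hand out a zeroed slot; slots are reused after POOL_SIZE allocations, which is
   safe here because a temporary extra only has to live for one graph evaluation */
static extra_t * alloc_temp_extra(void) {
    extra_t * e = &pool[pool_index];
    pool_index = (pool_index + 1) % POOL_SIZE;
    memset(e, 0, sizeof(*e));
    return e;
}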
llama/ggml-cuda.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/ggml-metal.h

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
...
llama/ggml-metal.m

 // +build darwin

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                     } break;
                 case GGML_TYPE_Q5_K:
@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                     } break;
                 case GGML_TYPE_Q6_K:
@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);

-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                     } break;
                 default:
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
                 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];

-                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                    src0t == GGML_TYPE_Q4_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q5_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
+                else if (src0t == GGML_TYPE_Q6_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1) / 2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 }
                 else if (src0t == GGML_TYPE_Q2_K ||
-                         src0t == GGML_TYPE_Q3_K ||
-                         src0t == GGML_TYPE_Q4_K ||
-                         src0t == GGML_TYPE_Q5_K ||
-                         src0t == GGML_TYPE_Q6_K) {
+                         src0t == GGML_TYPE_Q3_K) {
                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                 } else {
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(
                 const float eps = 1e-6f;

-                const int nth = 256;
+                const int nth = 512;

                 [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                 [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

                 const int64_t nrows = ggml_nrows(src0);
@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(
                 const int n_past = ((int32_t *)(src1->data))[0];

+                float freq_base;
+                float freq_scale;
+                memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                 [encoder setComputePipelineState:ctx->pipeline_rope];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                 [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                 [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                 [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                 [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                 [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                 [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
                 [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
                 [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
                 [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
                 [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
                 [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
                 [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                 [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                 [encoder setBytes:&n_past length:sizeof(   int) atIndex:18];
                 [encoder setBytes:&n_dims length:sizeof(   int) atIndex:19];
                 [encoder setBytes:&mode   length:sizeof(   int) atIndex:20];
+                [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
...
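Two of the numbers above are easy to sanity-check by hand. This small standalone C sketch (not part of the commit; the row count is illustrative) reproduces the arithmetic: the rms_norm encoder now only needs one float of threadgroup memory per 32-wide SIMD group, and the Q4_K path launches one threadgroup per 8 output rows.

#include <stdio.h>

int main(void) {
    const int nth        = 512;   // threads per threadgroup for rms_norm
    const int simd_width = 32;    // assumed SIMD group width
    printf("rms_norm threadgroup floats: %d (was %d)\n", nth / simd_width, nth);

    const int ne01 = 4096;        // example row count of src0
    printf("Q4_K threadgroups along rows: %d\n", (ne01 + 7) / 8);
    return 0;
}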
llama/ggml-metal.metal

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -357,26 +357,33 @@ kernel void kernel_rms_norm(
         threadgroup float  * sum [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+    float4 sumf=0;
+    float all_sum=0;

     // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00] * x[i00];
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        sumf += x[i00] * x[i00];
     }
+    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
+    all_sum = simd_sum(all_sum);
+    if (tiisg == 0) {
+        sum[sgitg] = all_sum;
+    }

-    // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    // broadcast
+    // broadcast, simd group number is ntg / 32
+    for (int i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+    }
     if (tpitg == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
         sum[0] /= ne00;
     }
@@ -385,147 +392,127 @@ kernel void kernel_rms_norm(
     const float mean  = sum[0];
     const float scale = 1.0f/sqrt(mean + eps);

-    device float * y = dst + tgpig*ne00;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    device float4 * y = (device float4 *) (dst + tgpig*ne00);
+    device float * y_scalar = (device float *) y;
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
+    if (tpitg == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+    }
 }

-kernel void kernel_mul_mat_q4_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne10,
-        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_0;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
-    device const float * y = (device const float *) src1 + r1*ne10;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float * yl = y + i * QK4_0 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
-            acc[1] += yl[j] + yl[j+16];
-        }
-
-        sumf += d * (acc[0] - 8.f*acc[1]);
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-}
+// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
+float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
+    float d = qb_curr->d;
+    float4 acc = 0.f;
+    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
+    for (int i = 0; i < 16; i+=2) {
+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+        acc[2] += yl[i + 1]  * (qs[i / 2] & 0x0F00);
+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    }
+    return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
+}
+
+// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
+float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+    float4 acc = 0.f;
+    device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
+    for (int i = 0; i < 16; i+=2) {
+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+        acc[2] += yl[i + 1]  * (qs[i / 2] & 0x0F00);
+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+    }
+    return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
+}
+
+// putting them in the kernel cause a significant performance penalty
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+template<typename block_q_type>
+void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
+                     int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
+                     uint2 tgpig, uint tiisg, uint sgitg) {
+    const int nb = ne00/QK4_0;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+    device const float * y = (device const float *) src1 + r1*ne10;
+    float4 y_curr[8];       // src1 vector cache
+    float sumf[N_DST]={0.f}, all_sum;
+    thread float * yl=(thread float *)y_curr;
+
+    // each thread in a SIMD group deals with 1 block.
+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row++) {
+            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
+        }
+    }
+
+    // from now loads two rows every time and 16 blocks per row
+    int ir = tiisg / (N_SIMDWIDTH / 2);
+    int ib = tiisg % (N_SIMDWIDTH / 2);
+    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
+        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
+        float sumy = 0;
+        for (int i = 0; i < QK4_0 / 4; i++) {
+            y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+        }
+
+        for (int row = 0; row < N_DST; row+=2) {
+            if (nb_start + ib < nb) {
+                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
+            }
+        }
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
+            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
+        }
+    }
+}
+
+kernel void kernel_mul_mat_q4_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+}

-kernel void kernel_mul_mat_q4_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne10,
-        constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-    const int nb = ne00/QK4_1;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
-    device const float * y = (device const float *) src1 + r1*ne10;
-
-    const uint nth = tptg.x*tptg.y;
-    const uint ith = tptg.y*tpitg.x + tpitg.y;
-
-    const int ix = tpitg.y/4;           // 0 or 1
-    const int iy = tpitg.y - 4*ix;      // 0...3
-
-    const int first = 4 * iy;
-
-    float sumf = 0;
-
-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
-
-        const float d = (float)x[i].d;
-        const float m = (float)x[i].m;
-
-        device const uint8_t * xl = x[i].qs + first;
-        device const float * yl = y + i * QK4_1 + first;
-
-        float2 acc = {0.0f, 0.0f};
-
-        for (int j = 0; j < 4; ++j) {
-            acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
-            acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
-        }
-
-        sumf += acc[0] + acc[1];
-    }
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-}
+kernel void kernel_mul_mat_q4_1_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+}

 kernel void kernel_mul_mat_f16_f32(
@@ -641,17 +628,19 @@ kernel void kernel_rope(
         constant       int & n_past,
         constant       int & n_dims,
         constant       int & mode,
+        constant     float & freq_base,
+        constant     float & freq_scale,
         uint3 tpig[[thread_position_in_grid]]) {
     const int64_t i3 = tpig[2];
     const int64_t i2 = tpig[1];
     const int64_t i1 = tpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = (float)p;
+    float theta = freq_scale * (float)p;

     if (!is_neox) {
         for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -1489,6 +1478,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 }

+#if QK_K == 256
 kernel void kernel_mul_mat_q4_K_f32(
         device const  void * src0,
         device const float * src1,
@@ -1496,131 +1486,180 @@ kernel void kernel_mul_mat_q4_K_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
+        constant   int64_t & ne01[[buffer(4)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
-    device const float * yy = (device const float *) src1 + r1*ne10;
-
-    float sumf = 0;
-
-#if QK_K == 256
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;

-    const int tid = tpitg.y;   // 0...16
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uchar2 sc1, sc2, sc3, sc4;
-
-    for (int i = tpitg.x; i < nb; i += tptg.x) {
-
-        device const uint8_t * q1 = (x + i)->qs + q_offset;
-        device const uint8_t * q2 = q1 + 64;
-        device const float * y1 = yy + i*QK_K + y_offset;
-        device const float * y2 = y1 + 128;
-
-        const float dall = (float)((x + i)->d);
-        const float dmin = (float)((x + i)->dmin);
-
-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
-            s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
-        }
-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
-    }
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int im = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir;
+
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    for (int ib = ix; ib < nb; ib += 4) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im;
+        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & kmask1;
+            sc16[1] = sc[2] & kmask1;
+            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
+            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
+
+            device const uint16_t * q2 = q1 + 32;
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
+                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
+                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
+                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
+                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
+                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
+                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
+                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
+                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = all_sum;
+        }
+    }
+}
 #else
-    uint16_t aux16[2];
-    thread const uint8_t * scales = (thread const uint8_t *)aux16;
-
-    const int il = 4*tpitg.x;
-
-    for (int i = tpitg.y; i < nb; i += tptg.y) {
-
-        device const uint8_t * q = x[i].qs + il;
-        device const float * y = yy + i * QK_K + il;
-
-        const float d = (float)x[i].d[0];
-        const float m = (float)x[i].d[1];
-
-        device const uint16_t * a = (device const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        for (int l = 0; l < 4; ++l) {
-            sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
-                  + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
-        }
-    }
-#endif
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    // This version is slightly faster than the commented out one below,
-    // which I copy-pasted from ggerganov's q4_0 dot product for metal.
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
-
-    //// accumulate the sum from all threads in the threadgroup
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (uint i = nth/2; i > 0; i /= 2) {
-    //    if (ith < i) {
-    //        sum[ith] += sum[ith + i];
-    //    }
-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
-    //}
-    //if (ith == 0) {
-    //    dst[r1*ne0 + r0] = sum[0];
-    //}
-}
+kernel void kernel_mul_mat_q4_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne10,
+        constant   int64_t & ne0,
+        constant   int64_t & ne01[[buffer(4)]],
+        uint2 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int ix = tiisg/4;  // 0...7
+    const int it = tiisg%4;  // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
+    device const float      * y = (device const float      *) src1 + r1*ne10;
+    float yl[8];
+    float yh[8];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 8 * it;
+
+    uint16_t sc16[4];
+
+    for (int ib = ix; ib < nb; ib += 8) {
+
+        float2 sumy = {0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
+            yh[i] = y4[i+32]; sumy[1] += yh[i];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
+        device const half     * dh = x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & 0x000f;
+            sc16[1] = sc[0] & 0x0f00;
+            sc16[2] = sc[0] & 0x00f0;
+            sc16[3] = sc[0] & 0xf000;
+
+            float2 acc1 = {0.f, 0.f};
+            float2 acc2 = {0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
+                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
+                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
+                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
+
+            qs += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 8 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = all_sum;
+        }
+    }
+}
+#endif

 kernel void kernel_mul_mat_q5_K_f32(
         device const  void * src0,
@@ -1629,39 +1668,39 @@ kernel void kernel_mul_mat_q5_K_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

     const int nb = ne00/QK_K;

     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;

-    device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
     device const float * yy = (device const float *) src1 + r1*ne10;

-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
-
-    float sumf = 0;
+    float sumf[2]={0.f};
+
+    const int step = sizeof(block_q5_K) * nb;

 #if QK_K == 256
+#
+    float yl[16], yh[16];

     const uint16_t kmask1 = 0x3f3f;
     const uint16_t kmask2 = 0x0f0f;
     const uint16_t kmask3 = 0xc0c0;

-    const int tid = tpitg.y;   // 0...16
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 4;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int im  = tid/4;
+    const int ir  = tid%4;
+    const int n   = 8;
+
+    const int l0 = n*ir;
     const int q_offset = 32*im + l0;
     const int y_offset = 64*im + l0;
@@ -1670,78 +1709,114 @@ kernel void kernel_mul_mat_q5_K_f32(
     const uint8_t hm3 = hm1 << 4;
     const uint8_t hm4 = hm2 << 4;

-    uchar2 sc1, sc2, sc3, sc4;
-
-    for (int i = tpitg.x; i < nb; i += tptg.x) {
-
-        device const uint8_t * q1 = (x + i)->qs + q_offset;
-        device const uint8_t * q2 = q1 + 64;
-        device const uint8_t * qh = (x + i)->qh + l0;
-        device const float * y1 = yy + i*QK_K + y_offset;
-        device const float * y2 = y1 + 128;
-
-        const float dall = (float)((x + i)->d);
-        const float dmin = (float)((x + i)->dmin);
-
-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
-            s[1] += y1[l+32] * ((q1[l] >> 4) + (qh[l] & hm2 ? 16 : 0));
-            s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
-            s[3] += y2[l+32] * ((q2[l] >> 4) + (qh[l] & hm4 ? 16 : 0));
-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
-        }
-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
-    }
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    device const float * y1 = yy + ix*QK_K + y_offset;
+
+    for (int i = ix; i < nb; i += 4) {
+
+        device const uint8_t * q1 = x[i].qs + q_offset;
+        device const uint8_t * qh = x[i].qh + l0;
+        device const half * dh = &x[i].d;
+        device const uint16_t * a = (device const uint16_t *)x[i].scales + im;
+
+        device const float * y2 = y1 + 128;
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 8; ++l) {
+            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
+            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
+            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
+            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
+        }
+
+        for (int row = 0; row < 2; ++row) {
+
+            device const uint8_t * q2 = q1 + 64;
+
+            sc16[0] = a[0] & kmask1;
+            sc16[1] = a[2] & kmask1;
+            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
+            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
+
+            float4 acc = {0.f, 0.f, 0.f, 0.f};
+            for (int l = 0; l < n; ++l) {
+                uint8_t h = qh[l];
+                acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
+                acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
+                acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
+                acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
+            }
+            const float dall = dh[0];
+            const float dmin = dh[1];
+            sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            qh += step;
+            dh += step/2;
+            a  += step/2;
+        }
+
+        y1 += 4 * QK_K;
+    }
 #else
-    const int il = 4 * tpitg.x;  // 0, 4, 8, 12
-    const int im = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    for (int i = tpitg.y; i < nb; i += tptg.y) {
-
-        const float d = (float)x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-        device const float   * y = yy + i*QK_K + il;
-
-        for (int l = 0; l < 4; ++l) {
-            const uint8_t hl = h[l] >> im;
-            sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
-                  + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
-                  + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
-                  + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
-        }
-    }
+    float yl[8], yh[8];
+
+    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
+    const int ix = tiisg%8;
+    const int im = il/8;           // 0, 0, 1, 1
+    const int in = il%8;           // 0, 4, 0, 4
+
+    device const float * y = yy + ix*QK_K + il;
+
+    for (int i = ix; i < nb; i += 8) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 4; ++l) {
+            yl[l+0] = y[l+ 0];
+            yl[l+4] = y[l+16];
+            yh[l+0] = y[l+32];
+            yh[l+4] = y[l+48];
+        }
+
+        device const half * dh = &x[i].d;
+        device const uint8_t * q = x[i].qs + il;
+        device const uint8_t * h = x[i].qh + in;
+        device const int8_t  * s = x[i].scales;
+
+        for (int row = 0; row < 2; ++row) {
+
+            const float d = dh[0];
+
+            float2 acc = {0.f, 0.f};
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t hl = h[l] >> im;
+                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
+                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
+                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
+                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
+            }
+            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
+
+            q  += step;
+            h  += step;
+            s  += step;
+            dh += step/2;
+        }
+
+        y += 8 * QK_K;
+    }
 #endif
-
-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
+
+    for (int row = 0; row < 2; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + first_row + row] = tot;
+        }
+    }
 }
@@ -1753,10 +1828,9 @@ kernel void kernel_mul_mat_q6_K_f32(
         constant   int64_t & ne00,
         constant   int64_t & ne10,
         constant   int64_t & ne0,
-        threadgroup float  * sum [[threadgroup(0)]],
         uint2 tgpig[[threadgroup_position_in_grid]],
-        uint2 tpitg[[thread_position_in_threadgroup]],
-        uint2  tptg[[threads_per_threadgroup]]) {
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {

     const uint8_t kmask1 = 0x03;
     const uint8_t kmask2 = 0x0C;
@@ -1768,19 +1842,18 @@ kernel void kernel_mul_mat_q6_K_f32(
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;

-    device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
-    device const float     * yy = (device const float      *) src1 + r1*ne10;
-
-    const int nth = tptg.x*tptg.y;
-    const int ith = tptg.y*tpitg.x + tpitg.y;
+    const int row = 2 * r0 + sgitg;
+
+    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
+    device const float     * yy = (device const float      *) src1 + r1*ne10;

     float sumf = 0;

 #if QK_K == 256
-    // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
-    const int iqs  = 16 * tpitg.y;
-    const int ip   = iqs / 128;         // 0 or 1
-    const int il   = (iqs - 128*ip)/16; // 0...7
+    const int tid  = tiisg/2;
+    const int ix   = tiisg%2;
+    const int ip   = tid/8;         // 0 or 1
+    const int il   = tid%8;
     const int n    = 4;
     const int l0   = n*il;
     const int is   = 8*ip + l0/16;
@@ -1789,9 +1862,10 @@ kernel void kernel_mul_mat_q6_K_f32(
     const int q_offset_l = 64*ip + l0;
     const int q_offset_h = 32*ip + l0;

-    for (int i = tpitg.x; i < nb; i += tptg.x) {
+    for (int i = ix; i < nb; i += 2) {

-        device const uint8_t * ql = x[i].ql + q_offset_l;
+        device const uint8_t * q1 = x[i].ql + q_offset_l;
+        device const uint8_t * q2 = q1 + 32;
         device const uint8_t * qh = x[i].qh + q_offset_h;
         device const int8_t  * sc = x[i].scales + is;
@@ -1801,19 +1875,21 @@ kernel void kernel_mul_mat_q6_K_f32(
         float4 sums = {0.f, 0.f, 0.f, 0.f};
         for (int l = 0; l < n; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+64] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
-            sums[3] += y[l+96] * ((int8_t)((ql[l+32] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
+            sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
         }

         sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
     }
 #else
-    const int il = 4*tpitg.x;     // 0, 4, 8, 12
+    const int ix = tiisg/4;
+    const int il = 4*(tiisg%4);

-    for (int i = tpitg.y; i < nb; i += tptg.y) {
+    for (int i = ix; i < nb; i += 8) {
         device const float * y = yy + i * QK_K + il;
         device const uint8_t * ql = x[i].ql + il;
         device const uint8_t * qh = x[i].qh + il;
@@ -1833,23 +1909,8 @@ kernel void kernel_mul_mat_q6_K_f32(
 #endif

-    sum[ith] = sumf;
-
-    //
-    // Accumulate the sum from all threads in the threadgroup
-    //
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%4 == 0) {
-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith%16 == 0) {
-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-        dst[r1*ne0 + r0] = sum[0];
-    }
+    const float tot = simd_sum(sumf);
+    if (tiisg == 0) {
+        dst[r1*ne0 + row] = tot;
+    }
 }
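The comment on block_q_n_dot_y above describes the inner product between one Q4_0 block and 32 floats, with the precomputed sum of those floats (sumy) used to pull the constant -8 offset out of the inner loop. A scalar C sketch of the same quantity, not taken from the commit, assuming the usual block_q4_0 layout (fp16 scale d plus 16 bytes of packed quants, low nibble = element i, high nibble = element i+16):

#include <stdint.h>

/* dot product of one q4_0 block (32 weights) with 32 floats; sumy = sum of y[0..31] */
static float q4_0_block_dot(float d, const uint8_t qs[16], const float y[32], float sumy) {
    float acc = 0.0f;
    for (int i = 0; i < 16; ++i) {
        acc += y[i]      * (float)(qs[i] & 0x0F);   /* low nibble  -> element i      */
        acc += y[i + 16] * (float)(qs[i] >> 4);     /* high nibble -> element i + 16 */
    }
    /* each stored nibble encodes (q - 8) * d, so the -8 bias is applied once via sumy */
    return d * (acc - 8.0f * sumy);
}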
llama/ggml.c

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -51,16 +51,23 @@
 #include <float.h>
 #include <limits.h>
 #include <stdarg.h>
+#include <signal.h>

 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif

+// static_assert should be a #define, but if it's not,
+// fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
+#endif

 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
@@ -75,23 +82,23 @@
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;

-static void atomic_store(atomic_int* ptr, LONG val) {
+static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
-static LONG atomic_load(atomic_int* ptr) {
+static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
-static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
+static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
+static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }

 typedef HANDLE pthread_t;

 typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
     (void) unused;
     HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
     if (handle == NULL)
@@ -103,7 +110,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void
     return 0;
 }

-static int pthread_join(pthread_t thread, void* unused) {
+static int pthread_join(pthread_t thread, void * unused) {
     (void) unused;
     return (int) WaitForSingleObject(thread, INFINITE);
 }
@@ -116,7 +123,7 @@ static int sched_yield (void) {
 #include <pthread.h>
 #include <stdatomic.h>

-typedef void* thread_ret_t;
+typedef void * thread_ret_t;

 #include <sys/types.h>
 #include <sys/stat.h>
@@ -137,10 +144,6 @@ typedef void* thread_ret_t;
 #endif
 #endif

-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -3812,6 +3815,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "POOL_1D",
+    "POOL_2D",

     "FLASH_ATTN",
     "FLASH_FF",
@@ -3830,7 +3835,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3890,6 +3895,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "pool_1d(x)",
+    "pool_2d(x)",

     "flash_attn(x)",
     "flash_ff(x)",
@@ -3908,7 +3915,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+
+static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4187,10 +4196,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return
-        (t0->ne[0] == t1->ne[0]) &&
-        (t0->ne[2] == t1->ne[2]) &&
-        (t0->ne[3] == t1->ne[3]);
+    return (t0->ne[0]           == t1->ne[0])  &&
+           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
+           (t1->ne[3]%t0->ne[3] == 0);
 }

 static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
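With the relaxed check, src0 can now be broadcast across the higher dimensions of src1 as long as they divide evenly. A hedged usage sketch against the public ggml API (tensor sizes are made up for illustration):

#include "ggml.h"

// one shared weight matrix multiplied against a batched input:
// a is [256, 64, 1, 1]; b is [256, 32, 8, 1]. Before this change a->ne[2] had to
// equal b->ne[2]; now it is enough that b->ne[2] is a multiple of a->ne[2].
static struct ggml_tensor * broadcast_mul_mat_example(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 64);    // weights
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 256, 32, 8); // batched activations
    return ggml_mul_mat(ctx, a, b);  // result is [64, 32, 8, 1], per the new ne[] above
}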
...
@@ -4430,8 +4438,8 @@ void ggml_free(struct ggml_context * ctx) {
...
@@ -4430,8 +4438,8 @@ void ggml_free(struct ggml_context * ctx) {
if (&g_state.contexts[i].context == ctx) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d
with %d objects
has been freed. memory used = %zu\n",
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i,
ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size
);
__func__, i,
ggml_used_mem(ctx)
);
if (ctx->mem_buffer_owned) {
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
GGML_ALIGNED_FREE(ctx->mem_buffer);
...
@@ -4749,7 +4757,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
...
@@ -4749,7 +4757,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
{
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
GGML_FP32_TO_FP16(
value)
)
;
}
}
} break;
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F32:
...
@@ -4801,7 +4809,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
...
@@ -4801,7 +4809,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
{
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
GGML_FP32_TO_FP16(
value)
)
;
}
}
} break;
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F32:
...
@@ -5061,11 +5069,15 @@ struct ggml_tensor * ggml_add_impl(
...
@@ -5061,11 +5069,15 @@ struct ggml_tensor * ggml_add_impl(
struct ggml_tensor * a,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * b,
bool inplace) {
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
bool is_node = false;
bool is_node = false;
if (a->grad || b->grad) {
if (!inplace && (a->grad || b->grad)) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true;
is_node = true;
}
}
...
@@ -6051,8 +6063,8 @@ struct ggml_tensor * ggml_mul_mat(
...
@@ -6051,8 +6063,8 @@ struct ggml_tensor * ggml_mul_mat(
is_node = true;
is_node = true;
}
}
const int64_t ne[4] = { a->ne[1], b->ne[1],
a
->ne[2], b->ne[3] };
const int64_t ne[4] = { a->ne[1], b->ne[1],
b
->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, M
IN
(a->n_dims, b->n_dims), ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, M
AX
(a->n_dims, b->n_dims), ne);
result->op = GGML_OP_MUL_MAT;
result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
...
@@ -6970,6 +6982,8 @@ struct ggml_tensor * ggml_rope_impl(
...
@@ -6970,6 +6982,8 @@ struct ggml_tensor * ggml_rope_impl(
int n_past,
int n_past,
int n_dims,
int n_dims,
int mode,
int mode,
float freq_base,
float freq_scale,
int n_ctx,
int n_ctx,
bool inplace) {
bool inplace) {
GGML_ASSERT(n_past >= 0);
GGML_ASSERT(n_past >= 0);
...
@@ -6983,12 +6997,14 @@ struct ggml_tensor * ggml_rope_impl(
...
@@ -6983,12 +6997,14 @@ struct ggml_tensor * ggml_rope_impl(
ggml_scratch_save(ctx);
ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
4
);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
6
);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;
((int32_t *) b->data)[3] = n_ctx;
memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
ggml_scratch_load(ctx);
ggml_scratch_load(ctx);
...
@@ -7007,7 +7023,7 @@ struct ggml_tensor * ggml_rope(
    int n_dims,
    int mode,
    int n_ctx) {
-   return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
}

struct ggml_tensor * ggml_rope_inplace(
...
@@ -7017,7 +7033,19 @@ struct ggml_tensor * ggml_rope_inplace(
    int n_dims,
    int mode,
    int n_ctx) {
-   return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
}

struct ggml_tensor * ggml_rope_custom_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        int                   n_dims,
        int                   mode,
        float                 freq_base,
        float                 freq_scale,
        int                   n_ctx) {
    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
}

// ggml_rope_back
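The new ggml_rope_custom_inplace entry point is what lets callers override the defaults (base 10000.0f, scale 1.0f) that ggml_rope and ggml_rope_inplace keep for compatibility. A usage sketch; the tensor `cur` and the parameter values are illustrative assumptions, not from this commit:

    // apply RoPE with a linear context-scaling factor of 0.5
    // `cur` is assumed to be an F32 tensor already in the graph
    struct ggml_tensor * rotated = ggml_rope_custom_inplace(
            ctx, cur,
            /*n_past    =*/ 0,
            /*n_dims    =*/ 128,
            /*mode      =*/ 0,
            /*freq_base =*/ 10000.0f,
            /*freq_scale=*/ 0.5f,
            /*n_ctx     =*/ 4096);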
...
@@ -7188,7 +7216,6 @@ struct ggml_tensor* ggml_conv_2d(
    int d0,
    int d1) {
-   GGML_ASSERT(b->ne[3] == 1);
    GGML_ASSERT(a->ne[2] == b->ne[2]);
    bool is_node = false;
...
@@ -7200,7 +7227,7 @@ struct ggml_tensor* ggml_conv_2d(
    const int64_t ne[4] = {
        ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
        ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
-       a->ne[3], 1,
        a->ne[3], b->ne[3],
    };
    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
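With b->ne[3] no longer forced to 1, ggml_conv_2d carries the batch dimension of the input through to the result, and the spatial extent per axis comes from ggml_calc_conv_output_size. A sketch under assumed shapes (the sizes and the expected output shape are illustrative, not asserted by the commit):

    // 3x3 kernel over a batched input, stride 1, padding 1, dilation 1
    // kernel: [kw, kh, C_in, C_out] in F16; input: [W, H, C_in, N] in F32, N > 1 now allowed
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, 64, 128);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 224, 224, 64, 8);
    struct ggml_tensor * conv   = ggml_conv_2d(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
    // with the usual convolution output arithmetic, conv->ne should come out as { 224, 224, 128, 8 }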
...
@@ -7235,6 +7262,98 @@ struct ggml_tensor* ggml_conv_1d_ph(
    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
}
// ggml_pool_*
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
return (ins + 2 * p - ks) / s + 1;
}
// ggml_pool_1d
struct ggml_tensor* ggml_pool_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int s0,
int p0) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
a->ne[1],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
ggml_scratch_save(ctx);
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
((int32_t*)c->data)[0] = op;
((int32_t*)c->data)[1] = k0;
((int32_t*)c->data)[2] = s0;
((int32_t*)c->data)[3] = p0;
ggml_scratch_load(ctx);
result->op = GGML_OP_POOL_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = c;
return result;
}
// ggml_pool_2d
struct ggml_tensor* ggml_pool_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
int p0,
int p1) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
a->ne[2],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
ggml_scratch_save(ctx);
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
((int32_t*)c->data)[0] = op;
((int32_t*)c->data)[1] = k0;
((int32_t*)c->data)[2] = k1;
((int32_t*)c->data)[3] = s0;
((int32_t*)c->data)[4] = s1;
((int32_t*)c->data)[5] = p0;
((int32_t*)c->data)[6] = p1;
ggml_scratch_load(ctx);
result->op = GGML_OP_POOL_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = c;
return result;
}
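The two constructors above only record the op and its integer parameters in a small I32 side tensor; the actual reduction happens in the forward pass further down, where stride must equal the kernel size and padding must be zero. A usage sketch with made-up sizes, following the (ins + 2*p - ks)/s + 1 output-size rule defined above:

    // 2-wide average pooling over a [W, C] F32 tensor, then 2x2 max pooling in 2-D
    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 16);
    struct ggml_tensor * p1 = ggml_pool_1d(ctx, x, GGML_OP_POOL_AVG, /*k0=*/2, /*s0=*/2, /*p0=*/0); // -> [64, 16]

    struct ggml_tensor * img = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 3);
    struct ggml_tensor * p2  = ggml_pool_2d(ctx, img, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);          // -> [32, 32, 3]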
// ggml_flash_attn

struct ggml_tensor * ggml_flash_attn(
...
@@ -8323,7 +8442,7 @@ static void ggml_compute_forward_add_f32(
    const struct ggml_tensor * src0,
    const struct ggml_tensor * src1,
    struct ggml_tensor * dst) {
-   GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
...
@@ -8348,23 +8467,23 @@ static void ggml_compute_forward_add_f32(
    if (nb10 == sizeof(float)) {
        for (int ir = ir0; ir < ir1; ++ir) {
-           // src0, src1 and dst are same shape => same indices
-           const int i3 = ir/(ne2*ne1);
-           const int i2 = (ir - i3*ne2*ne1)/ne1;
-           const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
            // src1 is broadcastable across src0 and dst in i1, i2, i3
            const int64_t i03 = ir/(ne02*ne01);
            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

            const int64_t i13 = i03 % ne13;
            const int64_t i12 = i02 % ne12;
            const int64_t i11 = i01 % ne11;

            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

#ifdef GGML_USE_ACCELERATE
-           vDSP_vadd(
-                   (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                   (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                   (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                   ne0);
            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
#else
-           ggml_vec_add_f32(ne0,
-                   (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                   (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                   (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
            ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
#endif
            // }
            // }
...
@@ -8372,15 +8491,20 @@ static void ggml_compute_forward_add_f32(
    } else {
        // src1 is not contiguous
        for (int ir = ir0; ir < ir1; ++ir) {
-           // src0, src1 and dst are same shape => same indices
-           const int i3 = ir/(ne2*ne1);
-           const int i2 = (ir - i3*ne2*ne1)/ne1;
-           const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
            // src1 is broadcastable across src0 and dst in i1, i2, i3
            const int64_t i03 = ir/(ne02*ne01);
            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

            const int64_t i13 = i03 % ne13;
            const int64_t i12 = i02 % ne12;
            const int64_t i11 = i01 % ne11;

-           float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-           float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);

            for (int i0 = 0; i0 < ne0; i0++) {
-               float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);

                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
            }
...
@@ -10559,7 +10683,6 @@ static void ggml_compute_forward_rms_norm_back(
        }
    }
}

// ggml_compute_forward_mul_mat

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
...
@@ -10603,17 +10726,19 @@ static void ggml_compute_forward_mul_mat(
    const int ith = params->ith;
    const int nth = params->nth;

-   GGML_ASSERT(ne02 == ne12);
-   GGML_ASSERT(ne03 == ne13);
-   GGML_ASSERT(ne2  == ne12);
-   GGML_ASSERT(ne3  == ne13);

    const enum ggml_type type = src0->type;

    const bool src1_cont = ggml_is_contiguous(src1);

    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
    GGML_ASSERT(ne2 == ne12);
    GGML_ASSERT(ne3 == ne13);

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
    GGML_ASSERT(nb10 == sizeof(float));
...
@@ -10624,16 +10749,16 @@ static void ggml_compute_forward_mul_mat(
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

-   GGML_ASSERT(ne0 == ne01);
-   GGML_ASSERT(ne1 == ne11);
-   GGML_ASSERT(ne2 == ne02);
-   GGML_ASSERT(ne3 == ne03);

    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows

#if defined(GGML_USE_CLBLAST)
    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
        //       ref: https://github.com/ggerganov/ggml/pull/224
        GGML_ASSERT(ne02 == ne12);
        GGML_ASSERT(ne03 == ne13);

        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
        }
...
@@ -10643,6 +10768,11 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
        //       ref: https://github.com/ggerganov/ggml/pull/224
        GGML_ASSERT(ne02 == ne12);
        GGML_ASSERT(ne03 == ne13);

        if (params->ith != 0) {
            return;
        }
...
@@ -10663,7 +10793,7 @@ static void ggml_compute_forward_mul_mat(
                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

                if (type != GGML_TYPE_F32) {
-                   float * const wdata = params->wdata;
                    float * const wdata    = params->wdata;
                    ggml_to_float_t const to_float = type_traits[type].to_float;

                    size_t id = 0;
...
@@ -10712,41 +10842,52 @@ static void ggml_compute_forward_mul_mat(
        return;
    }

-   // parallelize by src0 rows using ggml_vec_dot_q
-
-   // total rows in src0
-   const int nr = ne01*ne02*ne03;
-
-   // rows per thread
-   const int dr = (nr + nth - 1)/nth;
-
-   // row range for this thread
-   const int ir0 = dr*ith;
-   const int ir1 = MIN(ir0 + dr, nr);
-
-   void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-   const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
-   for (int ir = ir0; ir < ir1; ++ir) {
-       // src0 indices
-       const int i03 = ir/(ne02*ne01);
-       const int i02 = (ir - i03*ne02*ne01)/ne01;
-       const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-       const int i13 = i03;
-       const int i12 = i02;
-
-       const int i0 = i01;
-       const int i2 = i02;
-       const int i3 = i03;
-
-       void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-       char * src1_col =          ((char *)      wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size));
-
-       float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-
-       for (int64_t ic = 0; ic < ne11; ++ic) {
-           vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
-       }
-   }
    // parallelize by src0 rows
    const int64_t dr = (ne01 + nth - 1)/nth;

    const int64_t ir10 = dr*ith;
    const int64_t ir11 = MIN(ir10 + dr, ne01);

    // src1 rows
    const int64_t nr1 = ne11*ne12*ne13;

    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
        const int64_t i13 = (ir1/(ne12*ne11));
        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);

        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
        const int64_t i03 = (ir0/(ne02));
        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
        // GG: this is likely the correct way to broadcast, though need some more thought
        //     therefore leaving the comments to remind us for now
        const int64_t i02 = (i12 / (ne12 / ne02));
        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
        // const int64_t i02 = (ir0 - i03*ne02);

        const int64_t i1 = i11;
        const int64_t i2 = i12;
        const int64_t i3 = i13;

        const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);

        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
        //       the original src1 data pointer, so we should index using the indices directly
        // TODO: this is a bit of a hack, we should probably have a better way to handle this
        const char * src1_col = (const char *) wdata +
            (src1_cont || src1->type != vec_dot_type
             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
             : (i11*nb11 + i12*nb12 + i13*nb13));

        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));

        for (int64_t ir = ir10; ir < ir11; ++ir) {
            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
        }
    }
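The rewritten loop iterates over src1 rows and maps src0 indices with i02 = i12 / (ne12 / ne02), which is what lets a single 2-D weight (or a smaller set of KV heads, per the Falcon note above) be reused across a larger batch dimension of src1. A rough usage sketch, not taken from the commit; whether a given broadcast pattern is accepted also depends on the backend in use:

    // one [K, M] weight matrix multiplied against a [K, N, batch] activation
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);  // ne = {K, M}
    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4096, 32, 8); // ne = {K, N, batch}
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);                             // ne = {M, N, batch, 1} with the new shape rule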
...
@@ -11743,7 +11884,7 @@ static void ggml_compute_forward_alibi_f32(
    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
    const int ne1 = src0->ne[1]; // seq_len_without_past
-   //const int ne2 = src0->ne[2]; // n_head -> this is k
    const int ne2 = src0->ne[2]; // n_head -> this is k
    //const int ne3 = src0->ne[3]; // 1 -> bsz

    const int n = ggml_nrows(src0);
...
@@ -11754,8 +11895,9 @@ static void ggml_compute_forward_alibi_f32(
    const int nb2 = src0->nb[2];
    //const int nb3 = src0->nb[3];

-   assert(nb0 == sizeof(float));
-   assert(ne1 + n_past == ne0); (void) n_past;
    GGML_ASSERT(nb0 == sizeof(float));
    GGML_ASSERT(ne1 + n_past == ne0);
    GGML_ASSERT(n_head == ne2);

    // add alibi to src0 (KQ_scaled)
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
...
@@ -11779,7 +11921,7 @@ static void ggml_compute_forward_alibi_f32(
                m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
            }

-           pdst[0] = (i-ne0+1) * m_k + src[0];
            pdst[0] = i * m_k + src[0];
        }
    }
...
@@ -11808,7 +11950,7 @@ static void ggml_compute_forward_alibi_f16(
    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
    const int ne1 = src0->ne[1]; // seq_len_without_past
-   //const int ne2 = src0->ne[2]; // n_head -> this is k
    const int ne2 = src0->ne[2]; // n_head -> this is k
    //const int ne3 = src0->ne[3]; // 1 -> bsz

    const int n = ggml_nrows(src0);
...
@@ -11819,8 +11961,9 @@ static void ggml_compute_forward_alibi_f16(
    const int nb2 = src0->nb[2];
    //const int nb3 = src0->nb[3];

-   assert(nb0 == sizeof(ggml_fp16_t));
-   assert(ne1 + n_past == ne0); (void) n_past;
    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
    GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
    GGML_ASSERT(n_head == ne2);

    // add alibi to src0 (KQ_scaled)
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
...
@@ -11845,7 +11988,7 @@ static void ggml_compute_forward_alibi_f16(
            }

            // we return F32
-           pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
            pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
        }
    }
}
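The bias added to each KQ entry is now position * slope rather than being shifted to end at zero; the per-head slopes themselves are unchanged. As a standalone sketch of that slope computation, mirroring the loop above but with m0/m1 written out under the conventional max_bias parameter (the exact constants are an assumption, they are not shown in this hunk):

    #include <math.h>
    // ALiBi slope for head k of n_head
    static float alibi_slope(int k, int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
        const float m0 = powf(2.0f, -max_bias / n_heads_log2_floor);
        const float m1 = powf(2.0f, -max_bias / 2.0f / n_heads_log2_floor);
        return (k < n_heads_log2_floor)
            ? powf(m0, k + 1)                               // power-of-two head counts
            : powf(m1, 2 * (k - n_heads_log2_floor) + 1);   // odd-power fallback, as in the code above
    }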
...
@@ -11973,16 +12116,21 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-   GGML_ASSERT(ggml_nelements(src1) == 4);
    GGML_ASSERT(ggml_nelements(src1) == 6);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    float freq_base;
    float freq_scale;

    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

    assert(n_past >= 0);
...
@@ -12011,7 +12159,7 @@ static void ggml_compute_forward_rope_f32(
    // row index used to determine which thread to use
    int ir = 0;

-   const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
...
@@ -12023,7 +12171,7 @@ static void ggml_compute_forward_rope_f32(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;

-           float theta = (float)p;
            float theta = freq_scale * (float)p;

            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
...
@@ -12100,16 +12248,21 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-   GGML_ASSERT(ggml_nelements(src1) == 4);
    GGML_ASSERT(ggml_nelements(src1) == 6);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    float freq_base;
    float freq_scale;

    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

    assert(n_past >= 0);
...
@@ -12138,7 +12291,7 @@ static void ggml_compute_forward_rope_f16(
    // row index used to determine which thread to use
    int ir = 0;

-   const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
...
@@ -12150,7 +12303,7 @@ static void ggml_compute_forward_rope_f16(
            if (ir++ < ir0) continue;
            if (ir   > ir1) break;

-           float theta = (float)p;
            float theta = freq_scale * (float)p;

            if (is_glm) {
                theta = MIN(p, n_ctx - 2);
...
@@ -12211,7 +12364,7 @@ static void ggml_compute_forward_rope_f16(
                const float x0 = GGML_FP16_TO_FP32(src[0]);
                const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

-               dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
            }
        }
...
@@ -12893,12 +13046,13 @@ static void ggml_compute_forward_conv_1d(
        };
    }
}

-// ggml_compute_forward_conv_2d_sk_p0
// ggml_compute_forward_conv_2d

-static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
static void ggml_compute_forward_conv_2d_f16_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
...
@@ -12918,11 +13072,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
    // size of the convolution row - the kernel size unrolled across all channels
    const int ew0 = nk0*nk1*ne02;

    const int32_t s0 = ((const int32_t*)(opt0->data))[0];
    const int32_t s1 = ((const int32_t*)(opt0->data))[1];
    const int32_t p0 = ((const int32_t*)(opt0->data))[2];
    const int32_t p1 = ((const int32_t*)(opt0->data))[3];
    const int32_t d0 = ((const int32_t*)(opt0->data))[4];
    const int32_t d1 = ((const int32_t*)(opt0->data))[5];

    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
        // TODO: fix this memset (wsize is overestimated)
        memset(params->wdata, 0, params->wsize);

        // prepare source data (src1)
...
@@ -12937,8 +13097,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
                for (int i0 = 0; i0 < ne0; i0++) {
                    for (int ik1 = 0; ik1 < nk1; ik1++) {
                        for (int ik0 = 0; ik0 < nk0; ik0++) {
-                           dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
-                               GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
                            const int idx0 = i0*s0 + ik0*d0 - p0;
                            const int idx1 = i1*s1 + ik1*d1 - p1;

                            if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
                                dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
                                    GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
                            }
                        }
                    }
                }
...
@@ -12965,32 +13130,36 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
        ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;

-       for (int i2 = ip0; i2 < ip1; i2++) {
-           float * dst_data = (float *)((char *) dst->data + i2*nb2);
-
-           for (int i1 = 0; i1 < ne1; ++i1) {
-               for (int i0 = 0; i0 < ne0; ++i0) {
-                   ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
-                           (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
-                           (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
-               }
-           }
-       }
        for (int i3 = 0; i3 < ne3; i3++) {
            for (int i2 = ip0; i2 < ip1; i2++) {
                float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);

                for (int i1 = 0; i1 < ne1; ++i1) {
                    for (int i0 = 0; i0 < ne0; ++i0) {
                        ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
                                (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
                                (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
                    }
                }
            }
        }
    }
}

-static void ggml_compute_forward_conv_2d_sk_p0(
static void ggml_compute_forward_conv_2d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-       struct ggml_tensor * dst) {
        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-               ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
                ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
            } break;
        case GGML_TYPE_F32:
            {
-               //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
                //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
                GGML_ASSERT(false);
            } break;
        default:
...
@@ -13000,31 +13169,164 @@ static void ggml_compute_forward_conv_2d_sk_p0(
    }
}

-// ggml_compute_forward_conv_2d
-
-static void ggml_compute_forward_conv_2d(
-       const struct ggml_compute_params* params,
-       const struct ggml_tensor* src0,
-       const struct ggml_tensor* src1,
-       const struct ggml_tensor* opt0,
-       struct ggml_tensor* dst) {
-   const int32_t s0 = ((const int32_t*)(opt0->data))[0];
-   const int32_t s1 = ((const int32_t*)(opt0->data))[1];
-   const int32_t p0 = ((const int32_t*)(opt0->data))[2];
-   const int32_t p1 = ((const int32_t*)(opt0->data))[3];
-   const int32_t d0 = ((const int32_t*)(opt0->data))[4];
-   const int32_t d1 = ((const int32_t*)(opt0->data))[5];
-   GGML_ASSERT(d0 == 1); // dilation not supported
-   GGML_ASSERT(d1 == 1);
-   GGML_ASSERT(p0 == 0); // padding not supported
-   GGML_ASSERT(p1 == 0);
-
-   // only s = k supported
-   if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
-       ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
-   }
-   else {
-       GGML_ASSERT(false); // only stride equal to kernel size is supported
-   };
-}

// ggml_compute_forward_pool_1d_sk_p0

static void ggml_compute_forward_pool_1d_sk_p0(
const struct ggml_compute_params * params,
const enum ggml_op_pool op,
const struct ggml_tensor * src,
const int k,
struct ggml_tensor * dst) {
assert(src->type == GGML_TYPE_F32);
assert(params->ith == 0);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const char * cdata = (const char *)src->data;
const char * const data_end = cdata + ggml_nbytes(src);
float * drow = (float *)dst->data;
const int64_t rs = dst->ne[0];
while (cdata < data_end) {
const float * const srow = (const float *)cdata;
int j = 0;
for (int64_t i = 0; i < rs; ++i) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] = 0; break;
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
for (int ki = 0; ki < k; ++ki) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
++j;
}
switch (op) {
case GGML_OP_POOL_AVG: drow[i] /= k; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
cdata += src->nb[1];
drow += rs;
}
}
// ggml_compute_forward_pool_1d
static void ggml_compute_forward_pool_1d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * opt0,
        struct ggml_tensor * dst) {
    GGML_ASSERT(opt0->ne[0] == 4);
    const int* opts = (const int*)opt0->data;
    enum ggml_op_pool op = opts[0];
    const int k0 = opts[1];
    const int s0 = opts[2];
    const int p0 = opts[3];
    GGML_ASSERT(p0 == 0); // padding not supported
    GGML_ASSERT(k0 == s0); // only s = k supported

    ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
}

// ggml_compute_forward_pool_2d_sk_p0

static void ggml_compute_forward_pool_2d_sk_p0(
        const struct ggml_compute_params * params,
        const enum ggml_op_pool op,
        const struct ggml_tensor * src,
        const int k0,
        const int k1,
        struct ggml_tensor * dst) {
    assert(src->type == GGML_TYPE_F32);
    assert(params->ith == 0);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    const char * cdata = (const char*)src->data;
    const char * const data_end = cdata + ggml_nbytes(src);
const int64_t px = dst->ne[0];
const int64_t py = dst->ne[1];
const int64_t pa = px * py;
float * dplane = (float *)dst->data;
const int ka = k0 * k1;
while (cdata < data_end) {
for (int oy = 0; oy < py; ++oy) {
float * const drow = dplane + oy * px;
for (int ox = 0; ox < px; ++ox) {
float * const out = drow + ox;
switch (op) {
case GGML_OP_POOL_AVG: *out = 0; break;
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
const int ix = ox * k0;
const int iy = oy * k1;
for (int ky = 0; ky < k1; ++ky) {
const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
for (int kx = 0; kx < k0; ++kx) {
int j = ix + kx;
switch (op) {
case GGML_OP_POOL_AVG: *out += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
switch (op) {
case GGML_OP_POOL_AVG: *out /= ka; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
cdata += src->nb[2];
dplane += pa;
}
}
// ggml_compute_forward_pool_2d
static void ggml_compute_forward_pool_2d(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
GGML_ASSERT(opt0->ne[0] == 7);
const int* opts = (const int*)opt0->data;
enum ggml_op_pool op = opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
const int s1 = opts[4];
const int p0 = opts[5];
const int p1 = opts[6];
GGML_ASSERT(p0 == 0);
GGML_ASSERT(p1 == 0); // padding not supported
GGML_ASSERT(k0 == s0);
GGML_ASSERT(k1 == s1); // only s = k supported
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
}
}
...
@@ -14808,6 +15110,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            } break;
        case GGML_OP_POOL_1D:
            {
                ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_POOL_2D:
            {
                ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_FLASH_ATTN:
            {
                const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
...
@@ -15452,7 +15762,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                // necessary for llama
                if (src0->grad) {
                    assert(src1->type == GGML_TYPE_I32);
-                   assert(ggml_nelements(src1) == 4);
                    assert(ggml_nelements(src1) == 6);
                    const int n_past = ((int32_t *) src1->data)[0];
                    const int n_dims = ((int32_t *) src1->data)[1];
                    const int mode   = ((int32_t *) src1->data)[2];
...
@@ -15473,7 +15783,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                if (src0->grad) {
                    assert(src1->type == GGML_TYPE_I32);
-                   assert(ggml_nelements(src1) == 4);
                    assert(ggml_nelements(src1) == 3);
                    const int n_past = ((int32_t *) src1->data)[0];
                    const int n_dims = ((int32_t *) src1->data)[1];
                    const int mode   = ((int32_t *) src1->data)[2];
...
@@ -15508,6 +15818,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_POOL_1D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_POOL_2D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_FLASH_ATTN:
            {
                struct ggml_tensor * flash_grad = NULL;
...
@@ -15970,6 +16288,9 @@ struct ggml_compute_state_shared {
    // synchronization primitives
    atomic_int n_active; // num active threads
    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
};

struct ggml_compute_state {
...
@@ -16001,6 +16322,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    int node_n = -1;

    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
            state->shared->node_n += 1;
            return (thread_ret_t) GGML_EXIT_ABORTED;
        }
        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            // all other threads are finished and spinning
            // do finalize and init here so we don't have synchronize again
...
@@ -16018,8 +16343,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.nth = n_tasks_arr[node_n];
                    ggml_compute_forward(&params, node);
                    ggml_graph_compute_perf_stats_node(node, state->shared);
                }
-               ggml_graph_compute_perf_stats_node(node, state->shared);
            }

            // distribute new work or execute it direct if 1T
...
@@ -16049,11 +16374,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.type = GGML_TASK_FINALIZE;
                    ggml_compute_forward(&params, node);
                    ggml_graph_compute_perf_stats_node(node, state->shared);
                }
-               ggml_graph_compute_perf_stats_node(node, state->shared);
            } else {
                break;
            }

            if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
                break;
            }
        }

        atomic_store(&state->shared->n_active, n_threads);
...
@@ -16087,7 +16417,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        }
    }

-   return 0;
    return GGML_EXIT_SUCCESS;
}

struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
...
@@ -16287,8 +16617,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    {
                        n_tasks = n_threads;

-                       GGML_ASSERT(node->src[1]->ne[3] == 1);

                        const int64_t ne00 = node->src[0]->ne[0]; // W
                        const int64_t ne01 = node->src[0]->ne[1]; // H
                        const int64_t ne02 = node->src[0]->ne[2]; // C
...
@@ -16298,19 +16626,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                        const int64_t ne11 = node->src[1]->ne[1]; // H
                        const int64_t ne12 = node->src[1]->ne[2]; // C

                        const int64_t ne0 = node->ne[0];
                        const int64_t ne1 = node->ne[1];
                        const int64_t ne2 = node->ne[2];
                        const int64_t nk  = ne00*ne01;
                        const int64_t ew0 = nk * ne02;

-                       UNUSED(ne02);
                        UNUSED(ne03);
-                       UNUSED(nk);
                        UNUSED(ne2);

                        size_t cur = 0;

                        if (node->src[0]->type == GGML_TYPE_F16 &&
                            node->src[1]->type == GGML_TYPE_F32) {
-                           cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
                            cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
                        } else if (node->src[0]->type == GGML_TYPE_F32 &&
                                   node->src[1]->type == GGML_TYPE_F32) {
                            cur = sizeof(float)* (ne10*ne11*ne12);
                        } else {
                            GGML_ASSERT(false);
...
@@ -16318,6 +16649,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                        work_size = MAX(work_size, cur);
                    } break;
                case GGML_OP_POOL_1D:
                case GGML_OP_POOL_2D:
                    {
                        n_tasks = 1;
                    } break;
                case GGML_OP_FLASH_ATTN:
                    {
                        n_tasks = n_threads;
...
@@ -16427,7 +16763,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
    return cplan;
}

-void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    {
        GGML_ASSERT(cplan);
        GGML_ASSERT(cplan->n_threads > 0);
...
@@ -16453,6 +16789,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
        /*.n_threads           =*/ n_threads,
        /*.n_active            =*/ n_threads,
        /*.node_n              =*/ -1,
        /*.abort_callback      =*/ NULL,
        /*.abort_callback_data =*/ NULL,
    };

    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
...
@@ -16476,12 +16814,12 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
    const int64_t perf_start_time_us = ggml_perf_time_us();

    // this is a work thread too
-   ggml_graph_compute_thread(&workers[0]);
    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);

    // don't leave affinity set on the main thread
    clear_numa_thread_affinity();

-   // join thread pool
    // join or kill thread pool
    if (n_threads > 1) {
        for (int j = 1; j < n_threads; j++) {
            const int rc = ggml_thread_join(workers[j].thrd, NULL);
...
@@ -16505,6 +16843,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan)
            (double) perf_time_us_cur     / 1000.0,
            (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
    }

    return compute_status;
}

void ggml_graph_reset(struct ggml_cgraph * cgraph) {
...
@@ -16578,9 +16918,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
}

void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-   //assert(cgraph->work == NULL);
-   //assert(cgraph->work_size == 0);

    uint64_t size_eval = 0;

    // compute size of intermediate results
...
@@ -17019,9 +17356,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");

-   GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-   GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);

    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
...
llama/ggml.h View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -227,8 +227,13 @@
#define GGML_MAX_NAME          48
#define GGML_DEFAULT_N_THREADS 4

#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1

#define GGML_UNUSED(x) (void)(x)

#define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
...
@@ -389,6 +394,8 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
...
@@ -468,6 +475,10 @@ extern "C" {
        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];

        // abort ggml_graph_compute when true
        bool (*abort_callback)(void * data);
        void * abort_callback_data;
    };

    // computation graph
...
@@ -1136,6 +1147,17 @@ extern "C" {
            int                   mode,
            int                   n_ctx);

    // custom RoPE, in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
            int                   mode,
            float                 freq_base,
            float                 freq_scale,
            int                   n_ctx);

    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
...
@@ -1190,6 +1212,31 @@ extern "C" {
            int                   s,
            int                   d);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,
        GGML_OP_POOL_COUNT,
    };

    GGML_API struct ggml_tensor * ggml_pool_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
            int                   k0, // kernel size
            int                   s0, // stride
            int                   p0); // padding

    GGML_API struct ggml_tensor * ggml_pool_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
            int                   k0,
            int                   k1,
            int                   s0,
            int                   s1,
            int                   p0,
            int                   p1);

    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1329,7 +1376,7 @@ extern "C" {
...
@@ -1329,7 +1376,7 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API
struct
ggml_cplan
ggml_graph_plan
(
struct
ggml_cgraph
*
cgraph
,
int
n_threads
/*= GGML_DEFAULT_N_THREADS*/
);
GGML_API
struct
ggml_cplan
ggml_graph_plan
(
struct
ggml_cgraph
*
cgraph
,
int
n_threads
/*= GGML_DEFAULT_N_THREADS*/
);
GGML_API
void
ggml_graph_compute
(
struct
ggml_cgraph
*
cgraph
,
struct
ggml_cplan
*
cplan
);
GGML_API
int
ggml_graph_compute
(
struct
ggml_cgraph
*
cgraph
,
struct
ggml_cplan
*
cplan
);
GGML_API
void
ggml_graph_reset
(
struct
ggml_cgraph
*
cgraph
);
GGML_API
void
ggml_graph_reset
(
struct
ggml_cgraph
*
cgraph
);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// same as ggml_graph_compute() but the work data is allocated as a part of the context
...
...
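Taken together, the new cplan fields and the int return of ggml_graph_compute give callers a cooperative way to cancel a running graph. A sketch of how a caller might wire this up, assuming an already-built ggml_cgraph named graph; the callback and flag are illustrative, not part of the commit:

    // abort graph computation when an external flag is set (stdbool.h/stdlib.h assumed)
    static bool my_should_abort(void * data) {
        return *(bool *) data; // set from another thread or a signal handler
    }

    bool stop_requested = false;

    struct ggml_cplan cplan = ggml_graph_plan(&graph, /*n_threads=*/8);
    uint8_t * work = malloc(cplan.work_size);   // caller owns work_data when work_size > 0
    cplan.work_data           = work;
    cplan.abort_callback      = my_should_abort;
    cplan.abort_callback_data = &stop_requested;

    if (ggml_graph_compute(&graph, &cplan) == GGML_EXIT_ABORTED) {
        // computation stopped early; results are partial
    }
    free(work);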
llama/k_quants.c View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
llama/k_quants.h View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -41,6 +41,14 @@
#define K_SCALE_SIZE 12
#endif

#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

//
// Super-block quantization structures
//
...
llama/llama-util.h View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -201,13 +201,13 @@ struct llama_mmap {
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
-       int flags = MAP_PRIVATE;
        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa) { prefetch = 0; }
#ifdef __linux__
        if (prefetch) { flags |= MAP_POPULATE; }
#endif
-       addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }
...
@@ -249,7 +249,7 @@ struct llama_mmap {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

-       addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);

        error = GetLastError();
        CloseHandle(hMapping);
...
llama/llama.cpp View file @ a83eaa7a

/**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
 * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
//
// memory sizes
//

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
{
    static std::map<e_model, size_t> k_sizes = {
-       { MODEL_3B,    256ull * MB },
-       { MODEL_7B,    512ull * MB },
-       { MODEL_13B,   512ull * MB },
-       { MODEL_30B,   512ull * MB },
-       { MODEL_65B,  1024ull * MB },
        /* empirical scaling, still a guess */
        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
    };
    return k_sizes;
}
...
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
{
    static std::map<e_model, size_t> k_sizes = {
-       { MODEL_3B,   512ull * MB },
-       { MODEL_7B,   768ull * MB },
-       { MODEL_13B, 1024ull * MB },
-       { MODEL_30B, 1280ull * MB },
-       { MODEL_65B, 1536ull * MB },
        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
    };
    return k_sizes;
}
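The scratch and eval buffers now grow with the context length instead of being flat per model size. As a worked example of the formulas above (MB here stands for the same 1024*1024 constant llama.cpp defines):

    // 7B sizing at two context lengths, reproducing the tables above
    const size_t MB = 1024*1024;
    size_t scratch0_7b_2k = (2048 / 16  + 256) * MB; // 384 MB
    size_t eval_7b_2k     = (2048 / 256 + 768) * MB; // 776 MB
    size_t scratch0_7b_8k = (8192 / 16  + 256) * MB; // 768 MB
    size_t eval_7b_8k     = (8192 / 256 + 768) * MB; // 800 MB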
...
@@ -215,6 +216,10 @@ struct llama_hparams {
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;

    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;

    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

    bool operator!=(const llama_hparams & other) const {
...
@@ -329,7 +334,7 @@ struct llama_model {
};

struct llama_context {
-   llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
    ~llama_context() {
        if (ctx_metal) {
...
@@ -350,7 +355,6 @@ struct llama_context {
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

    const llama_model & model;
-   const llama_vocab & vocab;

    bool model_owner = false;
...
@@ -577,7 +581,9 @@ struct llama_file_loader {
        }

        // skip to the next multiple of 32 bytes
-       file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
        }

        tensor.file_off = file.tell();
        tensor.name = name;
...
@@ -674,7 +680,7 @@ struct llama_model_loader {
        *ctx_size_p = *mmapped_size_p = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-           *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
        }
    }
...
@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
        /*.rope_freq_base              =*/ 10000.0f,
        /*.rope_freq_scale             =*/ 1.0f,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
...
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
}

int llama_max_devices() {
    return LLAMA_MAX_DEVICES;
}

bool llama_mmap_supported() {
    return llama_mmap::SUPPORTED;
}
...
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
...
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
        }

        hparams.n_ctx = n_ctx;

        hparams.rope_freq_base  = rope_freq_base;
        hparams.rope_freq_scale = rope_freq_scale;
    }

    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

    {
        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
        fprintf(stderr, "%s: freq_scale = %g\n",  __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
    }

    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
...
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
         // this is the memory required by one llama_state
         const size_t mem_required_state =
...
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
...
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
...
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
...
@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
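Conceptually, the two extra arguments feed the rotary position embedding (RoPE) angles: in the custom variant the rotation angle for position p and dimension pair i is roughly theta(p, i) = freq_scale * p * freq_base^(-2i / n_rot), so freq_base = 10000 and freq_scale = 1 reproduce the behaviour of the plain ggml_rope_inplace call that was replaced. A small sketch of that angle formula under those assumptions (not the ggml kernel itself):

#include <cmath>
#include <cstdio>

// Sketch of the per-pair RoPE angle with the new parameters:
// theta(p, i) = freq_scale * p * freq_base^(-2i / n_rot)
static float rope_theta(int p, int i, int n_rot, float freq_base, float freq_scale) {
    return freq_scale * p * std::pow(freq_base, -2.0f * i / n_rot);
}

int main() {
    const int n_rot = 128; // n_embd / n_head for a typical LLaMA layer
    // Defaults keep the original RoPE; freq_scale < 1 stretches positions for longer contexts.
    std::printf("theta = %f\n", rope_theta(/*p=*/ 42, /*i=*/ 3, n_rot, 10000.0f, 1.0f));
    return 0;
}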
...
@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
     float cum_sum = 0.0f;
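The reworked normalization above also guards against a (near-)zero sum of second derivatives: with a flat candidate distribution the old code divided by roughly zero and produced non-finite weights, while the new branch falls back to uniform weights of 1/size. A small standalone sketch of just that step, mirroring the added code:

#include <cstdio>
#include <numeric>
#include <vector>

// Mirrors the guarded normalization added to llama_sample_tail_free: divide by the
// sum when it is meaningfully non-zero, otherwise fall back to a uniform distribution.
static void normalize_or_uniform(std::vector<float> & second_derivatives) {
    const float sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
    if (sum > 1e-6f) {
        for (float & value : second_derivatives) {
            value /= sum;
        }
    } else {
        for (float & value : second_derivatives) {
            value = 1.0f / second_derivatives.size();
        }
    }
}

int main() {
    std::vector<float> flat = {0.0f, 0.0f, 0.0f, 0.0f};
    normalize_or_uniform(flat);   // the old code would have divided by zero here
    std::printf("%f\n", flat[0]); // prints 0.250000
    return 0;
}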
...
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
           float scale,
           float smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
...
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
...
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
...
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
...
@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
-int llama_tokenize(
+int llama_tokenize_with_model(
-        struct llama_context * ctx,
+        const struct llama_model * model,
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
...
@@ -3581,8 +3615,29 @@ int llama_tokenize(
     return res.size();
 }
+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 int llama_n_ctx(const struct llama_context * ctx) {
...
@@ -3593,19 +3648,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
-int llama_get_vocab(
+int llama_get_vocab_from_model(
-        const struct llama_context * ctx,
+        const struct llama_model * model,
         const char * * strings,
         float * scores,
         int capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i < n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
+        scores[i]  = model->vocab.id_to_token[i].score;
     }
     return n;
 }
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
...
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 llama_token llama_token_bos() {
...

llama/llama.h
View file @ a83eaa7a

 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
 *
 * MIT License
 *
...
@@ -115,6 +115,11 @@ extern "C" {
         int32_t n_gpu_layers;                  // number of layers to store in VRAM
         int32_t main_gpu;                      // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
...
@@ -174,6 +179,8 @@ extern "C" {
         int32_t n_eval;
     };
+    LLAMA_API int llama_max_devices();
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
...
@@ -296,10 +303,21 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
+    LLAMA_API int llama_tokenize_with_model(
+            const struct llama_model * model,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
...
@@ -308,6 +326,12 @@ extern "C" {
             float * scores,
             int capacity);
+    LLAMA_API int llama_get_vocab_from_model(
+            const struct llama_model * model,
+            const char * * strings,
+            float * scores,
+            int capacity);
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
...
@@ -320,7 +344,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
     // Token Id -> String. Uses the vocabulary in the provided context
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token);
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
...
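To illustrate the model-level entry points declared above, a hedged usage sketch that tokenizes a prompt and prints token strings without creating a llama_context (the model path and prompt are placeholders; llama_free_model is assumed to be available from the same header):

#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        return 1;
    }

    // Tokenize against the model's vocabulary directly - no context required.
    std::vector<llama_token> tokens(llama_n_ctx_from_model(model));
    const int n = llama_tokenize_with_model(model, "Hello world", tokens.data(),
                                            (int) tokens.size(), /*add_bos=*/ true);
    for (int i = 0; i < n; ++i) {
        std::printf("%d -> %s\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
    }

    llama_free_model(model);
    return 0;
}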