OpenDAS / ollama / Commits / d7d7e996

Commit d7d7e996 (unverified), authored Feb 26, 2025 by Jeffrey Morgan, committed by GitHub on Feb 26, 2025

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
Showing 9 changed files with 1892 additions and 1449 deletions (+1892 -1449):

ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu                   +2     -2
ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt             +23    -2
ml/backend/ggml/ggml/src/ggml-impl.h                         +12    -17
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal   +83    -30
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m             +214   -28
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal         +83    -28
ml/backend/ggml/ggml/src/ggml.c                              +145   -1341
ml/backend/ggml/ggml/src/ggml_darwin_arm64.go                +1     -1
ml/backend/ggml/ggml/src/gguf.cpp                            +1329  -0
ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu

@@ -73,9 +73,9 @@ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const float * s_d  = (const float *)dst->src[5]->data;

     const int64_t B = dst->src[5]->ne[1];
-    const int64_t T = dst->src[0]->ne[3];
+    const int64_t T = dst->src[0]->ne[2];
     const int64_t C = dst->ne[0];
-    const int64_t H = dst->src[0]->ne[2];
+    const int64_t H = dst->src[0]->ne[1];

     float * dst_d = (float *)dst->data;
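The re-indexing above mirrors the new operand layout that the ggml.c portion of this commit asserts for the WKV6 op: k/v/r/td drop their leading singleton dimension and become [S, H, n_tokens]. Below is a minimal sketch, not taken from the diff, of building operands under that assumption; the tf and state shapes in particular are illustrative guesses.

// Sketch only: operand construction for ggml_rwkv_wkv6 under the new [S, H, n_tokens]
// layout checked by the updated asserts in ggml.c. The tf/state shapes are assumptions.
#include "ggml.h"

static struct ggml_tensor * build_wkv6(struct ggml_context * ctx,
                                       int64_t S, int64_t H, int64_t n_tokens, int64_t n_seqs) {
    struct ggml_tensor * k  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * v  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * r  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * tf = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S, H);              // assumed shape
    struct ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * s  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S * S * H, n_seqs); // n_seqs = state->ne[1]
    // the CUDA kernel above now reads T from src[0]->ne[2] and H from src[0]->ne[1]
    return ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
}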
ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt

@@ -40,13 +40,20 @@ find_package(hip REQUIRED)
 find_package(hipblas REQUIRED)
 find_package(rocblas REQUIRED)

 if (${hip_VERSION} VERSION_LESS 5.5)
     message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
 endif()

 message(STATUS "HIP and hipBLAS found")

 # Workaround old compilers
 set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")

 file(GLOB   GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
 list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")

 file(GLOB   GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
-file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})

@@ -70,7 +77,9 @@ ggml_add_backend_library(ggml-hip
                         )

 # TODO: do not use CUDA definitions for HIP
-target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+if (NOT GGML_BACKEND_DL)
+    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+endif()

 add_compile_definitions(GGML_USE_HIP)

@@ -90,6 +99,18 @@ if (GGML_CUDA_NO_PEER_COPY)
     add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()

+if (GGML_HIP_GRAPHS)
+    add_compile_definitions(GGML_HIP_GRAPHS)
+endif()
+
+if (GGML_HIP_NO_VMM)
+    add_compile_definitions(GGML_HIP_NO_VMM)
+endif()
+
+if (NOT GGML_CUDA_FA)
+    add_compile_definitions(GGML_CUDA_NO_FA)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)
ml/backend/ggml/ggml/src/ggml-impl.h

@@ -3,6 +3,8 @@
 // GGML internal header

 #include "ggml.h"
+#include "gguf.h"
+
 #include <assert.h>
 #include <math.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/

@@ -14,7 +16,7 @@
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE

-#if defined(__ARM_NEON) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/

@@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

-// expose GGUF internals for test code
-GGML_API size_t gguf_type_size(enum gguf_type type);
-GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-GGML_API struct gguf_buf gguf_buf_init(size_t size);
-GGML_API void gguf_buf_free(struct gguf_buf buf);
-GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
-
 #ifdef __cplusplus
 }
 #endif
+
+#ifdef __cplusplus
+#include <vector>
+
+// expose GGUF internals for test code
+GGML_API size_t gguf_type_size(enum gguf_type type);
+GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+#endif // __cplusplus
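The header change above drops the C-visible struct gguf_buf writer and exposes a std::vector-based one to C++ test code only. A hedged sketch of how a test might use the new declaration (the helper name and flow below are assumptions; gguf_init_params and gguf_free come from the public GGUF API):

// Sketch for test code only, assuming the declarations exposed above.
#include <cstdio>
#include <vector>
#include "ggml.h"
#include "gguf.h"
#include "ggml-impl.h"

static bool dump_gguf_meta(const char * path, std::vector<int8_t> & out) {
    FILE * f = std::fopen(path, "rb");
    if (!f) return false;
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * gctx = gguf_init_from_file_impl(f, params);
    std::fclose(f);
    if (!gctx) return false;
    gguf_write_to_buf(gctx, out, /*only_meta =*/ true); // previously took a struct gguf_buf *
    gguf_free(gctx);
    return true;
}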
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal

@@ -477,7 +477,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()

-//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,

@@ -512,7 +511,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
     0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
 GGML_TABLE_END()
-//#endif

 GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)

@@ -2513,24 +2511,33 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
 template <typename type4x4>
 void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
     const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const uint16_t * ql = (device const uint16_t *)xb->ql;
+    device const uint16_t * qh = (device const uint16_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;

-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
+    ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1);
+    qh = qh + 16*(il/8) + 8*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;

-    const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t kmask2 = il>1 ? 0xF0              : 0x0F;
-    const float    coef   = il>1 ? 1.f/16.f          : 1.f;
+    const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303);
+    const uint32_t kmask2 = il>1 ? 0xF0F0F0F0                       : 0x0F0F0F0F;
     const float    ml = d_all * sc * 32.f;
-    const float    dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
+    const float    dl0 = d_all * sc;
+    const float    dl1 = dl0 / 256.f;
+    const float    dl2 = dl0 / (256.f * 256.f);
+    const float    dl3 = dl0 / (256.f * 256.f * 256.f);
+    const uint8_t  shr_h = il>2 ? 2 : 0;
+    const uint8_t  shl_h = il>1 ? 0 : (il>0 ? 2 : 4);
+    const uint8_t  shr_l = il>1 ? 4 : 0;
+    for (int i = 0; i < 4; ++i) {
+        const uint32_t low  = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2;
+        const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1;
+        const uint32_t q    = ((high << shl_h) >> shr_h) | (low >> shr_l);
+        reg[i][0] = dl0 * ((half)(q & 0xFF))        - ml;
+        reg[i][1] = dl1 * ((float)(q & 0xFF00))     - ml;
+        reg[i][2] = dl2 * ((float)(q & 0xFF0000))   - ml;
+        reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml;
     }
 }

@@ -3198,7 +3205,7 @@ kernel void kernel_soft_max(
     }

     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);

     float sum = simd_sum(lsum);

@@ -3303,7 +3310,7 @@ kernel void kernel_soft_max_4(
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];

     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);

     float sum = simd_sum(lsum);

@@ -6517,6 +6524,49 @@ kernel void kernel_cpy_f32_iq4_nl(
     }
 }

+template<typename T4x4, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
+kernel void kernel_cpy_q_f32(
+        constant ggml_metal_kargs_cpy & args,
+        device  const char * src0,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+
+    device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
+    device       T4x4    * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < args.ne00/16; i00 += ntg.x) {
+        T4x4 temp;
+        dequantize_func(src_data + i00/nl, i00%nl, temp);
+        dst_data[i00] = temp;
+    }
+}
+
+typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
+
+template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
+
+template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
+
 kernel void kernel_concat(
     constant ggml_metal_kargs_concat & args,
     device  const char * src0,

@@ -6601,7 +6651,6 @@ void kernel_mul_mv_q2_K_f32_impl(
         device const half * dh = &x[ib].d;

         for (int row = 0; row < N_DST; row++) {
             float4 acc1 = {0.f, 0.f, 0.f, 0.f};
             float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-
             for (int i = 0; i < 8; i += 2) {

@@ -6632,7 +6681,7 @@ void kernel_mul_mv_q2_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -6798,7 +6847,7 @@ void kernel_mul_mv_q3_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

     if (tiisg == 0) {
-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
             dst_f32[first_row + row] = sumf1[row];
         }
     }

@@ -6914,7 +6963,7 @@ void kernel_mul_mv_q4_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -7046,7 +7095,7 @@ void kernel_mul_mv_q5_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = tot;

@@ -7091,6 +7140,10 @@ void kernel_mul_mv_q6_K_f32_impl(
     const int row = 2*r0 + sgitg;

+    if (row >= args.ne0) {
+        return;
+    }
+
     const uint i12 = im%args.ne12;
     const uint i13 = im/args.ne12;

@@ -7246,7 +7299,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;

@@ -7364,7 +7417,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;

@@ -7474,7 +7527,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.5f;

@@ -7586,7 +7639,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -7699,7 +7752,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;

@@ -7799,7 +7852,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -7894,7 +7947,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -7984,7 +8037,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -8073,7 +8126,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
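The new kernel_cpy_q_f32 template and its instantiations give the Metal backend dequantizing copy kernels, so a GGML_OP_CPY node whose source is Q4_0/Q4_1/Q5_0/Q5_1/Q8_0 and whose destination is F32 or F16 can now run on the GPU. A hedged sketch from the graph-building side (the helper name is hypothetical):

// Sketch only: a copy node that dequantizes a Q4_0 tensor to F32; on the Metal backend
// this would be dispatched to the new kernel_cpy_q4_0_f32 added above.
#include "ggml.h"

static struct ggml_tensor * dequantize_copy(struct ggml_context * ctx, struct ggml_tensor * src_q4_0) {
    struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,
                                                  src_q4_0->ne[0], src_q4_0->ne[1]);
    return ggml_cpy(ctx, src_q4_0, dst);
}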
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m

@@ -19,7 +19,17 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8

 #define UNUSED(x) (void)(x)

+#ifndef TARGET_OS_VISION
+#define TARGET_OS_VISION 0
+#endif
+
+// create residency sets only on macOS >= 15.0
+#if !TARGET_CPU_X86_64 && TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
+    TARGET_OS_IOS    && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_TV     && __TV_OS_VERSION_MAX_ALLOWED     >= 180000 || \
+    TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
+
 // globals

@@ -39,6 +49,7 @@ static struct ggml_backend_metal_device_context {
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
+    bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;

@@ -48,6 +59,7 @@ static struct ggml_backend_metal_device_context {
     /*.mtl_device_ref_count    =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm        =*/ false,
+    /*.has_residency_sets      =*/ false,
     /*.has_bfloat              =*/ false,
     /*.use_bfloat              =*/ false,
     /*.name                    =*/ "",

@@ -59,12 +71,18 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
     }

     if (ctx->mtl_device) {
         ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
         ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];

         ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];

+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
         ctx->has_bfloat  = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];

@@ -90,8 +108,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
         ctx->mtl_device_ref_count--;

         if (ctx->mtl_device_ref_count == 0) {
-            [ctx->mtl_device release];
-            ctx->mtl_device = nil;
+            if (ctx->mtl_device) {
+                [ctx->mtl_device release];
+                ctx->mtl_device = nil;
+            }
         }
     }
 }

@@ -388,6 +408,16 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
     GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16,
     GGML_METAL_KERNEL_TYPE_CONCAT,
     GGML_METAL_KERNEL_TYPE_SQR,
     GGML_METAL_KERNEL_TYPE_SQRT,

@@ -484,6 +514,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

     ctx->queue = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

     id<MTLLibrary> metal_library;

@@ -650,6 +685,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: simdgroup reduction   = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm        ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets    = %s\n", __func__, ctx_dev->has_residency_sets      ? "true" : "false");
     GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat              ? "true" : "false");
     GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat              ? "true" : "false");
     GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");

@@ -988,6 +1024,16 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,   cpy_f32_q5_0,   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,   cpy_f32_q5_1,   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32,   cpy_q4_0_f32,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16,   cpy_q4_0_f16,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32,   cpy_q4_1_f32,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16,   cpy_q4_1_f16,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32,   cpy_q5_0_f32,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16,   cpy_q5_0_f16,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32,   cpy_q5_1_f32,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16,   cpy_q5_1_f16,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32,   cpy_q8_0_f32,   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16,   cpy_q8_0_f16,   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,         concat,         true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,            sqr,            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,           sqrt,           true);
@@ -1037,8 +1083,70 @@ struct ggml_backend_metal_buffer_context {
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
 };

+// rset init
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer

@@ -1122,12 +1230,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_GROUP_NORM:
-            return has_simdgroup_reduction;
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
         case GGML_OP_RMS_NORM:
-            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
         case GGML_OP_ARGMAX:
-        case GGML_OP_NORM:
             return true;
+        case GGML_OP_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
         case GGML_OP_ROPE:
             {
                 const int mode = ((const int32_t *) op->op_params)[2];

@@ -1201,6 +1310,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                     default:
                         return false;
                 }
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
+            case GGML_TYPE_Q8_0:
+                switch (op->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                        return true;
+                    default:
+                        return false;
+                }
             default:
                 return false;
         };

@@ -1897,7 +2018,7 @@ static void ggml_metal_encode_node(
                 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

// TODO: add ggml_metal_kargs struct
-// TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
+// TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 if (id_src1) {
@@ -3843,10 +3964,6 @@ static void ggml_metal_encode_node(
         case GGML_OP_CPY:
         case GGML_OP_CONT:
             {
-                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-
-                int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
                 id<MTLComputePipelineState> pipeline = nil;

                 switch (src0t) {

@@ -3880,7 +3997,47 @@ static void ggml_metal_encode_node(
                             switch (dstt) {
                                 case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_F32].pipeline;  break;
                                 case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16].pipeline; break;
-                                default: GGML_ASSERT(false && "not implemented");
+                                default: GGML_ABORT("not implemented");
                            };
                        } break;
+                    case GGML_TYPE_Q4_0:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_Q4_1:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_Q5_0:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_Q5_1:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
+                    case GGML_TYPE_Q8_0:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
                     default: GGML_ABORT("not implemented");

@@ -3910,7 +4067,11 @@ static void ggml_metal_encode_node(
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

+                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+                int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case GGML_OP_SET:
             {

@@ -4209,6 +4370,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
+
+    ggml_backend_metal_buffer_rset_free(ctx);
     ggml_backend_metal_device_rel(buffer->buft->device->context);

     if (ctx->owned) {

@@ -4232,19 +4395,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     memset((char *)tensor->data + offset, value, size);

-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }

 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);

-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }

 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);

-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }

 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {

@@ -4254,7 +4417,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     }
     return false;

-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }

 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {

@@ -4280,7 +4443,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";

-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }

 static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {

@@ -4304,8 +4467,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
     }
 #endif
 #endif
-    UNUSED(device);
-    UNUSED(size_aligned);
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
 }

 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {

@@ -4318,7 +4481,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }

-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);

     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;

@@ -4341,7 +4505,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -4352,7 +4523,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }

 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {

@@ -4362,13 +4533,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
     return max_size;

-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }

 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;

-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }

 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {

@@ -4391,7 +4562,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal_Mapped";

-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }

 static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {

@@ -4434,7 +4605,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }

-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);

     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {

@@ -4487,6 +4659,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }

+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }

@@ -4495,7 +4674,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";

-    UNUSED(backend);
+    GGML_UNUSED(backend);
 }

 static void ggml_backend_metal_free(ggml_backend_t backend) {

@@ -4800,6 +4979,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }

+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }

@@ -4813,7 +4999,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
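Residency sets are only created when the SDK supports them and the GGML_METAL_NO_RESIDENCY environment variable is unset, as read in ggml_backend_metal_device_acq above. A hedged sketch of opting out at runtime, assuming the variable is set before the Metal device is first acquired (the helper name is hypothetical; ggml_backend_metal_init is assumed to be declared in ggml-metal.h):

// Sketch only: disable the new residency-set path before initializing the Metal backend.
#include <cstdlib>
#include "ggml-metal.h"

static ggml_backend_t init_metal_without_residency_sets(void) {
    setenv("GGML_METAL_NO_RESIDENCY", "1", /*overwrite =*/ 1);
    return ggml_backend_metal_init();
}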
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal

@@ -373,24 +373,33 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
 template <typename type4x4>
 void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
     const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const uint16_t * ql = (device const uint16_t *)xb->ql;
+    device const uint16_t * qh = (device const uint16_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;

-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
+    ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1);
+    qh = qh + 16*(il/8) + 8*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;

-    const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t kmask2 = il>1 ? 0xF0              : 0x0F;
-    const float    coef   = il>1 ? 1.f/16.f          : 1.f;
+    const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303);
+    const uint32_t kmask2 = il>1 ? 0xF0F0F0F0                       : 0x0F0F0F0F;
     const float    ml = d_all * sc * 32.f;
-    const float    dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
+    const float    dl0 = d_all * sc;
+    const float    dl1 = dl0 / 256.f;
+    const float    dl2 = dl0 / (256.f * 256.f);
+    const float    dl3 = dl0 / (256.f * 256.f * 256.f);
+    const uint8_t  shr_h = il>2 ? 2 : 0;
+    const uint8_t  shl_h = il>1 ? 0 : (il>0 ? 2 : 4);
+    const uint8_t  shr_l = il>1 ? 4 : 0;
+    for (int i = 0; i < 4; ++i) {
+        const uint32_t low  = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2;
+        const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1;
+        const uint32_t q    = ((high << shl_h) >> shr_h) | (low >> shr_l);
+        reg[i][0] = dl0 * ((half)(q & 0xFF))        - ml;
+        reg[i][1] = dl1 * ((float)(q & 0xFF00))     - ml;
+        reg[i][2] = dl2 * ((float)(q & 0xFF0000))   - ml;
+        reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml;
     }
 }

@@ -1058,7 +1067,7 @@ kernel void kernel_soft_max(
     }

     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);

     float sum = simd_sum(lsum);

@@ -1163,7 +1172,7 @@ kernel void kernel_soft_max_4(
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];

     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);

     float sum = simd_sum(lsum);

@@ -4377,6 +4386,49 @@ kernel void kernel_cpy_f32_iq4_nl(
     }
 }

+template<typename T4x4, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
+kernel void kernel_cpy_q_f32(
+        constant ggml_metal_kargs_cpy & args,
+        device  const char * src0,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+
+    device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
+    device       T4x4    * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < args.ne00/16; i00 += ntg.x) {
+        T4x4 temp;
+        dequantize_func(src_data + i00/nl, i00%nl, temp);
+        dst_data[i00] = temp;
+    }
+}
+
+typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
+
+template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
+
+template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
+
 kernel void kernel_concat(
     constant ggml_metal_kargs_concat & args,
     device  const char * src0,

@@ -4461,7 +4513,6 @@ void kernel_mul_mv_q2_K_f32_impl(
         device const half * dh = &x[ib].d;

         for (int row = 0; row < N_DST; row++) {
             float4 acc1 = {0.f, 0.f, 0.f, 0.f};
             float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-
             for (int i = 0; i < 8; i += 2) {

@@ -4492,7 +4543,7 @@ void kernel_mul_mv_q2_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -4658,7 +4709,7 @@ void kernel_mul_mv_q3_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

     if (tiisg == 0) {
-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
             dst_f32[first_row + row] = sumf1[row];
         }
     }

@@ -4774,7 +4825,7 @@ void kernel_mul_mv_q4_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -4906,7 +4957,7 @@ void kernel_mul_mv_q5_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = tot;

@@ -4951,6 +5002,10 @@ void kernel_mul_mv_q6_K_f32_impl(
     const int row = 2*r0 + sgitg;

+    if (row >= args.ne0) {
+        return;
+    }
+
     const uint i12 = im%args.ne12;
     const uint i13 = im/args.ne12;

@@ -5106,7 +5161,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;

@@ -5224,7 +5279,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;

@@ -5334,7 +5389,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.5f;

@@ -5446,7 +5501,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -5559,7 +5614,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;

@@ -5659,7 +5714,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -5754,7 +5809,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -5844,7 +5899,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;

@@ -5933,7 +5988,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
ml/backend/ggml/ggml/src/ggml.c

@@ -128,6 +132,10 @@ static void ggml_print_backtrace_symbols(void) {
 #endif

 static void ggml_print_backtrace(void) {
+    const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
+    if (GGML_NO_BACKTRACE) {
+        return;
+    }
     char attach[32];
     snprintf(attach, sizeof(attach), "attach %d", getpid());
     int pid = fork();

@@ -236,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi

 void * ggml_aligned_malloc(size_t size) {
+#if defined(__s390x__)
+    const int alignment = 256;
+#else
     const int alignment = 64;
+#endif
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
     return _aligned_malloc(size, alignment);

@@ -969,6 +977,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GET_REL_POS",
     "ADD_REL_POS",
     "RWKV_WKV6",
+    "GATED_LINEAR_ATTN",

     "UNARY",

@@ -988,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1066,6 +1075,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "get_rel_pos(x)",
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
+    "gated_linear_attn(k, v, q, gate, s)",

     "unary(x)",

@@ -1085,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };

-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1375,7 +1385,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
         (t0->nb[3] == t1->nb[3]);
 }

-// check if t1 can be represented as a repeatition of t0
+// check if t1 can be represented as a repetition of t0
 bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

@@ -1590,15 +1600,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);

-#ifdef __clang__
-    // temporary until ggml_tensor::backend is removed
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
-        /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
         /*.buffer       =*/ NULL,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },

@@ -1614,10 +1617,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.padding      =*/ { 0 },
     };

-#ifdef __clang__
-    #pragma clang diagnostic pop
-#endif
-
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
     //GGML_ASSERT_ALIGNED(result->data);

@@ -3461,12 +3460,14 @@ struct ggml_tensor * ggml_soft_max_ext(
     return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }

-// ggml_soft_max_back
+// ggml_soft_max_ext_back

-static struct ggml_tensor * ggml_soft_max_back_impl(
+static struct ggml_tensor * ggml_soft_max_ext_back_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias,
         bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

@@ -3474,21 +3475,28 @@ static struct ggml_tensor * ggml_soft_max_back_impl(
     result->src[0] = a;
     result->src[1] = b;

+    memcpy((float *) result->op_params + 0, &scale,    sizeof(float));
+    memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
+
     return result;
 }

-struct ggml_tensor * ggml_soft_max_back(
+struct ggml_tensor * ggml_soft_max_ext_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, false);
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
 }

-struct ggml_tensor * ggml_soft_max_back_inplace(
+struct ggml_tensor * ggml_soft_max_ext_back_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, true);
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
 }

 // ggml_rope

@@ -3706,7 +3714,7 @@ void ggml_rope_yarn_corr_dims(

 // ggml_rope_back

-struct ggml_tensor * ggml_rope_back(
+struct ggml_tensor * ggml_rope_ext_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,

@@ -3720,29 +3728,32 @@ struct ggml_tensor * ggml_rope_back(
         float                 attn_factor,
         float                 beta_fast,
         float                 beta_slow) {
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
-    memcpy(params +  5, &freq_base,   sizeof(float));
-    memcpy(params +  6, &freq_scale,  sizeof(float));
-    memcpy(params +  7, &ext_factor,  sizeof(float));
-    memcpy(params +  8, &attn_factor, sizeof(float));
-    memcpy(params +  9, &beta_fast,   sizeof(float));
-    memcpy(params + 10, &beta_slow,   sizeof(float));
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op     = GGML_OP_ROPE_BACK;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
+    struct ggml_tensor * result = ggml_rope_ext(
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+    result->op = GGML_OP_ROPE_BACK;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_rope_multi_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    struct ggml_tensor * result = ggml_rope_multi(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+
+    result->op = GGML_OP_ROPE_BACK;

     return result;
 }

 // ggml_clamp

 struct ggml_tensor * ggml_clamp(
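Alongside the rope_back rewrite, ggml_soft_max_back is renamed to ggml_soft_max_ext_back and now takes the scale and max_bias that the forward ggml_soft_max_ext stored in op_params. A hedged sketch of the renamed call, as the backward pass further below uses it (the helper name is hypothetical):

// Sketch only: softmax backward with explicit scale/max_bias, matching the new signature.
#include "ggml.h"

static struct ggml_tensor * softmax_grad(struct ggml_context * ctx,
                                         struct ggml_tensor * grad_out,
                                         struct ggml_tensor * softmax_out,
                                         float scale, float max_bias) {
    // equivalent to the old ggml_soft_max_back(ctx, grad_out, softmax_out)
    // when scale == 1.0f and max_bias == 0.0f
    return ggml_soft_max_ext_back(ctx, grad_out, softmax_out, scale, max_bias);
}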
@@ -4661,15 +4672,13 @@ struct ggml_tensor * ggml_rwkv_wkv6(
     GGML_ASSERT(ggml_is_contiguous(state));

     const int64_t S = k->ne[0];
-    const int64_t H = k->ne[2];
-    const int64_t n_tokens = k->ne[3];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
     const int64_t n_seqs = state->ne[1];
     {
-        GGML_ASSERT(k->ne[1] == 1);
-        GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens);
-        GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens);
         // TODO: RWKV v4 and v5
-        GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens);
+        GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens);
         GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
     }

@@ -4688,6 +4697,49 @@ struct ggml_tensor * ggml_rwkv_wkv6(
     return result;
 }

+// ggml_gated_linear_attn
+
+struct ggml_tensor * ggml_gated_linear_attn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * state,
+        float scale) {
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens);
+        GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens);
+        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_f32(result, 0, scale);
+
+    result->op     = GGML_OP_GATED_LINEAR_ATTN;
+    result->src[0] = k;
+    result->src[1] = v;
+    result->src[2] = q;
+    result->src[3] = g;
+    result->src[4] = state;
+
+    return result;
+}
+
 // ggml_unary

 static struct ggml_tensor * ggml_unary_impl(
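The new GGML_OP_GATED_LINEAR_ATTN op mirrors the reworked WKV6 builder: k/v/q/g are [S, H, n_tokens], the recurrent state holds S*S*H*n_seqs elements, and the result concatenates the per-token output with the updated state. A hedged usage sketch (the helper name and the 2-D state shape are assumptions consistent with n_seqs = state->ne[1]):

// Sketch only: building a gated linear attention node with the operand shapes asserted above.
#include "ggml.h"

static struct ggml_tensor * build_gla(struct ggml_context * ctx,
                                      int64_t S, int64_t H, int64_t n_tokens, int64_t n_seqs,
                                      float scale) {
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * g = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
    struct ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S * S * H, n_seqs); // assumed state layout
    // result is [S*H, n_tokens + S*n_seqs]: output rows followed by the new state
    return ggml_gated_linear_attn(ctx, k, v, q, g, s, scale);
}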
@@ -5062,10 +5114,10 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
struct
ggml_tensor
*
a
,
struct
ggml_tensor
*
b
,
struct
ggml_tensor
*
c
)
{
GGML_ASSERT
(
ggml_
are_same_shape
(
a
,
b
));
GGML_ASSERT
(
ggml_
is_scalar
(
c
));
GGML_ASSERT
(
ggml_
is_scalar
(
a
));
GGML_ASSERT
(
ggml_
are_same_shape
(
b
,
c
));
struct
ggml_tensor
*
result
=
ggml_dup_tensor
(
ctx
,
a
);
struct
ggml_tensor
*
result
=
ggml_dup_tensor
(
ctx
,
b
);
result
->
op
=
GGML_OP_CROSS_ENTROPY_LOSS_BACK
;
result
->
src
[
0
]
=
a
;
...
...
@@ -5244,7 +5296,7 @@ static void ggml_sub_or_set(
}
static
void
ggml_compute_backward
(
struct
ggml_context
*
ctx
,
struct
ggml_cgraph
*
cgraph
,
int
i
,
bool
*
grads_needed
)
{
struct
ggml_context
*
ctx
,
struct
ggml_cgraph
*
cgraph
,
int
i
,
const
bool
*
grads_needed
)
{
struct
ggml_tensor
*
tensor
=
cgraph
->
nodes
[
i
];
struct
ggml_tensor
*
grad
=
ggml_graph_get_grad
(
cgraph
,
tensor
);
...
...
@@ -5316,7 +5368,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_MUL: {
             if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
             }
             if (src1_needs_grads) {
                 struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
...
...
@@ -5388,7 +5440,7 @@ static void ggml_compute_backward(
             if (src0_needs_grads) {
                 float eps;
                 memcpy(&eps, tensor->op_params, sizeof(float));
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, grad, src0, eps));
             }
         } break;
         case GGML_OP_MUL_MAT: {
...
...
@@ -5408,21 +5460,25 @@ static void ggml_compute_backward(
             // src1.shape   [n,p,qq,rr]

             if (src0_needs_grads) {
-                struct ggml_tensor * s1_tg =
+                GGML_ASSERT(grad->ne[2] == src1->ne[2]);
+                GGML_ASSERT(grad->ne[3] == src1->ne[3]);
+                struct ggml_tensor * tmp =
                     ggml_out_prod(ctx, // [n,m,qq,rr]
                         src1,          // [n,p,qq,rr]
                         grad);         // [m,p,qq,rr]
-                const int64_t qq = s1_tg->ne[2];
-                const int64_t rr = s1_tg->ne[3];
-                const int64_t q1 = src0->ne[2];
-                const int64_t r1 = src0->ne[3];
-                const bool ne2_broadcasted = qq > q1;
-                const bool ne3_broadcasted = rr > r1;
-                if (ne2_broadcasted || ne3_broadcasted) {
-                    // sum broadcast repetitions of s1_tg into shape of src0
-                    s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
+                if (!ggml_are_same_shape(tmp, src0)) {
+                    GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
+                    GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
+                    GGML_ASSERT(tmp->ne[3] == 1);
+
+                    const int64_t nr2 = tmp->ne[2] / src0->ne[2];
+                    const size_t nb2 = tmp->nb[2] * nr2;
+                    const size_t nb3 = tmp->nb[2];
+
+                    tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
+                    tmp = ggml_repeat_back(ctx, tmp, src0);
                 }
-                ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
+                ggml_add_or_set(ctx, cgraph, isrc0, tmp);
             }
             if (src1_needs_grads) {
                 ggml_add_or_set(ctx, cgraph, isrc1,
...
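In the rewritten GGML_OP_MUL_MAT branch above, when src0 was broadcast along ne[2] in the forward pass, the out_prod gradient has nr2 = tmp->ne[2] / src0->ne[2] times as many slices; the ggml_view_4d call regroups those repeats into a fourth dimension so ggml_repeat_back can sum them back into src0's shape. A plain-C arithmetic sketch of that regrouping, independent of ggml and with made-up sizes:

#include <stdio.h>

// Sketch only: groups broadcast repeats into an extra dimension and sums over it,
// analogous to the ggml_view_4d + ggml_repeat_back reduction of the gradient above.
int main(void) {
    enum { NE2 = 2, NR2 = 3, ROWS = 4 };   // src0 had NE2 slices, each broadcast NR2 times
    float tmp[NE2 * NR2][ROWS];            // gradient slices, one per broadcast copy
    float reduced[NE2][ROWS] = {{0}};

    for (int s = 0; s < NE2 * NR2; ++s)
        for (int r = 0; r < ROWS; ++r)
            tmp[s][r] = (float) (s + 1);

    for (int i2 = 0; i2 < NE2; ++i2)
        for (int j = 0; j < NR2; ++j)
            for (int r = 0; r < ROWS; ++r)
                reduced[i2][r] += tmp[i2 * NR2 + j][r];

    printf("reduced[0][0] = %g\n", reduced[0][0]); // 1 + 2 + 3 = 6
    return 0;
}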
...
@@ -5491,7 +5547,9 @@ static void ggml_compute_backward(
             if (src0_needs_grads) {
                 GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
                 GGML_ASSERT(ggml_is_contiguous(grad));
-                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+                GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
+                ggml_add_or_set(ctx, cgraph, isrc0,
+                    ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
             }
         } break;
         case GGML_OP_RESHAPE: {
...
...
@@ -5571,7 +5629,13 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_SOFT_MAX: {
             if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor));
+                float scale    = 1.0f;
+                float max_bias = 0.0f;
+
+                memcpy(&scale,    (const float *) tensor->op_params + 0, sizeof(float));
+                memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float));
+
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias));
             }
             GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
         } break;
...
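The soft-max backward path above now forwards the forward op's scale and max_bias (ALiBi) parameters into ggml_soft_max_ext_back rather than calling the plain ggml_soft_max_back. For readers unfamiliar with how ggml packs these scalars, a hedged sketch of pulling them back out of op_params, mirroring the memcpy pattern in the hunk:

#include <string.h>
#include "ggml.h"

// Sketch only: reads the scale/max_bias pair of a GGML_OP_SOFT_MAX node the same way
// the backward pass above does; assumes the forward op stored them at float slots 0 and 1.
static void read_soft_max_params(const struct ggml_tensor * t, float * scale, float * max_bias) {
    memcpy(scale,    (const float *) t->op_params + 0, sizeof(float));
    memcpy(max_bias, (const float *) t->op_params + 1, sizeof(float));
}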
...
@@ -5583,6 +5647,7 @@ static void ggml_compute_backward(
                 //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
                 const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
                 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                int sections[4] = {0, 0, 0, 0};

                 memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
                 memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
...
...
@@ -5590,10 +5655,14 @@ static void ggml_compute_backward(
                 memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
                 memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
                 memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));
-                ggml_add_or_set(ctx, cgraph, isrc0,
-                    ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
+                memcpy(&sections, tensor->op_params + 11, sizeof(sections));
+
+                struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
+                    ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
+                    ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
             }
             GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
         } break;
...
...
@@ -5607,7 +5676,7 @@ static void ggml_compute_backward(
                 const int32_t d1 = ggml_get_op_params_i32(tensor, 5);
                 const bool is_2D = ggml_get_op_params_i32(tensor, 6) == 1;

-                ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
             }
         } break;
         case GGML_OP_POOL_2D: {
...
...
@@ -5650,7 +5719,7 @@ static void ggml_compute_backward(
                 } break;
                 case GGML_UNARY_OP_SILU: {
                     if (src0_needs_grads) {
-                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad));
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, grad, src0));
                     }
                 } break;
                 case GGML_UNARY_OP_EXP: {
...
...
@@ -5667,7 +5736,7 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_CROSS_ENTROPY_LOSS: {
             if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, grad, src0, src1));
             }
             GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
         } break;
...
...
@@ -6438,1271 +6507,6 @@ size_t ggml_quantize_chunk(
////////////////////////////////////////////////////////////////////////////////
struct
gguf_str
{
uint64_t
n
;
// GGUFv2
char
*
data
;
};
static
const
size_t
GGUF_TYPE_SIZE
[
GGUF_TYPE_COUNT
]
=
{
[
GGUF_TYPE_UINT8
]
=
sizeof
(
uint8_t
),
[
GGUF_TYPE_INT8
]
=
sizeof
(
int8_t
),
[
GGUF_TYPE_UINT16
]
=
sizeof
(
uint16_t
),
[
GGUF_TYPE_INT16
]
=
sizeof
(
int16_t
),
[
GGUF_TYPE_UINT32
]
=
sizeof
(
uint32_t
),
[
GGUF_TYPE_INT32
]
=
sizeof
(
int32_t
),
[
GGUF_TYPE_FLOAT32
]
=
sizeof
(
float
),
[
GGUF_TYPE_BOOL
]
=
sizeof
(
bool
),
[
GGUF_TYPE_STRING
]
=
sizeof
(
struct
gguf_str
),
[
GGUF_TYPE_UINT64
]
=
sizeof
(
uint64_t
),
[
GGUF_TYPE_INT64
]
=
sizeof
(
int64_t
),
[
GGUF_TYPE_FLOAT64
]
=
sizeof
(
double
),
[
GGUF_TYPE_ARRAY
]
=
0
,
// undefined
};
static_assert
(
GGUF_TYPE_COUNT
==
13
,
"GGUF_TYPE_COUNT != 13"
);
static
const
char
*
GGUF_TYPE_NAME
[
GGUF_TYPE_COUNT
]
=
{
[
GGUF_TYPE_UINT8
]
=
"u8"
,
[
GGUF_TYPE_INT8
]
=
"i8"
,
[
GGUF_TYPE_UINT16
]
=
"u16"
,
[
GGUF_TYPE_INT16
]
=
"i16"
,
[
GGUF_TYPE_UINT32
]
=
"u32"
,
[
GGUF_TYPE_INT32
]
=
"i32"
,
[
GGUF_TYPE_FLOAT32
]
=
"f32"
,
[
GGUF_TYPE_BOOL
]
=
"bool"
,
[
GGUF_TYPE_STRING
]
=
"str"
,
[
GGUF_TYPE_ARRAY
]
=
"arr"
,
[
GGUF_TYPE_UINT64
]
=
"u64"
,
[
GGUF_TYPE_INT64
]
=
"i64"
,
[
GGUF_TYPE_FLOAT64
]
=
"f64"
,
};
static_assert
(
GGUF_TYPE_COUNT
==
13
,
"GGUF_TYPE_COUNT != 13"
);
union
gguf_value
{
uint8_t
uint8
;
int8_t
int8
;
uint16_t
uint16
;
int16_t
int16
;
uint32_t
uint32
;
int32_t
int32
;
float
float32
;
uint64_t
uint64
;
int64_t
int64
;
double
float64
;
bool
bool_
;
struct
gguf_str
str
;
struct
{
enum
gguf_type
type
;
uint64_t
n
;
// GGUFv2
void
*
data
;
}
arr
;
};
struct
gguf_kv
{
struct
gguf_str
key
;
enum
gguf_type
type
;
union
gguf_value
value
;
};
struct
gguf_header
{
char
magic
[
4
];
uint32_t
version
;
uint64_t
n_tensors
;
// GGUFv2
uint64_t
n_kv
;
// GGUFv2
};
struct
gguf_tensor_info
{
struct
gguf_str
name
;
uint32_t
n_dims
;
uint64_t
ne
[
GGML_MAX_DIMS
];
enum
ggml_type
type
;
uint64_t
offset
;
// offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const
void
*
data
;
size_t
size
;
};
struct
gguf_context
{
struct
gguf_header
header
;
struct
gguf_kv
*
kv
;
struct
gguf_tensor_info
*
infos
;
size_t
alignment
;
size_t
offset
;
// offset of `data` from beginning of file
size_t
size
;
// size of `data` in bytes
//uint8_t * padding;
void
*
data
;
};
size_t
gguf_type_size
(
enum
gguf_type
type
)
{
GGML_ASSERT
(
0
<=
type
&&
type
<
GGUF_TYPE_COUNT
);
return
GGUF_TYPE_SIZE
[
type
];
}
static
bool
gguf_tensor_info_sanitize
(
struct
gguf_tensor_info
*
info
)
{
if
(
info
->
n_dims
>
GGML_MAX_DIMS
)
{
fprintf
(
stderr
,
"%s: invalid number of dimensions (%"
PRIu32
")
\n
"
,
__func__
,
info
->
n_dims
);
return
false
;
}
if
(
info
->
type
<
0
||
info
->
type
>=
GGML_TYPE_COUNT
)
{
fprintf
(
stderr
,
"%s: invalid type (%d)
\n
"
,
__func__
,
info
->
type
);
return
false
;
}
if
(
strlen
(
info
->
name
.
data
)
>=
GGML_MAX_NAME
)
{
fprintf
(
stderr
,
"%s: tensor '%s' name is too long
\n
"
,
__func__
,
info
->
name
.
data
);
return
false
;
}
for
(
uint32_t
i
=
0
;
i
<
info
->
n_dims
;
++
i
)
{
if
(
info
->
ne
[
i
]
<=
0
)
{
fprintf
(
stderr
,
"%s: invalid number of elements (%"
PRIu64
")
\n
"
,
__func__
,
info
->
ne
[
i
]);
return
false
;
}
}
// prevent overflow for total number of elements
if
(
INT64_MAX
/
info
->
ne
[
1
]
<=
info
->
ne
[
0
])
{
fprintf
(
stderr
,
"%s: invalid number of elements (%"
PRIu64
")
\n
"
,
__func__
,
info
->
ne
[
1
]);
return
false
;
}
if
(
INT64_MAX
/
info
->
ne
[
2
]
<=
info
->
ne
[
0
]
*
info
->
ne
[
1
])
{
fprintf
(
stderr
,
"%s: invalid number of elements (%"
PRIu64
")
\n
"
,
__func__
,
info
->
ne
[
2
]);
return
false
;
}
if
(
INT64_MAX
/
info
->
ne
[
3
]
<=
info
->
ne
[
0
]
*
info
->
ne
[
1
]
*
info
->
ne
[
2
])
{
fprintf
(
stderr
,
"%s: invalid number of elements (%"
PRIu64
")
\n
"
,
__func__
,
info
->
ne
[
3
]);
return
false
;
}
return
true
;
}
static
bool
gguf_fread_el
(
FILE
*
file
,
void
*
dst
,
size_t
size
,
size_t
*
offset
)
{
const
size_t
n
=
fread
(
dst
,
1
,
size
,
file
);
*
offset
+=
n
;
return
n
==
size
;
}
static
bool
gguf_fread_str
(
FILE
*
file
,
struct
gguf_str
*
p
,
size_t
*
offset
)
{
p
->
n
=
0
;
p
->
data
=
NULL
;
bool
ok
=
true
;
ok
=
ok
&&
gguf_fread_el
(
file
,
&
p
->
n
,
sizeof
(
p
->
n
),
offset
);
// early exit if string length is invalid, prevents from integer overflow
if
(
p
->
n
==
SIZE_MAX
)
{
fprintf
(
stderr
,
"%s: invalid string length (%"
PRIu64
")
\n
"
,
__func__
,
p
->
n
);
return
false
;
}
p
->
data
=
calloc
(
p
->
n
+
1
,
1
);
if
(
!
p
->
data
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for string of length %"
PRIu64
"
\n
"
,
__func__
,
p
->
n
);
return
false
;
}
ok
=
ok
&&
gguf_fread_el
(
file
,
p
->
data
,
p
->
n
,
offset
);
return
ok
;
}
static
void
gguf_free_kv
(
struct
gguf_kv
*
kv
)
{
if
(
kv
->
key
.
data
)
{
GGML_FREE
(
kv
->
key
.
data
);
}
if
(
kv
->
type
==
GGUF_TYPE_STRING
)
{
if
(
kv
->
value
.
str
.
data
)
{
GGML_FREE
(
kv
->
value
.
str
.
data
);
}
}
if
(
kv
->
type
==
GGUF_TYPE_ARRAY
)
{
if
(
kv
->
value
.
arr
.
data
)
{
if
(
kv
->
value
.
arr
.
type
==
GGUF_TYPE_STRING
)
{
for
(
uint64_t
j
=
0
;
j
<
kv
->
value
.
arr
.
n
;
++
j
)
{
struct
gguf_str
*
str
=
&
((
struct
gguf_str
*
)
kv
->
value
.
arr
.
data
)[
j
];
if
(
str
->
data
)
{
GGML_FREE
(
str
->
data
);
}
}
}
GGML_FREE
(
kv
->
value
.
arr
.
data
);
}
}
}
struct
gguf_context
*
gguf_init_empty
(
void
)
{
struct
gguf_context
*
ctx
=
calloc
(
1
,
sizeof
(
struct
gguf_context
));
if
(
!
ctx
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for context
\n
"
,
__func__
);
return
NULL
;
}
memcpy
(
ctx
->
header
.
magic
,
GGUF_MAGIC
,
sizeof
(
ctx
->
header
.
magic
));
ctx
->
header
.
version
=
GGUF_VERSION
;
ctx
->
header
.
n_tensors
=
0
;
ctx
->
header
.
n_kv
=
0
;
ctx
->
kv
=
NULL
;
ctx
->
infos
=
NULL
;
ctx
->
alignment
=
GGUF_DEFAULT_ALIGNMENT
;
ctx
->
offset
=
0
;
ctx
->
size
=
0
;
ctx
->
data
=
NULL
;
return
ctx
;
}
struct
gguf_context
*
gguf_init_from_file_impl
(
FILE
*
file
,
struct
gguf_init_params
params
)
{
// offset from start of file
size_t
offset
=
0
;
char
magic
[
4
];
// check the magic before making allocations
{
gguf_fread_el
(
file
,
&
magic
,
sizeof
(
magic
),
&
offset
);
for
(
uint32_t
i
=
0
;
i
<
sizeof
(
magic
);
i
++
)
{
if
(
magic
[
i
]
!=
GGUF_MAGIC
[
i
])
{
fprintf
(
stderr
,
"%s: invalid magic characters '%c%c%c%c'
\n
"
,
__func__
,
magic
[
0
],
magic
[
1
],
magic
[
2
],
magic
[
3
]);
return
NULL
;
}
}
}
bool
ok
=
true
;
struct
gguf_context
*
ctx
=
calloc
(
1
,
sizeof
(
struct
gguf_context
));
if
(
!
ctx
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for context
\n
"
,
__func__
);
return
NULL
;
}
// read the header
{
strncpy
(
ctx
->
header
.
magic
,
magic
,
4
);
ctx
->
kv
=
NULL
;
ctx
->
infos
=
NULL
;
ctx
->
data
=
NULL
;
ok
=
ok
&&
gguf_fread_el
(
file
,
&
ctx
->
header
.
version
,
sizeof
(
ctx
->
header
.
version
),
&
offset
);
ok
=
ok
&&
gguf_fread_el
(
file
,
&
ctx
->
header
.
n_tensors
,
sizeof
(
ctx
->
header
.
n_tensors
),
&
offset
);
ok
=
ok
&&
gguf_fread_el
(
file
,
&
ctx
->
header
.
n_kv
,
sizeof
(
ctx
->
header
.
n_kv
),
&
offset
);
if
(
ctx
->
header
.
version
==
1
)
{
fprintf
(
stderr
,
"%s: GGUFv1 is no longer supported. please use a more up-to-date version
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
// sanity-checks to prevent from integer/buffer overflows
ok
=
ok
&&
(
ctx
->
header
.
n_tensors
<
(
SIZE_MAX
/
2
)
/
sizeof
(
struct
gguf_tensor_info
));
ok
=
ok
&&
(
ctx
->
header
.
n_tensors
<
(
SIZE_MAX
/
2
)
/
ggml_tensor_overhead
());
ok
=
ok
&&
(
ctx
->
header
.
n_kv
<
(
SIZE_MAX
/
2
)
/
sizeof
(
struct
gguf_kv
));
if
(
!
ok
)
{
fprintf
(
stderr
,
"%s: failed to read header
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
}
// read the kv pairs
{
const
uint64_t
n_kv
=
ctx
->
header
.
n_kv
;
if
(
n_kv
>
0
)
{
ctx
->
kv
=
calloc
(
n_kv
,
sizeof
(
struct
gguf_kv
));
if
(
!
ctx
->
kv
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for kv pairs
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
}
for
(
uint64_t
i
=
0
;
i
<
n_kv
;
++
i
)
{
struct
gguf_kv
*
kv
=
&
ctx
->
kv
[
i
];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
ok
=
ok
&&
gguf_fread_str
(
file
,
&
kv
->
key
,
&
offset
);
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
type
,
sizeof
(
kv
->
type
),
&
offset
);
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
switch
(
kv
->
type
)
{
case
GGUF_TYPE_UINT8
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
uint8
,
sizeof
(
kv
->
value
.
uint8
),
&
offset
);
break
;
case
GGUF_TYPE_INT8
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
int8
,
sizeof
(
kv
->
value
.
int8
),
&
offset
);
break
;
case
GGUF_TYPE_UINT16
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
uint16
,
sizeof
(
kv
->
value
.
uint16
),
&
offset
);
break
;
case
GGUF_TYPE_INT16
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
int16
,
sizeof
(
kv
->
value
.
int16
),
&
offset
);
break
;
case
GGUF_TYPE_UINT32
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
uint32
,
sizeof
(
kv
->
value
.
uint32
),
&
offset
);
break
;
case
GGUF_TYPE_INT32
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
int32
,
sizeof
(
kv
->
value
.
int32
),
&
offset
);
break
;
case
GGUF_TYPE_FLOAT32
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
float32
,
sizeof
(
kv
->
value
.
float32
),
&
offset
);
break
;
case
GGUF_TYPE_UINT64
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
uint64
,
sizeof
(
kv
->
value
.
uint64
),
&
offset
);
break
;
case
GGUF_TYPE_INT64
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
int64
,
sizeof
(
kv
->
value
.
int64
),
&
offset
);
break
;
case
GGUF_TYPE_FLOAT64
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
float64
,
sizeof
(
kv
->
value
.
float64
),
&
offset
);
break
;
case
GGUF_TYPE_BOOL
:
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
bool_
,
sizeof
(
kv
->
value
.
bool_
),
&
offset
);
break
;
case
GGUF_TYPE_STRING
:
ok
=
ok
&&
gguf_fread_str
(
file
,
&
kv
->
value
.
str
,
&
offset
);
break
;
case
GGUF_TYPE_ARRAY
:
{
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
arr
.
type
,
sizeof
(
kv
->
value
.
arr
.
type
),
&
offset
);
ok
=
ok
&&
gguf_fread_el
(
file
,
&
kv
->
value
.
arr
.
n
,
sizeof
(
kv
->
value
.
arr
.
n
),
&
offset
);
switch
(
kv
->
value
.
arr
.
type
)
{
case
GGUF_TYPE_UINT8
:
case
GGUF_TYPE_INT8
:
case
GGUF_TYPE_UINT16
:
case
GGUF_TYPE_INT16
:
case
GGUF_TYPE_UINT32
:
case
GGUF_TYPE_INT32
:
case
GGUF_TYPE_FLOAT32
:
case
GGUF_TYPE_UINT64
:
case
GGUF_TYPE_INT64
:
case
GGUF_TYPE_FLOAT64
:
case
GGUF_TYPE_BOOL
:
{
// prevent from integer overflow in the malloc below
if
(
kv
->
value
.
arr
.
n
>=
SIZE_MAX
/
gguf_type_size
(
kv
->
value
.
arr
.
type
))
{
fprintf
(
stderr
,
"%s: array size is too large (%"
PRIu64
")
\n
"
,
__func__
,
kv
->
value
.
arr
.
n
);
gguf_free
(
ctx
);
return
NULL
;
}
kv
->
value
.
arr
.
data
=
calloc
(
kv
->
value
.
arr
.
n
,
gguf_type_size
(
kv
->
value
.
arr
.
type
));
if
(
!
kv
->
value
.
arr
.
data
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for array
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
ok
=
ok
&&
gguf_fread_el
(
file
,
kv
->
value
.
arr
.
data
,
kv
->
value
.
arr
.
n
*
gguf_type_size
(
kv
->
value
.
arr
.
type
),
&
offset
);
}
break
;
case
GGUF_TYPE_STRING
:
{
// prevent from integer overflow in the malloc below
if
(
kv
->
value
.
arr
.
n
>=
SIZE_MAX
/
sizeof
(
struct
gguf_str
))
{
fprintf
(
stderr
,
"%s: array size is too large (%"
PRIu64
")
\n
"
,
__func__
,
kv
->
value
.
arr
.
n
);
gguf_free
(
ctx
);
return
NULL
;
}
kv
->
value
.
arr
.
data
=
calloc
(
kv
->
value
.
arr
.
n
,
sizeof
(
struct
gguf_str
));
if
(
!
kv
->
value
.
arr
.
data
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for array
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
for
(
uint64_t
j
=
0
;
j
<
kv
->
value
.
arr
.
n
;
++
j
)
{
ok
=
ok
&&
gguf_fread_str
(
file
,
&
((
struct
gguf_str
*
)
kv
->
value
.
arr
.
data
)[
j
],
&
offset
);
}
}
break
;
case
GGUF_TYPE_ARRAY
:
default:
{
fprintf
(
stderr
,
"%s: invalid array type %d
\n
"
,
__func__
,
kv
->
value
.
arr
.
type
);
ok
=
false
;
}
break
;
}
}
break
;
default:
{
fprintf
(
stderr
,
"%s: invalid type %d
\n
"
,
__func__
,
kv
->
type
);
ok
=
false
;
}
break
;
}
if
(
!
ok
)
{
break
;
}
}
if
(
!
ok
)
{
fprintf
(
stderr
,
"%s: failed to read key-value pairs
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
}
// read the tensor infos
if
(
ctx
->
header
.
n_tensors
>
0
)
{
ctx
->
infos
=
calloc
(
ctx
->
header
.
n_tensors
,
sizeof
(
struct
gguf_tensor_info
));
if
(
!
ctx
->
infos
)
{
fprintf
(
stderr
,
"%s: failed to allocate memory for tensor infos
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
for
(
uint64_t
i
=
0
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
struct
gguf_tensor_info
*
info
=
&
ctx
->
infos
[
i
];
for
(
int
j
=
0
;
j
<
GGML_MAX_DIMS
;
++
j
)
{
info
->
ne
[
j
]
=
1
;
}
ok
=
ok
&&
gguf_fread_str
(
file
,
&
info
->
name
,
&
offset
);
ok
=
ok
&&
gguf_fread_el
(
file
,
&
info
->
n_dims
,
sizeof
(
info
->
n_dims
),
&
offset
);
ok
=
ok
&&
(
info
->
n_dims
<=
GGML_MAX_DIMS
);
for
(
uint32_t
j
=
0
;
j
<
info
->
n_dims
;
++
j
)
{
ok
=
ok
&&
gguf_fread_el
(
file
,
&
info
->
ne
[
j
],
sizeof
(
info
->
ne
[
j
]),
&
offset
);
}
ok
=
ok
&&
gguf_fread_el
(
file
,
&
info
->
type
,
sizeof
(
info
->
type
),
&
offset
);
ok
=
ok
&&
gguf_fread_el
(
file
,
&
info
->
offset
,
sizeof
(
info
->
offset
),
&
offset
);
ok
=
ok
&&
gguf_tensor_info_sanitize
(
info
);
// make sure there is no duplicated tensor names
for
(
uint64_t
j
=
0
;
j
<
i
&&
ok
;
++
j
)
{
if
(
strcmp
(
info
->
name
.
data
,
ctx
->
infos
[
j
].
name
.
data
)
==
0
)
{
fprintf
(
stderr
,
"%s: duplicated tensor name %s
\n
"
,
__func__
,
info
->
name
.
data
);
ok
=
false
;
}
}
if
(
!
ok
)
{
fprintf
(
stderr
,
"%s: failed to read tensor info
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
}
}
ctx
->
alignment
=
GGUF_DEFAULT_ALIGNMENT
;
int
alignment_idx
=
gguf_find_key
(
ctx
,
"general.alignment"
);
if
(
alignment_idx
!=
-
1
)
{
ctx
->
alignment
=
gguf_get_val_u32
(
ctx
,
alignment_idx
);
}
// we require the data section to be aligned, so take into account any padding
{
const
size_t
offset_pad
=
offset
%
ctx
->
alignment
;
if
(
offset_pad
!=
0
)
{
offset
+=
ctx
->
alignment
-
offset_pad
;
fseek
(
file
,
offset
,
SEEK_SET
);
}
}
// store the current file offset - this is where the data section starts
ctx
->
offset
=
offset
;
// compute the total size of the data section, taking into account the alignment
{
ctx
->
size
=
0
;
for
(
uint64_t
i
=
0
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
struct
gguf_tensor_info
*
info
=
&
ctx
->
infos
[
i
];
const
int64_t
ne
=
(
int64_t
)
info
->
ne
[
0
]
*
(
int64_t
)
info
->
ne
[
1
]
*
(
int64_t
)
info
->
ne
[
2
]
*
(
int64_t
)
info
->
ne
[
3
];
if
(
ggml_blck_size
(
info
->
type
)
==
0
)
{
// this tensor type support have been removed:
fprintf
(
stderr
,
"%s: tensor '%s' of type %d: %s
\n
"
,
__func__
,
info
->
name
.
data
,
(
int
)
info
->
type
,
ggml_type_name
(
info
->
type
));
gguf_free
(
ctx
);
return
NULL
;
}
if
(
ne
%
ggml_blck_size
(
info
->
type
)
!=
0
)
{
fprintf
(
stderr
,
"%s: tensor '%s' of type %d (%s) number of elements (%"
PRId64
") is not a multiple of block size (%"
PRId64
")
\n
"
,
__func__
,
info
->
name
.
data
,
(
int
)
info
->
type
,
ggml_type_name
(
info
->
type
),
ne
,
ggml_blck_size
(
info
->
type
));
gguf_free
(
ctx
);
return
NULL
;
}
const
size_t
size_cur
=
ggml_row_size
(
info
->
type
,
ne
);
ctx
->
size
+=
GGML_PAD
(
size_cur
,
ctx
->
alignment
);
}
}
// load the tensor data only if requested
if
(
params
.
ctx
!=
NULL
)
{
// if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
// otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
// the ggml_tensor structs to the appropriate locations in the binary blob
// compute the exact size needed for the new ggml_context
const
size_t
mem_size
=
params
.
no_alloc
?
(
ctx
->
header
.
n_tensors
)
*
ggml_tensor_overhead
()
:
(
ctx
->
header
.
n_tensors
+
1
)
*
ggml_tensor_overhead
()
+
ctx
->
size
;
struct
ggml_init_params
pdata
=
{
.
mem_size
=
mem_size
,
.
mem_buffer
=
NULL
,
.
no_alloc
=
params
.
no_alloc
,
};
*
params
.
ctx
=
ggml_init
(
pdata
);
if
(
*
params
.
ctx
==
NULL
)
{
fprintf
(
stderr
,
"%s: failed to initialize context
\n
"
,
__func__
);
gguf_free
(
ctx
);
return
NULL
;
}
struct
ggml_context
*
ctx_data
=
*
params
.
ctx
;
struct
ggml_tensor
*
data
=
NULL
;
if
(
!
params
.
no_alloc
)
{
data
=
ggml_new_tensor_1d
(
ctx_data
,
GGML_TYPE_I8
,
ctx
->
size
);
ok
=
ok
&&
data
!=
NULL
;
// read the binary blob with the tensor data
ok
=
ok
&&
gguf_fread_el
(
file
,
data
->
data
,
ctx
->
size
,
&
offset
);
if
(
!
ok
)
{
fprintf
(
stderr
,
"%s: failed to read tensor data
\n
"
,
__func__
);
ggml_free
(
ctx_data
);
gguf_free
(
ctx
);
return
NULL
;
}
ctx
->
data
=
data
->
data
;
}
ggml_set_no_alloc
(
ctx_data
,
true
);
// create the tensors
for
(
uint64_t
i
=
0
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
const
int64_t
ne
[
GGML_MAX_DIMS
]
=
{
ctx
->
infos
[
i
].
ne
[
0
],
ctx
->
infos
[
i
].
ne
[
1
],
ctx
->
infos
[
i
].
ne
[
2
],
ctx
->
infos
[
i
].
ne
[
3
],
};
struct
ggml_tensor
*
cur
=
ggml_new_tensor
(
ctx_data
,
ctx
->
infos
[
i
].
type
,
ctx
->
infos
[
i
].
n_dims
,
ne
);
ok
=
ok
&&
cur
!=
NULL
;
if
(
!
ok
)
{
break
;
}
ggml_set_name
(
cur
,
ctx
->
infos
[
i
].
name
.
data
);
// point the data member to the appropriate location in the binary blob using the tensor infos
if
(
!
params
.
no_alloc
)
{
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur
->
data
=
(
char
*
)
data
->
data
+
ctx
->
infos
[
i
].
offset
;
// offset from data
}
}
if
(
!
ok
)
{
fprintf
(
stderr
,
"%s: failed to read the tensor data
\n
"
,
__func__
);
ggml_free
(
ctx_data
);
gguf_free
(
ctx
);
return
NULL
;
}
ggml_set_no_alloc
(
ctx_data
,
params
.
no_alloc
);
}
return
ctx
;
}
struct
gguf_context
*
gguf_init_from_file
(
const
char
*
fname
,
struct
gguf_init_params
params
)
{
FILE
*
file
=
ggml_fopen
(
fname
,
"rb"
);
if
(
!
file
)
{
fprintf
(
stderr
,
"%s: failed to open '%s': '%s'
\n
"
,
__func__
,
fname
,
strerror
(
errno
));
return
NULL
;
}
struct
gguf_context
*
result
=
gguf_init_from_file_impl
(
file
,
params
);
fclose
(
file
);
return
result
;
}
void
gguf_free
(
struct
gguf_context
*
ctx
)
{
if
(
ctx
==
NULL
)
{
return
;
}
if
(
ctx
->
kv
)
{
// free string memory - not great..
for
(
uint64_t
i
=
0
;
i
<
ctx
->
header
.
n_kv
;
++
i
)
{
gguf_free_kv
(
&
ctx
->
kv
[
i
]);
}
GGML_FREE
(
ctx
->
kv
);
}
if
(
ctx
->
infos
)
{
for
(
uint64_t
i
=
0
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
struct
gguf_tensor_info
*
info
=
&
ctx
->
infos
[
i
];
if
(
info
->
name
.
data
)
{
GGML_FREE
(
info
->
name
.
data
);
}
}
GGML_FREE
(
ctx
->
infos
);
}
GGML_FREE
(
ctx
);
}
const
char
*
gguf_type_name
(
enum
gguf_type
type
)
{
return
GGUF_TYPE_NAME
[
type
];
}
int
gguf_get_version
(
const
struct
gguf_context
*
ctx
)
{
return
ctx
->
header
.
version
;
}
size_t
gguf_get_alignment
(
const
struct
gguf_context
*
ctx
)
{
return
ctx
->
alignment
;
}
size_t
gguf_get_data_offset
(
const
struct
gguf_context
*
ctx
)
{
return
ctx
->
offset
;
}
void
*
gguf_get_data
(
const
struct
gguf_context
*
ctx
)
{
return
ctx
->
data
;
}
int
gguf_get_n_kv
(
const
struct
gguf_context
*
ctx
)
{
return
ctx
->
header
.
n_kv
;
}
int
gguf_find_key
(
const
struct
gguf_context
*
ctx
,
const
char
*
key
)
{
// return -1 if key not found
int
keyfound
=
-
1
;
const
int
n_kv
=
gguf_get_n_kv
(
ctx
);
for
(
int
i
=
0
;
i
<
n_kv
;
++
i
)
{
if
(
strcmp
(
key
,
gguf_get_key
(
ctx
,
i
))
==
0
)
{
keyfound
=
i
;
break
;
}
}
return
keyfound
;
}
const
char
*
gguf_get_key
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
return
ctx
->
kv
[
key_id
].
key
.
data
;
}
enum
gguf_type
gguf_get_kv_type
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
return
ctx
->
kv
[
key_id
].
type
;
}
enum
gguf_type
gguf_get_arr_type
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_ARRAY
);
return
ctx
->
kv
[
key_id
].
value
.
arr
.
type
;
}
const
void
*
gguf_get_arr_data
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_ARRAY
);
return
ctx
->
kv
[
key_id
].
value
.
arr
.
data
;
}
const
char
*
gguf_get_arr_str
(
const
struct
gguf_context
*
ctx
,
int
key_id
,
int
i
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_ARRAY
);
struct
gguf_kv
*
kv
=
&
ctx
->
kv
[
key_id
];
struct
gguf_str
*
str
=
&
((
struct
gguf_str
*
)
kv
->
value
.
arr
.
data
)[
i
];
return
str
->
data
;
}
int
gguf_get_arr_n
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_ARRAY
);
return
ctx
->
kv
[
key_id
].
value
.
arr
.
n
;
}
uint8_t
gguf_get_val_u8
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_UINT8
);
return
ctx
->
kv
[
key_id
].
value
.
uint8
;
}
int8_t
gguf_get_val_i8
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_INT8
);
return
ctx
->
kv
[
key_id
].
value
.
int8
;
}
uint16_t
gguf_get_val_u16
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_UINT16
);
return
ctx
->
kv
[
key_id
].
value
.
uint16
;
}
int16_t
gguf_get_val_i16
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_INT16
);
return
ctx
->
kv
[
key_id
].
value
.
int16
;
}
uint32_t
gguf_get_val_u32
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_UINT32
);
return
ctx
->
kv
[
key_id
].
value
.
uint32
;
}
int32_t
gguf_get_val_i32
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_INT32
);
return
ctx
->
kv
[
key_id
].
value
.
int32
;
}
float
gguf_get_val_f32
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_FLOAT32
);
return
ctx
->
kv
[
key_id
].
value
.
float32
;
}
uint64_t
gguf_get_val_u64
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_UINT64
);
return
ctx
->
kv
[
key_id
].
value
.
uint64
;
}
int64_t
gguf_get_val_i64
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_INT64
);
return
ctx
->
kv
[
key_id
].
value
.
int64
;
}
double
gguf_get_val_f64
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_FLOAT64
);
return
ctx
->
kv
[
key_id
].
value
.
float64
;
}
bool
gguf_get_val_bool
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_BOOL
);
return
ctx
->
kv
[
key_id
].
value
.
bool_
;
}
const
char
*
gguf_get_val_str
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
==
GGUF_TYPE_STRING
);
return
ctx
->
kv
[
key_id
].
value
.
str
.
data
;
}
const
void
*
gguf_get_val_data
(
const
struct
gguf_context
*
ctx
,
int
key_id
)
{
GGML_ASSERT
(
key_id
>=
0
&&
key_id
<
gguf_get_n_kv
(
ctx
));
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
!=
GGUF_TYPE_ARRAY
);
GGML_ASSERT
(
ctx
->
kv
[
key_id
].
type
!=
GGUF_TYPE_STRING
);
return
&
ctx
->
kv
[
key_id
].
value
;
}
int
gguf_get_n_tensors
(
const
struct
gguf_context
*
ctx
)
{
return
ctx
->
header
.
n_tensors
;
}
int
gguf_find_tensor
(
const
struct
gguf_context
*
ctx
,
const
char
*
name
)
{
// return -1 if tensor not found
int
tensorfound
=
-
1
;
const
int
n_tensors
=
gguf_get_n_tensors
(
ctx
);
for
(
int
i
=
0
;
i
<
n_tensors
;
++
i
)
{
if
(
strcmp
(
name
,
gguf_get_tensor_name
(
ctx
,
i
))
==
0
)
{
tensorfound
=
i
;
break
;
}
}
return
tensorfound
;
}
size_t
gguf_get_tensor_offset
(
const
struct
gguf_context
*
ctx
,
int
i
)
{
return
ctx
->
infos
[
i
].
offset
;
}
char
*
gguf_get_tensor_name
(
const
struct
gguf_context
*
ctx
,
int
i
)
{
return
ctx
->
infos
[
i
].
name
.
data
;
}
enum
ggml_type
gguf_get_tensor_type
(
const
struct
gguf_context
*
ctx
,
int
i
)
{
return
ctx
->
infos
[
i
].
type
;
}
// returns the index
static
int
gguf_get_or_add_key
(
struct
gguf_context
*
ctx
,
const
char
*
key
)
{
const
int
idx
=
gguf_find_key
(
ctx
,
key
);
if
(
idx
>=
0
)
{
return
idx
;
}
const
int
n_kv
=
gguf_get_n_kv
(
ctx
);
ctx
->
kv
=
realloc
(
ctx
->
kv
,
(
n_kv
+
1
)
*
sizeof
(
struct
gguf_kv
));
ctx
->
kv
[
n_kv
].
key
.
n
=
strlen
(
key
);
ctx
->
kv
[
n_kv
].
key
.
data
=
strdup
(
key
);
ctx
->
header
.
n_kv
++
;
return
n_kv
;
}
void
gguf_remove_key
(
struct
gguf_context
*
ctx
,
const
char
*
key
)
{
const
int
idx
=
gguf_find_key
(
ctx
,
key
);
if
(
idx
>=
0
)
{
const
int
n_kv
=
gguf_get_n_kv
(
ctx
);
gguf_free_kv
(
&
ctx
->
kv
[
idx
]);
for
(
int
i
=
idx
;
i
<
n_kv
-
1
;
++
i
)
{
ctx
->
kv
[
i
]
=
ctx
->
kv
[
i
+
1
];
}
ctx
->
kv
=
realloc
(
ctx
->
kv
,
(
n_kv
-
1
)
*
sizeof
(
struct
gguf_kv
));
ctx
->
header
.
n_kv
--
;
}
}
void
gguf_set_val_u8
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
uint8_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_UINT8
;
ctx
->
kv
[
idx
].
value
.
uint8
=
val
;
}
void
gguf_set_val_i8
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
int8_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_INT8
;
ctx
->
kv
[
idx
].
value
.
int8
=
val
;
}
void
gguf_set_val_u16
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
uint16_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_UINT16
;
ctx
->
kv
[
idx
].
value
.
uint16
=
val
;
}
void
gguf_set_val_i16
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
int16_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_INT16
;
ctx
->
kv
[
idx
].
value
.
int16
=
val
;
}
void
gguf_set_val_u32
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
uint32_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_UINT32
;
ctx
->
kv
[
idx
].
value
.
uint32
=
val
;
}
void
gguf_set_val_i32
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
int32_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_INT32
;
ctx
->
kv
[
idx
].
value
.
int32
=
val
;
}
void
gguf_set_val_f32
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
float
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_FLOAT32
;
ctx
->
kv
[
idx
].
value
.
float32
=
val
;
}
void
gguf_set_val_u64
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
uint64_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_UINT64
;
ctx
->
kv
[
idx
].
value
.
uint64
=
val
;
}
void
gguf_set_val_i64
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
int64_t
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_INT64
;
ctx
->
kv
[
idx
].
value
.
int64
=
val
;
}
void
gguf_set_val_f64
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
double
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_FLOAT64
;
ctx
->
kv
[
idx
].
value
.
float64
=
val
;
}
void
gguf_set_val_bool
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
bool
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_BOOL
;
ctx
->
kv
[
idx
].
value
.
bool_
=
val
;
}
void
gguf_set_val_str
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
const
char
*
val
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_STRING
;
ctx
->
kv
[
idx
].
value
.
str
.
n
=
strlen
(
val
);
ctx
->
kv
[
idx
].
value
.
str
.
data
=
strdup
(
val
);
}
void
gguf_set_arr_data
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
enum
gguf_type
type
,
const
void
*
data
,
int
n
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_ARRAY
;
ctx
->
kv
[
idx
].
value
.
arr
.
type
=
type
;
ctx
->
kv
[
idx
].
value
.
arr
.
n
=
n
;
ctx
->
kv
[
idx
].
value
.
arr
.
data
=
GGML_CALLOC
(
n
,
gguf_type_size
(
type
));
memcpy
(
ctx
->
kv
[
idx
].
value
.
arr
.
data
,
data
,
n
*
gguf_type_size
(
type
));
}
void
gguf_set_arr_str
(
struct
gguf_context
*
ctx
,
const
char
*
key
,
const
char
**
data
,
int
n
)
{
const
int
idx
=
gguf_get_or_add_key
(
ctx
,
key
);
ctx
->
kv
[
idx
].
type
=
GGUF_TYPE_ARRAY
;
ctx
->
kv
[
idx
].
value
.
arr
.
type
=
GGUF_TYPE_STRING
;
ctx
->
kv
[
idx
].
value
.
arr
.
n
=
n
;
ctx
->
kv
[
idx
].
value
.
arr
.
data
=
GGML_CALLOC
(
n
,
sizeof
(
struct
gguf_str
));
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
struct
gguf_str
*
str
=
&
((
struct
gguf_str
*
)
ctx
->
kv
[
idx
].
value
.
arr
.
data
)[
i
];
str
->
n
=
strlen
(
data
[
i
]);
str
->
data
=
strdup
(
data
[
i
]);
}
}
// set or add KV pairs from another context
void
gguf_set_kv
(
struct
gguf_context
*
ctx
,
struct
gguf_context
*
src
)
{
for
(
uint32_t
i
=
0
;
i
<
src
->
header
.
n_kv
;
i
++
)
{
switch
(
src
->
kv
[
i
].
type
)
{
case
GGUF_TYPE_UINT8
:
gguf_set_val_u8
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
uint8
);
break
;
case
GGUF_TYPE_INT8
:
gguf_set_val_i8
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
int8
);
break
;
case
GGUF_TYPE_UINT16
:
gguf_set_val_u16
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
uint16
);
break
;
case
GGUF_TYPE_INT16
:
gguf_set_val_i16
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
int16
);
break
;
case
GGUF_TYPE_UINT32
:
gguf_set_val_u32
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
uint32
);
break
;
case
GGUF_TYPE_INT32
:
gguf_set_val_i32
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
int32
);
break
;
case
GGUF_TYPE_FLOAT32
:
gguf_set_val_f32
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
float32
);
break
;
case
GGUF_TYPE_UINT64
:
gguf_set_val_u64
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
uint64
);
break
;
case
GGUF_TYPE_INT64
:
gguf_set_val_i64
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
int64
);
break
;
case
GGUF_TYPE_FLOAT64
:
gguf_set_val_f64
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
float64
);
break
;
case
GGUF_TYPE_BOOL
:
gguf_set_val_bool
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
bool_
);
break
;
case
GGUF_TYPE_STRING
:
gguf_set_val_str
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
str
.
data
);
break
;
case
GGUF_TYPE_ARRAY
:
{
if
(
src
->
kv
[
i
].
value
.
arr
.
type
==
GGUF_TYPE_STRING
)
{
const
char
**
data
=
GGML_CALLOC
(
src
->
kv
[
i
].
value
.
arr
.
n
,
sizeof
(
char
*
));
for
(
uint32_t
j
=
0
;
j
<
src
->
kv
[
i
].
value
.
arr
.
n
;
j
++
)
{
data
[
j
]
=
((
struct
gguf_str
*
)
src
->
kv
[
i
].
value
.
arr
.
data
)[
j
].
data
;
}
gguf_set_arr_str
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
data
,
src
->
kv
[
i
].
value
.
arr
.
n
);
GGML_FREE
((
void
*
)
data
);
}
else
if
(
src
->
kv
[
i
].
value
.
arr
.
type
==
GGUF_TYPE_ARRAY
)
{
GGML_ABORT
(
"nested arrays not supported"
);
}
else
{
gguf_set_arr_data
(
ctx
,
src
->
kv
[
i
].
key
.
data
,
src
->
kv
[
i
].
value
.
arr
.
type
,
src
->
kv
[
i
].
value
.
arr
.
data
,
src
->
kv
[
i
].
value
.
arr
.
n
);
}
}
break
;
default:
GGML_ABORT
(
"invalid type"
);
}
}
}
void
gguf_add_tensor
(
struct
gguf_context
*
ctx
,
const
struct
ggml_tensor
*
tensor
)
{
GGML_ASSERT
(
tensor
);
if
(
gguf_find_tensor
(
ctx
,
tensor
->
name
)
!=
-
1
)
{
GGML_ABORT
(
"duplicated tensor name"
);
}
const
int
idx
=
ctx
->
header
.
n_tensors
;
ctx
->
infos
=
realloc
(
ctx
->
infos
,
(
idx
+
1
)
*
sizeof
(
struct
gguf_tensor_info
));
ctx
->
infos
[
idx
].
name
.
n
=
strlen
(
tensor
->
name
);
ctx
->
infos
[
idx
].
name
.
data
=
strdup
(
tensor
->
name
);
for
(
int
i
=
0
;
i
<
GGML_MAX_DIMS
;
++
i
)
{
ctx
->
infos
[
idx
].
ne
[
i
]
=
1
;
}
ctx
->
infos
[
idx
].
n_dims
=
ggml_n_dims
(
tensor
);
for
(
uint32_t
i
=
0
;
i
<
ctx
->
infos
[
idx
].
n_dims
;
i
++
)
{
ctx
->
infos
[
idx
].
ne
[
i
]
=
tensor
->
ne
[
i
];
}
ctx
->
infos
[
idx
].
type
=
tensor
->
type
;
ctx
->
infos
[
idx
].
offset
=
0
;
ctx
->
infos
[
idx
].
data
=
tensor
->
data
;
ctx
->
infos
[
idx
].
size
=
ggml_nbytes
(
tensor
);
if
(
ctx
->
header
.
n_tensors
>
0
)
{
ctx
->
infos
[
idx
].
offset
=
ctx
->
infos
[
idx
-
1
].
offset
+
GGML_PAD
(
ctx
->
infos
[
idx
-
1
].
size
,
ctx
->
alignment
);
}
ctx
->
header
.
n_tensors
++
;
}
void
gguf_set_tensor_type
(
struct
gguf_context
*
ctx
,
const
char
*
name
,
enum
ggml_type
type
)
{
const
int
idx
=
gguf_find_tensor
(
ctx
,
name
);
if
(
idx
<
0
)
{
GGML_ABORT
(
"tensor not found"
);
}
ctx
->
infos
[
idx
].
type
=
type
;
}
void
gguf_set_tensor_data
(
struct
gguf_context
*
ctx
,
const
char
*
name
,
const
void
*
data
,
size_t
size
)
{
const
int
idx
=
gguf_find_tensor
(
ctx
,
name
);
if
(
idx
<
0
)
{
GGML_ABORT
(
"tensor not found"
);
}
ctx
->
infos
[
idx
].
data
=
data
;
ctx
->
infos
[
idx
].
size
=
size
;
// update offsets
for
(
uint32_t
i
=
idx
+
1
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
ctx
->
infos
[
i
].
offset
=
ctx
->
infos
[
i
-
1
].
offset
+
GGML_PAD
(
ctx
->
infos
[
i
-
1
].
size
,
ctx
->
alignment
);
}
}
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
// fwrite(&val->n, sizeof(val->n), 1, file);
// fwrite(val->data, sizeof(char), val->n, file);
//}
//
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
// fwrite(val, sizeof(char), size, file);
//}
struct
gguf_buf
gguf_buf_init
(
size_t
size
)
{
struct
gguf_buf
buf
=
{
/*buf.data =*/
size
==
0
?
NULL
:
GGML_CALLOC
(
1
,
size
),
/*buf.size =*/
size
,
/*buf.offset =*/
0
,
};
return
buf
;
}
void
gguf_buf_free
(
struct
gguf_buf
buf
)
{
if
(
buf
.
data
)
{
GGML_FREE
(
buf
.
data
);
}
}
static
void
gguf_buf_grow
(
struct
gguf_buf
*
buf
,
size_t
size
)
{
if
(
buf
->
offset
+
size
>
buf
->
size
)
{
buf
->
size
=
1
.
5
*
(
buf
->
offset
+
size
);
if
(
buf
->
data
)
{
buf
->
data
=
realloc
(
buf
->
data
,
buf
->
size
);
}
}
}
static
void
gguf_bwrite_str
(
struct
gguf_buf
*
buf
,
const
struct
gguf_str
*
val
)
{
gguf_buf_grow
(
buf
,
sizeof
(
val
->
n
)
+
val
->
n
);
if
(
buf
->
data
)
{
memcpy
((
char
*
)
buf
->
data
+
buf
->
offset
,
&
val
->
n
,
sizeof
(
val
->
n
));
}
buf
->
offset
+=
sizeof
(
val
->
n
);
if
(
buf
->
data
)
{
memcpy
((
char
*
)
buf
->
data
+
buf
->
offset
,
val
->
data
,
val
->
n
);
}
buf
->
offset
+=
val
->
n
;
}
static
void
gguf_bwrite_el
(
struct
gguf_buf
*
buf
,
const
void
*
val
,
size_t
el_size
)
{
gguf_buf_grow
(
buf
,
el_size
);
if
(
buf
->
data
)
{
memcpy
((
char
*
)
buf
->
data
+
buf
->
offset
,
val
,
el_size
);
}
buf
->
offset
+=
el_size
;
}
void
gguf_write_to_buf
(
const
struct
gguf_context
*
ctx
,
struct
gguf_buf
*
buf
,
bool
only_meta
)
{
// write header
gguf_bwrite_el
(
buf
,
&
ctx
->
header
.
magic
,
sizeof
(
ctx
->
header
.
magic
));
gguf_bwrite_el
(
buf
,
&
ctx
->
header
.
version
,
sizeof
(
ctx
->
header
.
version
));
gguf_bwrite_el
(
buf
,
&
ctx
->
header
.
n_tensors
,
sizeof
(
ctx
->
header
.
n_tensors
));
gguf_bwrite_el
(
buf
,
&
ctx
->
header
.
n_kv
,
sizeof
(
ctx
->
header
.
n_kv
));
// write key-value pairs
for
(
uint32_t
i
=
0
;
i
<
ctx
->
header
.
n_kv
;
++
i
)
{
struct
gguf_kv
*
kv
=
&
ctx
->
kv
[
i
];
gguf_bwrite_str
(
buf
,
&
kv
->
key
);
gguf_bwrite_el
(
buf
,
&
kv
->
type
,
sizeof
(
kv
->
type
));
switch
(
kv
->
type
)
{
case
GGUF_TYPE_UINT8
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
uint8
,
sizeof
(
kv
->
value
.
uint8
)
);
break
;
case
GGUF_TYPE_INT8
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
int8
,
sizeof
(
kv
->
value
.
int8
)
);
break
;
case
GGUF_TYPE_UINT16
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
uint16
,
sizeof
(
kv
->
value
.
uint16
)
);
break
;
case
GGUF_TYPE_INT16
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
int16
,
sizeof
(
kv
->
value
.
int16
)
);
break
;
case
GGUF_TYPE_UINT32
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
uint32
,
sizeof
(
kv
->
value
.
uint32
)
);
break
;
case
GGUF_TYPE_INT32
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
int32
,
sizeof
(
kv
->
value
.
int32
)
);
break
;
case
GGUF_TYPE_FLOAT32
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
float32
,
sizeof
(
kv
->
value
.
float32
));
break
;
case
GGUF_TYPE_UINT64
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
uint64
,
sizeof
(
kv
->
value
.
uint64
)
);
break
;
case
GGUF_TYPE_INT64
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
int64
,
sizeof
(
kv
->
value
.
int64
)
);
break
;
case
GGUF_TYPE_FLOAT64
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
float64
,
sizeof
(
kv
->
value
.
float64
));
break
;
case
GGUF_TYPE_BOOL
:
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
bool_
,
sizeof
(
kv
->
value
.
bool_
)
);
break
;
case
GGUF_TYPE_STRING
:
gguf_bwrite_str
(
buf
,
&
kv
->
value
.
str
);
break
;
case
GGUF_TYPE_ARRAY
:
{
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
arr
.
type
,
sizeof
(
kv
->
value
.
arr
.
type
));
gguf_bwrite_el
(
buf
,
&
kv
->
value
.
arr
.
n
,
sizeof
(
kv
->
value
.
arr
.
n
)
);
switch
(
kv
->
value
.
arr
.
type
)
{
case
GGUF_TYPE_UINT8
:
case
GGUF_TYPE_INT8
:
case
GGUF_TYPE_UINT16
:
case
GGUF_TYPE_INT16
:
case
GGUF_TYPE_UINT32
:
case
GGUF_TYPE_INT32
:
case
GGUF_TYPE_FLOAT32
:
case
GGUF_TYPE_UINT64
:
case
GGUF_TYPE_INT64
:
case
GGUF_TYPE_FLOAT64
:
case
GGUF_TYPE_BOOL
:
{
gguf_bwrite_el
(
buf
,
kv
->
value
.
arr
.
data
,
kv
->
value
.
arr
.
n
*
gguf_type_size
(
kv
->
value
.
arr
.
type
));
}
break
;
case
GGUF_TYPE_STRING
:
{
for
(
uint32_t
j
=
0
;
j
<
kv
->
value
.
arr
.
n
;
++
j
)
{
gguf_bwrite_str
(
buf
,
&
((
struct
gguf_str
*
)
kv
->
value
.
arr
.
data
)[
j
]);
}
}
break
;
case
GGUF_TYPE_ARRAY
:
default:
GGML_ABORT
(
"invalid type"
);
}
}
break
;
default:
GGML_ABORT
(
"invalid type"
);
}
}
// write tensor infos
for
(
uint32_t
i
=
0
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
struct
gguf_tensor_info
*
info
=
&
ctx
->
infos
[
i
];
gguf_bwrite_str
(
buf
,
&
info
->
name
);
gguf_bwrite_el
(
buf
,
&
info
->
n_dims
,
sizeof
(
info
->
n_dims
));
for
(
uint32_t
j
=
0
;
j
<
info
->
n_dims
;
++
j
)
{
gguf_bwrite_el
(
buf
,
&
info
->
ne
[
j
],
sizeof
(
info
->
ne
[
j
]));
}
gguf_bwrite_el
(
buf
,
&
info
->
type
,
sizeof
(
info
->
type
));
gguf_bwrite_el
(
buf
,
&
info
->
offset
,
sizeof
(
info
->
offset
));
}
// we require the data section to be aligned, so take into account any padding
{
const
size_t
offset
=
buf
->
offset
;
const
size_t
offset_pad
=
GGML_PAD
(
offset
,
ctx
->
alignment
);
if
(
offset_pad
!=
offset
)
{
uint8_t
pad
=
0
;
for
(
size_t
i
=
0
;
i
<
offset_pad
-
offset
;
++
i
)
{
gguf_bwrite_el
(
buf
,
&
pad
,
sizeof
(
pad
));
}
}
}
if
(
only_meta
)
{
return
;
}
size_t
offset
=
0
;
// write tensor data
for
(
uint32_t
i
=
0
;
i
<
ctx
->
header
.
n_tensors
;
++
i
)
{
struct
gguf_tensor_info
*
info
=
&
ctx
->
infos
[
i
];
const
size_t
size
=
info
->
size
;
const
size_t
size_pad
=
GGML_PAD
(
size
,
ctx
->
alignment
);
gguf_bwrite_el
(
buf
,
info
->
data
,
size
);
if
(
size_pad
!=
size
)
{
uint8_t
pad
=
0
;
for
(
size_t
j
=
0
;
j
<
size_pad
-
size
;
++
j
)
{
gguf_bwrite_el
(
buf
,
&
pad
,
sizeof
(
pad
));
}
}
GGML_ASSERT
(
offset
==
info
->
offset
);
offset
+=
size_pad
;
}
}
void
gguf_write_to_file
(
const
struct
gguf_context
*
ctx
,
const
char
*
fname
,
bool
only_meta
)
{
FILE
*
file
=
ggml_fopen
(
fname
,
"wb"
);
if
(
!
file
)
{
GGML_ABORT
(
"failed to open file for writing"
);
}
struct
gguf_buf
buf
=
gguf_buf_init
(
16
*
1024
);
gguf_write_to_buf
(
ctx
,
&
buf
,
only_meta
);
fwrite
(
buf
.
data
,
1
,
buf
.
offset
,
file
);
gguf_buf_free
(
buf
);
fclose
(
file
);
}
size_t
gguf_get_meta_size
(
const
struct
gguf_context
*
ctx
)
{
// no allocs - only compute size
struct
gguf_buf
buf
=
gguf_buf_init
(
0
);
gguf_write_to_buf
(
ctx
,
&
buf
,
true
);
return
buf
.
offset
;
}
void
gguf_get_meta_data
(
const
struct
gguf_context
*
ctx
,
void
*
data
)
{
struct
gguf_buf
buf
=
gguf_buf_init
(
16
*
1024
);
gguf_write_to_buf
(
ctx
,
&
buf
,
true
);
memcpy
(
data
,
buf
.
data
,
buf
.
offset
);
gguf_buf_free
(
buf
);
}
void
ggml_log_set
(
ggml_log_callback
log_callback
,
void
*
user_data
)
{
g_logger_state
.
log_callback
=
log_callback
?
log_callback
:
ggml_log_callback_default
;
g_logger_state
.
log_callback_user_data
=
user_data
;
...
...
ml/backend/ggml/ggml/src/ggml_darwin_arm64.go
View file @
d7d7e996
 package ggml

-// #cgo CPPFLAGS: -DGGML_USE_METAL -DGGML_USE_BLAS
+// #cgo CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_BLAS
 // #cgo LDFLAGS: -framework Foundation
 import "C"
...
...
ml/backend/ggml/ggml/src/gguf.cpp
0 → 100644
View file @
d7d7e996
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "gguf.h"
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <new>
#include <stdexcept>
#include <string>
#include <vector>
template <typename T>
struct type_to_gguf_type;

template <> struct type_to_gguf_type<uint8_t>     { static constexpr enum gguf_type value = GGUF_TYPE_UINT8;   };
template <> struct type_to_gguf_type<int8_t>      { static constexpr enum gguf_type value = GGUF_TYPE_INT8;    };
template <> struct type_to_gguf_type<uint16_t>    { static constexpr enum gguf_type value = GGUF_TYPE_UINT16;  };
template <> struct type_to_gguf_type<int16_t>     { static constexpr enum gguf_type value = GGUF_TYPE_INT16;   };
template <> struct type_to_gguf_type<uint32_t>    { static constexpr enum gguf_type value = GGUF_TYPE_UINT32;  };
template <> struct type_to_gguf_type<int32_t>     { static constexpr enum gguf_type value = GGUF_TYPE_INT32;   };
template <> struct type_to_gguf_type<float>       { static constexpr enum gguf_type value = GGUF_TYPE_FLOAT32; };
template <> struct type_to_gguf_type<bool>        { static constexpr enum gguf_type value = GGUF_TYPE_BOOL;    };
template <> struct type_to_gguf_type<std::string> { static constexpr enum gguf_type value = GGUF_TYPE_STRING;  };
template <> struct type_to_gguf_type<uint64_t>    { static constexpr enum gguf_type value = GGUF_TYPE_UINT64;  };
template <> struct type_to_gguf_type<int64_t>     { static constexpr enum gguf_type value = GGUF_TYPE_INT64;   };
template <> struct type_to_gguf_type<double>      { static constexpr enum gguf_type value = GGUF_TYPE_FLOAT64; };

static const std::map<gguf_type, size_t> GGUF_TYPE_SIZE = {
    {GGUF_TYPE_UINT8,   sizeof(uint8_t)},
    {GGUF_TYPE_INT8,    sizeof(int8_t)},
    {GGUF_TYPE_UINT16,  sizeof(uint16_t)},
    {GGUF_TYPE_INT16,   sizeof(int16_t)},
    {GGUF_TYPE_UINT32,  sizeof(uint32_t)},
    {GGUF_TYPE_INT32,   sizeof(int32_t)},
    {GGUF_TYPE_FLOAT32, sizeof(float)},
    {GGUF_TYPE_BOOL,    sizeof(int8_t)},
    {GGUF_TYPE_STRING,  0}, // undefined
    {GGUF_TYPE_ARRAY,   0}, // undefined
    {GGUF_TYPE_UINT64,  sizeof(uint64_t)},
    {GGUF_TYPE_INT64,   sizeof(int64_t)},
    {GGUF_TYPE_FLOAT64, sizeof(double)},
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

static const std::map<gguf_type, const char *> GGUF_TYPE_NAME = {
    {GGUF_TYPE_UINT8,   "u8"},
    {GGUF_TYPE_INT8,    "i8"},
    {GGUF_TYPE_UINT16,  "u16"},
    {GGUF_TYPE_INT16,   "i16"},
    {GGUF_TYPE_UINT32,  "u32"},
    {GGUF_TYPE_INT32,   "i32"},
    {GGUF_TYPE_FLOAT32, "f32"},
    {GGUF_TYPE_BOOL,    "bool"},
    {GGUF_TYPE_STRING,  "str"},
    {GGUF_TYPE_ARRAY,   "arr"},
    {GGUF_TYPE_UINT64,  "u64"},
    {GGUF_TYPE_INT64,   "i64"},
    {GGUF_TYPE_FLOAT64, "f64"},
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

size_t gguf_type_size(enum gguf_type type) {
    auto it = GGUF_TYPE_SIZE.find(type);
    return it == GGUF_TYPE_SIZE.end() ? 0 : it->second;
}
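These std::map lookups replace the designated-initializer arrays that the old C implementation (removed from ggml.c above) used for the same tables; the public C API declared in gguf.h is unchanged. A hedged sketch of the typical read path through that API; the file name is a placeholder:

#include <stdio.h>
#include "gguf.h"

// Sketch only: opens a GGUF file, looks up one metadata key, and frees the context.
// "model.gguf" is a placeholder path; "general.alignment" is a key the loader itself consults.
int main(void) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
    struct gguf_context * gguf = gguf_init_from_file("model.gguf", params);
    if (!gguf) {
        return 1;
    }

    const int idx = gguf_find_key(gguf, "general.alignment");
    if (idx >= 0) {
        printf("alignment: %u\n", (unsigned) gguf_get_val_u32(gguf, idx));
    }

    gguf_free(gguf);
    return 0;
}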
struct
gguf_kv
{
std
::
string
key
;
bool
is_array
;
enum
gguf_type
type
;
std
::
vector
<
int8_t
>
data
;
std
::
vector
<
std
::
string
>
data_string
;
template
<
typename
T
>
gguf_kv
(
const
std
::
string
&
key
,
const
T
value
)
:
key
(
key
),
is_array
(
false
),
type
(
type_to_gguf_type
<
T
>::
value
)
{
GGML_ASSERT
(
!
key
.
empty
());
data
.
resize
(
sizeof
(
T
));
memcpy
(
data
.
data
(),
&
value
,
sizeof
(
T
));
}
template
<
typename
T
>
gguf_kv
(
const
std
::
string
&
key
,
const
std
::
vector
<
T
>
&
value
)
:
key
(
key
),
is_array
(
true
),
type
(
type_to_gguf_type
<
T
>::
value
)
{
GGML_ASSERT
(
!
key
.
empty
());
data
.
resize
(
value
.
size
()
*
sizeof
(
T
));
for
(
size_t
i
=
0
;
i
<
value
.
size
();
++
i
)
{
const
T
tmp
=
value
[
i
];
memcpy
(
data
.
data
()
+
i
*
sizeof
(
T
),
&
tmp
,
sizeof
(
T
));
}
}
gguf_kv
(
const
std
::
string
&
key
,
const
std
::
string
&
value
)
:
key
(
key
),
is_array
(
false
),
type
(
GGUF_TYPE_STRING
)
{
GGML_ASSERT
(
!
key
.
empty
());
data_string
.
push_back
(
value
);
}
gguf_kv
(
const
std
::
string
&
key
,
const
std
::
vector
<
std
::
string
>
&
value
)
:
key
(
key
),
is_array
(
true
),
type
(
GGUF_TYPE_STRING
)
{
GGML_ASSERT
(
!
key
.
empty
());
data_string
=
value
;
}
const
std
::
string
&
get_key
()
const
{
return
key
;
}
const
enum
gguf_type
&
get_type
()
const
{
return
type
;
}
size_t
get_ne
()
const
{
if
(
type
==
GGUF_TYPE_STRING
)
{
const
size_t
ne
=
data_string
.
size
();
GGML_ASSERT
(
is_array
||
ne
==
1
);
return
ne
;
}
const
size_t
type_size
=
gguf_type_size
(
type
);
GGML_ASSERT
(
data
.
size
()
%
type_size
==
0
);
const
size_t
ne
=
data
.
size
()
/
type_size
;
GGML_ASSERT
(
is_array
||
ne
==
1
);
return
ne
;
}
template
<
typename
T
>
const
T
&
get_val
(
const
size_t
i
=
0
)
const
{
GGML_ASSERT
(
type_to_gguf_type
<
T
>::
value
==
type
);
if
constexpr
(
std
::
is_same
<
T
,
std
::
string
>::
value
)
{
GGML_ASSERT
(
data_string
.
size
()
>=
i
+
1
);
return
data_string
[
i
];
}
const
size_t
type_size
=
gguf_type_size
(
type
);
GGML_ASSERT
(
data
.
size
()
%
type_size
==
0
);
GGML_ASSERT
(
data
.
size
()
>=
(
i
+
1
)
*
type_size
);
return
reinterpret_cast
<
const
T
*>
(
data
.
data
())[
i
];
}
void
cast
(
const
enum
gguf_type
new_type
)
{
const
size_t
new_type_size
=
gguf_type_size
(
new_type
);
GGML_ASSERT
(
data
.
size
()
%
new_type_size
==
0
);
type
=
new_type
;
}
};
struct
gguf_tensor_info
{
struct
ggml_tensor
t
;
// for holding the equivalent info
uint64_t
offset
;
// offset from start of `data`, must be a multiple of `ALIGNMENT`
};
struct
gguf_context
{
uint32_t
version
=
GGUF_VERSION
;
std
::
vector
<
struct
gguf_kv
>
kv
;
std
::
vector
<
struct
gguf_tensor_info
>
info
;
size_t
alignment
=
GGUF_DEFAULT_ALIGNMENT
;
size_t
offset
=
0
;
// offset of `data` from beginning of file
size_t
size
=
0
;
// size of `data` in bytes
void
*
data
=
nullptr
;
};
struct
gguf_reader
{
FILE
*
file
;
gguf_reader
(
FILE
*
file
)
:
file
(
file
)
{}
template
<
typename
T
>
bool
read
(
T
&
dst
)
const
{
return
fread
(
&
dst
,
1
,
sizeof
(
dst
),
file
)
==
sizeof
(
dst
);
}
template
<
typename
T
>
bool
read
(
std
::
vector
<
T
>
&
dst
,
const
size_t
n
)
const
{
dst
.
resize
(
n
);
for
(
size_t
i
=
0
;
i
<
dst
.
size
();
++
i
)
{
if
constexpr
(
std
::
is_same
<
T
,
bool
>::
value
)
{
bool
tmp
;
if
(
!
read
(
tmp
))
{
return
false
;
}
dst
[
i
]
=
tmp
;
}
else
{
if
(
!
read
(
dst
[
i
]))
{
return
false
;
}
}
}
return
true
;
}
bool
read
(
bool
&
dst
)
const
{
int8_t
tmp
=
-
1
;
if
(
!
read
(
tmp
))
{
return
false
;
}
dst
=
tmp
!=
0
;
return
true
;
}
bool
read
(
enum
ggml_type
&
dst
)
const
{
int32_t
tmp
=
-
1
;
if
(
!
read
(
tmp
))
{
return
false
;
}
dst
=
ggml_type
(
tmp
);
return
true
;
}
bool
read
(
enum
gguf_type
&
dst
)
const
{
int32_t
tmp
=
-
1
;
if
(
!
read
(
tmp
))
{
return
false
;
}
dst
=
gguf_type
(
tmp
);
return
true
;
}
bool
read
(
std
::
string
&
dst
)
const
{
uint64_t
size
=
-
1
;
if
(
!
read
(
size
))
{
return
false
;
}
dst
.
resize
(
size
);
return
fread
(
dst
.
data
(),
1
,
dst
.
length
(),
file
)
==
dst
.
length
();
}
bool
read
(
void
*
dst
,
const
size_t
size
)
const
{
return
fread
(
dst
,
1
,
size
,
file
)
==
size
;
}
};
struct
gguf_context
*
gguf_init_empty
(
void
)
{
return
new
gguf_context
;
}
template
<
typename
T
>
bool
gguf_read_emplace_helper
(
const
struct
gguf_reader
&
gr
,
std
::
vector
<
struct
gguf_kv
>
&
kv
,
const
std
::
string
&
key
,
const
bool
is_array
,
const
size_t
n
)
{
if
(
is_array
)
{
std
::
vector
<
T
>
value
;
try
{
if
(
!
gr
.
read
(
value
,
n
))
{
return
false
;
}
}
catch
(
std
::
length_error
&
)
{
fprintf
(
stderr
,
"%s: encountered length_error while reading value for key '%s'
\n
"
,
__func__
,
key
.
c_str
());
return
false
;
}
catch
(
std
::
bad_alloc
&
)
{
fprintf
(
stderr
,
"%s: encountered bad_alloc error while reading value for key '%s'
\n
"
,
__func__
,
key
.
c_str
());
return
false
;
}
kv
.
emplace_back
(
key
,
value
);
}
else
{
T
value
;
if
(
!
gr
.
read
(
value
))
{
return
false
;
}
kv
.
emplace_back
(
key
,
value
);
}
return
true
;
}
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
    const struct gguf_reader gr(file);
    struct gguf_context * ctx = new gguf_context;

    bool ok = true;

    // file magic
    {
        std::vector<char> magic;
        ok = ok && gr.read(magic, 4);

        if (!ok) {
            fprintf(stderr, "%s: failed to read magic\n", __func__);
            gguf_free(ctx);
            return nullptr;
        }

        for (uint32_t i = 0; i < magic.size(); i++) {
            if (magic[i] != GGUF_MAGIC[i]) {
                fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n",
                    __func__, magic[0], magic[1], magic[2], magic[3]);
                gguf_free(ctx);
                return nullptr;
            }
        }
    }

    // header
    int64_t n_kv      = 0;
    int64_t n_tensors = 0;

    if (ok && gr.read(ctx->version)) {
        if (ctx->version == 1) {
            fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
            ok = false;
        }
        if (ctx->version > GGUF_VERSION) {
            fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                __func__, ctx->version, GGUF_VERSION);
            ok = false;
        }
    } else {
        ok = false;
    }

    if (ok && gr.read(n_tensors)) {
        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
        if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
            fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
                __func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
            ok = false;
        }
    } else {
        ok = false;
    }

    if (ok && gr.read(n_kv)) {
        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
        if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
            fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
                __func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
            ok = false;
        }
    } else {
        ok = false;
    }

    if (!ok) {
        fprintf(stderr, "%s: failed to read header\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }

    // KV pairs
    {
        for (int64_t i = 0; ok && i < n_kv; ++i) {
            std::string key;
            gguf_type   type     = gguf_type(-1);
            bool        is_array = false;
            uint64_t    n        = 1;

            try {
                ok = ok && gr.read(key);
            } catch (std::length_error &) {
                fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
                ok = false;
            } catch (std::bad_alloc &) {
                fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
                ok = false;
            }
            for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
                if (key == ctx->kv[j].key) {
                    fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 "\n", __func__, key.c_str(), j, i);
                    ok = false;
                }
            }
            if (!ok) {
                break;
            }

            ok = ok && gr.read(type);
            if (type == GGUF_TYPE_ARRAY) {
                is_array = true;
                ok = ok && gr.read(type);
                ok = ok && gr.read(n);
            }
            if (!ok) {
                break;
            }

            switch (type) {
                case GGUF_TYPE_UINT8:   ok = ok && gguf_read_emplace_helper<uint8_t>    (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_INT8:    ok = ok && gguf_read_emplace_helper<int8_t>     (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_UINT16:  ok = ok && gguf_read_emplace_helper<uint16_t>   (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_INT16:   ok = ok && gguf_read_emplace_helper<int16_t>    (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_UINT32:  ok = ok && gguf_read_emplace_helper<uint32_t>   (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_INT32:   ok = ok && gguf_read_emplace_helper<int32_t>    (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_FLOAT32: ok = ok && gguf_read_emplace_helper<float>      (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_BOOL:    ok = ok && gguf_read_emplace_helper<bool>       (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_STRING:  ok = ok && gguf_read_emplace_helper<std::string>(gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_UINT64:  ok = ok && gguf_read_emplace_helper<uint64_t>   (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_INT64:   ok = ok && gguf_read_emplace_helper<int64_t>    (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_FLOAT64: ok = ok && gguf_read_emplace_helper<double>     (gr, ctx->kv, key, is_array, n); break;
                case GGUF_TYPE_ARRAY:
                default:
                    {
                        fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
                        ok = false;
                    } break;
            }
        }

        if (!ok) {
            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
            gguf_free(ctx);
            return nullptr;
        }
        GGML_ASSERT(int64_t(ctx->kv.size()) == n_kv);

        const int alignment_idx = gguf_find_key(ctx, GGUF_KEY_GENERAL_ALIGNMENT);
        ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);

        if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
            fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
            gguf_free(ctx);
            return nullptr;
        }
    }

    // read the tensor info
    for (int64_t i = 0; ok && i < n_tensors; ++i) {
        struct gguf_tensor_info info;

        // tensor name
        {
            std::string name;
            try {
                ok = ok && gr.read(name);
            } catch (std::length_error &) {
                fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
                ok = false;
            } catch (std::bad_alloc &) {
                fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
                ok = false;
            }
            if (name.length() >= GGML_MAX_NAME) {
                fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
                ok = false;
                break;
            }
            ggml_set_name(&info.t, name.c_str());

            // make sure there are no duplicate tensor names
            for (int64_t j = 0; ok && j < i; ++j) {
                if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
                    fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n",
                        __func__, info.t.name, j, i);
                    ok = false;
                    break;
                }
            }
        }
        if (!ok) {
            break;
        }

        // tensor shape
        {
            uint32_t n_dims = -1;
            ok = ok && gr.read(n_dims);
            if (n_dims > GGML_MAX_DIMS) {
                fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
                    __func__, info.t.name, n_dims, GGML_MAX_DIMS);
                ok = false;
                break;
            }
            for (uint32_t j = 0; ok && j < GGML_MAX_DIMS; ++j) {
                info.t.ne[j] = 1;
                if (j < n_dims) {
                    ok = ok && gr.read(info.t.ne[j]);
                }

                // check that all ne are non-negative
                if (info.t.ne[j] < 0) {
                    fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
                        __func__, info.t.name, j, info.t.ne[j]);
                    ok = false;
                    break;
                }
            }

            // check that the total number of elements is representable
            if (ok && ((INT64_MAX/info.t.ne[1] <= info.t.ne[0]) ||
                       (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
                       (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {
                fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
                    "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
                    __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
                ok = false;
                break;
            }
        }
        if (!ok) {
            break;
        }

        // tensor type
        {
            ok = ok && gr.read(info.t.type);

            // check that tensor type is within defined range
            if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
                fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
                    __func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
                ok = false;
                break;
            }
            const size_t  type_size = ggml_type_size(info.t.type);
            const int64_t blck_size = ggml_blck_size(info.t.type);

            // check that row size is divisible by block size
            if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
                fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
                    "not a multiple of block size (%" PRId64 ")\n",
                    __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
                ok = false;
                break;
            }

            // calculate byte offsets given the tensor shape and type
            info.t.nb[0] = type_size;
            info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
            for (int j = 2; j < GGML_MAX_DIMS; ++j) {
                info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
            }
        }
        if (!ok) {
            break;
        }

        // tensor data offset within buffer
        ok = ok && gr.read(info.offset);

        ctx->info.push_back(info);
    }

    if (!ok) {
        fprintf(stderr, "%s: failed to read tensor info\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }
    GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);

    // we require the data section to be aligned, so take into account any padding
    if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
        fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }

    // store the current file offset - this is where the data section starts
    ctx->offset = ftell(file);

    // compute the total size of the data section, taking into account the alignment
    {
        ctx->size = 0;
        for (size_t i = 0; i < ctx->info.size(); ++i) {
            const gguf_tensor_info & ti = ctx->info[i];
            if (ti.offset != ctx->size) {
                fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
                    __func__, ti.t.name, ti.offset, ctx->size);
                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
                gguf_free(ctx);
                return nullptr;
            }
            ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
        }
    }

    // load the tensor data only if requested
    if (params.ctx != nullptr) {
        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
        // the ggml_tensor structs to the appropriate locations in the binary blob

        // compute the exact size needed for the new ggml_context
        const size_t mem_size =
            params.no_alloc ?
            (n_tensors    )*ggml_tensor_overhead() :
            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;

        struct ggml_init_params pdata = {
            /*mem_size   =*/ mem_size,
            /*mem_buffer =*/ nullptr,
            /*no_alloc   =*/ params.no_alloc,
        };

        *params.ctx = ggml_init(pdata);
        if (*params.ctx == nullptr) {
            fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
            gguf_free(ctx);
            return nullptr;
        }

        struct ggml_context * ctx_data = *params.ctx;

        struct ggml_tensor * data = nullptr;

        if (!params.no_alloc) {
            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

            ok = ok && data != nullptr;

            if (ok) {
                ggml_set_name(data, "GGUF tensor data binary blob");
            }

            // read the binary blob with the tensor data
            ok = ok && gr.read(data->data, ctx->size);

            if (!ok) {
                fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
                ggml_free(ctx_data);
                *params.ctx = nullptr;
                gguf_free(ctx);
                return nullptr;
            }

            ctx->data = data->data;
        }

        ggml_set_no_alloc(ctx_data, true);

        // create the tensors
        for (size_t i = 0; i < ctx->info.size(); ++i) {
            const struct gguf_tensor_info & info = ctx->info[i];

            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, info.t.type, GGML_MAX_DIMS, info.t.ne);

            ok = ok && cur != nullptr;

            if (!ok) {
                break;
            }

            ggml_set_name(cur, info.t.name);

            // point the data member to the appropriate location in the binary blob using the tensor info
            if (!params.no_alloc) {
                cur->data = (char *) data->data + info.offset;
            }
        }

        if (!ok) {
            fprintf(stderr, "%s: failed to create tensors\n", __func__);
            ggml_free(ctx_data);
            *params.ctx = nullptr;
            gguf_free(ctx);
            return nullptr;
        }

        ggml_set_no_alloc(ctx_data, params.no_alloc);
    }

    return ctx;
}

struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
    FILE * file = ggml_fopen(fname, "rb");
    if (!file) {
        fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
        return nullptr;
    }

    struct gguf_context * result = gguf_init_from_file_impl(file, params);
    fclose(file);
    return result;
}

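// --- Illustrative usage sketch (not part of the vendored gguf.cpp) ---
// A minimal example of how a caller might load only the metadata of a GGUF
// file with gguf_init_from_file. It assumes the public declarations from the
// gguf header are in scope; passing a null ggml_context pointer means no
// tensor data is loaded, as in the params.ctx check above.
//
//     struct gguf_init_params params = {
//         /*no_alloc =*/ true,
//         /*ctx      =*/ nullptr, // metadata only, no tensor data
//     };
//     struct gguf_context * gctx = gguf_init_from_file("model.gguf", params);
//     if (gctx) {
//         printf("GGUF v%u: %" PRIi64 " KV pairs, %" PRIi64 " tensors\n",
//             gguf_get_version(gctx), gguf_get_n_kv(gctx), gguf_get_n_tensors(gctx));
//         gguf_free(gctx);
//     }
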
void gguf_free(struct gguf_context * ctx) {
    if (ctx == nullptr) {
        return;
    }
    delete ctx;
}

const char * gguf_type_name(enum gguf_type type) {
    auto it = GGUF_TYPE_NAME.find(type);
    return it == GGUF_TYPE_NAME.end() ? nullptr : it->second;
}

uint32_t gguf_get_version(const struct gguf_context * ctx) {
    return ctx->version;
}

size_t gguf_get_alignment(const struct gguf_context * ctx) {
    return ctx->alignment;
}

size_t gguf_get_data_offset(const struct gguf_context * ctx) {
    return ctx->offset;
}

int64_t gguf_get_n_kv(const struct gguf_context * ctx) {
    return ctx->kv.size();
}

int64_t gguf_find_key(const struct gguf_context * ctx, const char * key) {
    // return -1 if key not found
    int64_t keyfound = -1;

    const int64_t n_kv = gguf_get_n_kv(ctx);

    for (int64_t i = 0; i < n_kv; ++i) {
        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
            keyfound = i;
            break;
        }
    }

    return keyfound;
}

const char * gguf_get_key(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    return ctx->kv[key_id].get_key().c_str();
}

enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    return ctx->kv[key_id].is_array ? GGUF_TYPE_ARRAY : ctx->kv[key_id].get_type();
}

enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].is_array);
    return ctx->kv[key_id].get_type();
}

const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
    return ctx->kv[key_id].data.data();
}

const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
    return ctx->kv[key_id].data_string[i].c_str();
}

size_t gguf_get_arr_n(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));

    if (ctx->kv[key_id].type == GGUF_TYPE_STRING) {
        return ctx->kv[key_id].data_string.size();
    }

    const size_t type_size = gguf_type_size(ctx->kv[key_id].type);
    GGML_ASSERT(ctx->kv[key_id].data.size() % type_size == 0);
    return ctx->kv[key_id].data.size() / type_size;
}

uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<uint8_t>();
}

int8_t gguf_get_val_i8(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<int8_t>();
}

uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<uint16_t>();
}

int16_t gguf_get_val_i16(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<int16_t>();
}

uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<uint32_t>();
}

int32_t gguf_get_val_i32(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<int32_t>();
}

float gguf_get_val_f32(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<float>();
}

uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<uint64_t>();
}

int64_t gguf_get_val_i64(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<int64_t>();
}

double gguf_get_val_f64(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<double>();
}

bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<bool>();
}

const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    return ctx->kv[key_id].get_val<std::string>().c_str();
}

const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
    GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
    return ctx->kv[key_id].data.data();
}

int64_t gguf_get_n_tensors(const struct gguf_context * ctx) {
    return ctx->info.size();
}

int64_t gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
    // return -1 if tensor not found
    int64_t tensor_id = -1;

    const int64_t n_tensors = gguf_get_n_tensors(ctx);

    for (int64_t i = 0; i < n_tensors; ++i) {
        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
            tensor_id = i;
            break;
        }
    }

    return tensor_id;
}

size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id) {
    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
    return ctx->info[tensor_id].offset;
}

const char * gguf_get_tensor_name(const struct gguf_context * ctx, int64_t tensor_id) {
    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
    return ctx->info[tensor_id].t.name;
}

enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int64_t tensor_id) {
    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
    return ctx->info[tensor_id].t.type;
}

size_t gguf_get_tensor_size(const struct gguf_context * ctx, int64_t tensor_id) {
    GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
    return ggml_nbytes(&ctx->info[tensor_id].t);
}

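// --- Illustrative usage sketch (not part of the vendored gguf.cpp) ---
// Enumerating tensor metadata with the accessors above, assuming `gctx` is a
// context previously returned by gguf_init_from_file:
//
//     for (int64_t i = 0; i < gguf_get_n_tensors(gctx); ++i) {
//         printf("%s: type=%s offset=%zu size=%zu\n",
//             gguf_get_tensor_name(gctx, i),
//             ggml_type_name(gguf_get_tensor_type(gctx, i)),
//             gguf_get_tensor_offset(gctx, i),
//             gguf_get_tensor_size(gctx, i));
//     }
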
int64_t gguf_remove_key(struct gguf_context * ctx, const char * key) {
    const int64_t key_id = gguf_find_key(ctx, key);
    if (key_id >= 0) {
        ctx->kv.erase(ctx->kv.begin() + key_id);
    }
    return key_id;
}

template<typename T>
static void gguf_check_reserved_keys(const std::string & key, const T val) {
    if (key == GGUF_KEY_GENERAL_ALIGNMENT) {
        if constexpr (std::is_same<T, uint32_t>::value) {
            GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
        } else {
            GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
        }
    }
}

void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, val);
}

void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
    gguf_check_reserved_keys(key, val);
    gguf_remove_key(ctx, key);
    ctx->kv.emplace_back(key, std::string(val));
}

void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n) {
    gguf_check_reserved_keys(key, data);
    gguf_remove_key(ctx, key);

    const size_t nbytes = n*gguf_type_size(type);
    std::vector<int8_t> tmp(nbytes);
    if (!tmp.empty()) {
        memcpy(tmp.data(), data, nbytes);
    }
    ctx->kv.emplace_back(key, tmp);
    ctx->kv.back().cast(type);
}

void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, size_t n) {
    gguf_check_reserved_keys(key, data);
    gguf_remove_key(ctx, key);

    std::vector<std::string> tmp(n);
    for (size_t i = 0; i < n; ++i) {
        tmp[i] = data[i];
    }
    ctx->kv.emplace_back(key, tmp);
}

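// --- Illustrative usage sketch (not part of the vendored gguf.cpp) ---
// Populating key-value pairs on a fresh context with the setters above; each
// setter first removes any existing pair with the same key, so repeated calls
// simply overwrite. The key names here are only example metadata keys.
//
//     struct gguf_context * out = gguf_init_empty();
//     gguf_set_val_str(out, "general.architecture", "llama");
//     gguf_set_val_u32(out, "general.alignment", 32); // reserved key, must be a power of 2
//     const char * langs[] = { "en", "de" };
//     gguf_set_arr_str(out, "general.languages", langs, 2);
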
// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src) {
    const int64_t n_kv = gguf_get_n_kv(src);
    for (int64_t i = 0; i < n_kv; ++i) {
        const struct gguf_kv & kv = src->kv[i];

        if (!kv.is_array) {
            switch (kv.get_type()) {
                case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, kv.get_key().c_str(), kv.get_val<uint8_t>());  break;
                case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, kv.get_key().c_str(), kv.get_val<int8_t>());   break;
                case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, kv.get_key().c_str(), kv.get_val<uint16_t>()); break;
                case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, kv.get_key().c_str(), kv.get_val<int16_t>());  break;
                case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, kv.get_key().c_str(), kv.get_val<uint32_t>()); break;
                case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, kv.get_key().c_str(), kv.get_val<int32_t>());  break;
                case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, kv.get_key().c_str(), kv.get_val<float>());    break;
                case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, kv.get_key().c_str(), kv.get_val<uint64_t>()); break;
                case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, kv.get_key().c_str(), kv.get_val<int64_t>());  break;
                case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, kv.get_key().c_str(), kv.get_val<double>());   break;
                case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, kv.get_key().c_str(), kv.get_val<bool>());     break;
                case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, kv.get_key().c_str(), kv.get_val<std::string>().c_str()); break;
                case GGUF_TYPE_ARRAY:
                default: GGML_ABORT("invalid type");
            }
            continue;
        }

        const size_t ne = kv.get_ne();

        switch (kv.get_type()) {
            case GGUF_TYPE_UINT8:
            case GGUF_TYPE_INT8:
            case GGUF_TYPE_UINT16:
            case GGUF_TYPE_INT16:
            case GGUF_TYPE_UINT32:
            case GGUF_TYPE_INT32:
            case GGUF_TYPE_FLOAT32:
            case GGUF_TYPE_UINT64:
            case GGUF_TYPE_INT64:
            case GGUF_TYPE_FLOAT64:
            case GGUF_TYPE_BOOL: {
                gguf_set_arr_data(ctx, kv.get_key().c_str(), kv.get_type(), kv.data.data(), ne);
            } break;
            case GGUF_TYPE_STRING: {
                std::vector<const char *> tmp(ne);
                for (size_t j = 0; j < ne; ++j) {
                    tmp[j] = kv.data_string[j].c_str();
                }
                gguf_set_arr_str(ctx, kv.get_key().c_str(), tmp.data(), ne);
            } break;
            case GGUF_TYPE_ARRAY:
            default: GGML_ABORT("invalid type");
        }
    }
}

void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    if (gguf_find_tensor(ctx, tensor->name) != -1) {
        GGML_ABORT("duplicate tensor name: %s", tensor->name);
    }

    struct gguf_tensor_info ti;
    ti.t = *tensor;
    ti.offset = ctx->info.empty() ? 0 :
        ctx->info.back().offset + GGML_PAD(ggml_nbytes(&ctx->info.back().t), ctx->alignment);
    ctx->info.push_back(ti);
}

void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
    const int64_t tensor_id = gguf_find_tensor(ctx, name);
    if (tensor_id < 0) {
        GGML_ABORT("tensor not found: %s", name);
    }
    struct ggml_tensor * tensor = &ctx->info[tensor_id].t;
    const size_t  type_size = ggml_type_size(type);
    const int64_t blck_size = ggml_blck_size(type);

    tensor->type = type;
    GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");

    tensor->nb[0] = type_size;
    tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
    for (int i = 2; i < GGML_MAX_DIMS; i++) {
        tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
    }

    // update offsets
    const int64_t n_tensors = gguf_get_n_tensors(ctx);
    for (int64_t i = tensor_id + 1; i < n_tensors; ++i) {
        ctx->info[i].offset = ctx->info[i - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[i - 1].t), ctx->alignment);
    }
}

void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data) {
    const int64_t tensor_id = gguf_find_tensor(ctx, name);
    if (tensor_id < 0) {
        GGML_ABORT("tensor not found: %s", name);
    }

    ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
}

struct gguf_writer {
    std::vector<int8_t> & buf;

    gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}

    template <typename T>
    void write(const T & val) const {
        for (size_t i = 0; i < sizeof(val); ++i) {
            buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
        }
    }

    void write(const std::vector<int8_t> & val) const {
        buf.insert(buf.end(), val.begin(), val.end());
    }

    void write(const bool & val) const {
        const int8_t val8 = val ? 1 : 0;
        write(val8);
    }

    void write(const std::string & val) const {
        {
            const uint64_t n = val.length();
            write(n);
        }
        for (size_t i = 0; i < val.length(); ++i) {
            buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
        }
    }

    void write(const char * val) const {
        write(std::string(val));
    }

    void write(const enum ggml_type & val) const {
        write(int32_t(val));
    }

    void write(const enum gguf_type & val) const {
        write(int32_t(val));
    }

    void write(const struct gguf_kv & kv) const {
        const uint64_t ne = kv.get_ne();

        write(kv.get_key());

        if (kv.is_array) {
            write(GGUF_TYPE_ARRAY);
            write(kv.get_type());
            write(ne);
        } else {
            write(kv.get_type());
        }

        switch (kv.get_type()) {
            case GGUF_TYPE_UINT8:
            case GGUF_TYPE_INT8:
            case GGUF_TYPE_UINT16:
            case GGUF_TYPE_INT16:
            case GGUF_TYPE_UINT32:
            case GGUF_TYPE_INT32:
            case GGUF_TYPE_FLOAT32:
            case GGUF_TYPE_UINT64:
            case GGUF_TYPE_INT64:
            case GGUF_TYPE_FLOAT64: {
                write(kv.data);
            } break;
            case GGUF_TYPE_BOOL: {
                for (size_t i = 0; i < ne; ++i) {
                    write(kv.get_val<bool>(i));
                }
            } break;
            case GGUF_TYPE_STRING: {
                for (size_t i = 0; i < ne; ++i) {
                    write(kv.get_val<std::string>(i));
                }
            } break;
            case GGUF_TYPE_ARRAY:
            default: GGML_ABORT("invalid type");
        }
    }

    void write_tensor_meta(const struct gguf_tensor_info & info) const {
        write(info.t.name);

        const uint32_t n_dims = ggml_n_dims(&info.t);
        write(n_dims);

        for (uint32_t j = 0; j < n_dims; ++j) {
            write(info.t.ne[j]);
        }
        write(info.t.type);
        write(info.offset);
    }

    void pad(const size_t alignment) const {
        while (buf.size() % alignment != 0) {
            const int8_t zero = 0;
            write(zero);
        }
    }

    void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) const {
        GGML_ASSERT(buf.size() - offset_data == info.offset);
        GGML_ASSERT(ggml_is_contiguous(&info.t));

        const size_t offset = buf.size();
        const size_t nbytes = ggml_nbytes(&info.t);

        buf.resize(offset + nbytes);
        if (info.t.buffer) {
            ggml_backend_tensor_get(&info.t, buf.data() + offset, 0, nbytes);
        } else {
            GGML_ASSERT(info.t.data);
            memcpy(buf.data() + offset, info.t.data, nbytes);
        }

        pad(alignment);
    }
};

void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
    const struct gguf_writer gw(buf);

    const int64_t n_kv      = gguf_get_n_kv(ctx);
    const int64_t n_tensors = gguf_get_n_tensors(ctx);

    // write header
    gw.write(GGUF_MAGIC[0]);
    gw.write(GGUF_MAGIC[1]);
    gw.write(GGUF_MAGIC[2]);
    gw.write(GGUF_MAGIC[3]);
    gw.write(ctx->version);
    gw.write(n_tensors);
    gw.write(n_kv);

    // write key-value pairs
    for (int64_t i = 0; i < n_kv; ++i) {
        gw.write(ctx->kv[i]);
    }

    // write tensor info
    for (int64_t i = 0; i < n_tensors; ++i) {
        gw.write_tensor_meta(ctx->info[i]);
    }

    // we require the data section to be aligned
    gw.pad(ctx->alignment);

    if (only_meta) {
        return;
    }

    const size_t offset_data = gw.buf.size();

    // write tensor data
    for (int64_t i = 0; i < n_tensors; ++i) {
        gw.write_tensor_data(ctx->info[i], offset_data, ctx->alignment);
    }
}

bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
    FILE * file = ggml_fopen(fname, "wb");

    if (!file) {
        fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
        return false;
    }

    std::vector<int8_t> buf;
    gguf_write_to_buf(ctx, buf, only_meta);
    const bool ok = fwrite(buf.data(), 1, buf.size(), file) == buf.size();
    fclose(file);
    return ok;
}

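// --- Illustrative usage sketch (not part of the vendored gguf.cpp) ---
// Writing a context back out: gguf_write_to_buf serializes the header, KV
// pairs, tensor info, and (optionally) tensor data into an in-memory buffer,
// which gguf_write_to_file then flushes with a single fwrite. Continuing the
// setter sketch above:
//
//     if (!gguf_write_to_file(out, "out.gguf", /*only_meta =*/ false)) {
//         fprintf(stderr, "failed to write out.gguf\n");
//     }
//     gguf_free(out);
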
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
    // only return size
    std::vector<int8_t> buf;
    gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
    return buf.size();
}

void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
    std::vector<int8_t> buf;
    gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
    memcpy(data, buf.data(), buf.size());
}