Unverified Commit d7d7e996 authored by Jeffrey Morgan, committed by GitHub

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
@@ -73,9 +73,9 @@ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     const float * s_d = (const float *)dst->src[5]->data;
 
     const int64_t B = dst->src[5]->ne[1];
-    const int64_t T = dst->src[0]->ne[3];
+    const int64_t T = dst->src[0]->ne[2];
     const int64_t C = dst->ne[0];
-    const int64_t H = dst->src[0]->ne[2];
+    const int64_t H = dst->src[0]->ne[1];
 
     float * dst_d = (float *)dst->data;
...
@@ -40,13 +40,20 @@ find_package(hip REQUIRED)
 find_package(hipblas REQUIRED)
 find_package(rocblas REQUIRED)
 
+if (${hip_VERSION} VERSION_LESS 5.5)
+    message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
+endif()
+
 message(STATUS "HIP and hipBLAS found")
 
+# Workaround old compilers
+set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024")
+
 file(GLOB   GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
 list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
 
 file(GLOB   GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
-file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
@@ -70,7 +77,9 @@ ggml_add_backend_library(ggml-hip
     )
 
 # TODO: do not use CUDA definitions for HIP
-target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+if (NOT GGML_BACKEND_DL)
+    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+endif()
 
 add_compile_definitions(GGML_USE_HIP)
@@ -90,6 +99,18 @@ if (GGML_CUDA_NO_PEER_COPY)
     add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()
 
+if (GGML_HIP_GRAPHS)
+    add_compile_definitions(GGML_HIP_GRAPHS)
+endif()
+
+if (GGML_HIP_NO_VMM)
+    add_compile_definitions(GGML_HIP_NO_VMM)
+endif()
+
+if (NOT GGML_CUDA_FA)
+    add_compile_definitions(GGML_CUDA_NO_FA)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)
...
@@ -3,6 +3,8 @@
 // GGML internal header
 
 #include "ggml.h"
+#include "gguf.h"
+
 #include <assert.h>
 #include <math.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -14,7 +16,7 @@
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 
-#if defined(__ARM_NEON) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
@@ -551,22 +553,15 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
-// expose GGUF internals for test code
-GGML_API size_t gguf_type_size(enum gguf_type type);
-GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
-
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-GGML_API struct gguf_buf gguf_buf_init(size_t size);
-GGML_API void gguf_buf_free(struct gguf_buf buf);
-
-GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta);
-
 #ifdef __cplusplus
 }
 #endif
+
+#ifdef __cplusplus
+#include <vector>
+
+// expose GGUF internals for test code
+GGML_API size_t gguf_type_size(enum gguf_type type);
+GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
+GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+#endif // __cplusplus
@@ -477,7 +477,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()
 
-//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -512,7 +511,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
     0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
 GGML_TABLE_END()
-//#endif
 
 GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
@@ -2513,24 +2511,33 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
 template <typename type4x4>
 void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
     const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const uint16_t * ql = (device const uint16_t *)xb->ql;
+    device const uint16_t * qh = (device const uint16_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;
 
-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
+    ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1);
+    qh = qh + 16*(il/8) + 8*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
 
-    const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
-    const float coef = il>1 ? 1.f/16.f : 1.f;
+    const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303);
+    const uint32_t kmask2 = il>1 ? 0xF0F0F0F0 : 0x0F0F0F0F;
     const float ml = d_all * sc * 32.f;
-    const float dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
+    const float dl0 = d_all * sc;
+    const float dl1 = dl0 / 256.f;
+    const float dl2 = dl0 / (256.f * 256.f);
+    const float dl3 = dl0 / (256.f * 256.f * 256.f);
+
+    const uint8_t shr_h = il>2 ? 2 : 0;
+    const uint8_t shl_h = il>1 ? 0 : (il>0 ? 2 : 4);
+    const uint8_t shr_l = il>1 ? 4 : 0;
+
+    for (int i = 0; i < 4; ++i) {
+        const uint32_t low  = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2;
+        const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1;
+        const uint32_t q = ((high << shl_h) >> shr_h) | (low >> shr_l);
+
+        reg[i][0] = dl0 * ((half)(q & 0xFF)) - ml;
+        reg[i][1] = dl1 * ((float)(q & 0xFF00)) - ml;
+        reg[i][2] = dl2 * ((float)(q & 0xFF0000)) - ml;
+        reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml;
     }
 }
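A quick aside on the arithmetic in the new dequantize_q6_K: the dl1/dl2/dl3 factors fold the byte shifts into the scale, because dividing dl0 by 256^k and multiplying by the masked-but-unshifted 32-bit word gives the same value as shifting byte k down first and scaling it by dl0. A standalone C++ sketch of that identity (not part of the diff; the constants are made up):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t q   = 0xA1B2C3D4u; // four packed quant bytes, arbitrary example
    const float    dl0 = 0.125f;      // stands in for d_all * sc
    const float    dl1 = dl0 / 256.f;
    const float    dl2 = dl0 / (256.f * 256.f);
    const float    dl3 = dl0 / (256.f * 256.f * 256.f);

    // scaling the masked-but-unshifted word by dl0 / 256^k ...
    const float a1 = dl1 * (float)(q & 0xFF00);
    const float a2 = dl2 * (float)(q & 0xFF0000);
    const float a3 = dl3 * (float)(q & 0xFF000000);

    // ... equals extracting byte k first and scaling it by dl0
    assert(a1 == dl0 * (float)((q >>  8) & 0xFF));
    assert(a2 == dl0 * (float)((q >> 16) & 0xFF));
    assert(a3 == dl0 * (float)((q >> 24) & 0xFF));

    printf("bytes scaled by dl0: %g %g %g %g\n", dl0 * (float)(q & 0xFF), a1, a2, a3);
    return 0;
}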
@@ -3198,7 +3205,7 @@ kernel void kernel_soft_max(
     }
 
     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);
 
     float sum = simd_sum(lsum);
@@ -3303,7 +3310,7 @@ kernel void kernel_soft_max_4(
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
 
     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);
 
     float sum = simd_sum(lsum);
@@ -6517,6 +6524,49 @@ kernel void kernel_cpy_f32_iq4_nl(
     }
 }
 
+template<typename T4x4, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
+kernel void kernel_cpy_q_f32(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+
+    device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
+    device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < args.ne00/16; i00 += ntg.x) {
+        T4x4 temp;
+        dequantize_func(src_data + i00/nl, i00%nl, temp);
+        dst_data[i00] = temp;
+    }
+}
+
+typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
+
+template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
+
+template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
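Worth spelling out how the copy kernel above addresses memory: it flattens the (i01, i02, i03) source row into a linear element index n and then re-expands n in the destination's extents, so source and destination may have different shapes as long as the total element counts match. A standalone C++ sketch of that round trip (not part of the diff; the extents are made up and only mirror the index math):

#include <cassert>
#include <cstdint>

int main() {
    // made-up source extents (ne00..ne03) and destination extents (ne0..ne3); same total element count
    const int64_t ne00 = 32, ne01 = 4, ne02 = 3, ne03 = 2;
    const int64_t ne0  = 16, ne1  = 8, ne2  = 3, ne3  = 2;

    for (int64_t i03 = 0; i03 < ne03; ++i03)
    for (int64_t i02 = 0; i02 < ne02; ++i02)
    for (int64_t i01 = 0; i01 < ne01; ++i01) {
        // flat index of the first element of the source row, as in kernel_cpy_q_f32
        const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

        // decompose n into destination coordinates
        const int64_t i3 = n/(ne2*ne1*ne0);
        const int64_t i2 = (n - i3*ne2*ne1*ne0)/(ne1*ne0);
        const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0)/ne0;
        const int64_t i0 =  n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0;

        // recomposing must give back the same flat index, and every coordinate stays in range
        assert(n == i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0);
        assert(i0 >= 0 && i0 < ne0 && i1 < ne1 && i2 < ne2 && i3 < ne3);
    }
    return 0;
}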
 kernel void kernel_concat(
     constant ggml_metal_kargs_concat & args,
     device const char * src0,
@@ -6601,7 +6651,6 @@ void kernel_mul_mv_q2_K_f32_impl(
         device const half * dh = &x[ib].d;
 
         for (int row = 0; row < N_DST; row++) {
             float4 acc1 = {0.f, 0.f, 0.f, 0.f};
             float4 acc2 = {0.f, 0.f, 0.f, 0.f};
             for (int i = 0; i < 8; i += 2) {
@@ -6632,7 +6681,7 @@ void kernel_mul_mv_q2_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
@@ -6798,7 +6847,7 @@ void kernel_mul_mv_q3_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
     if (tiisg == 0) {
-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
             dst_f32[first_row + row] = sumf1[row];
         }
     }
@@ -6914,7 +6963,7 @@ void kernel_mul_mv_q4_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
@@ -7046,7 +7095,7 @@ void kernel_mul_mv_q5_K_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = tot;
@@ -7091,6 +7140,10 @@ void kernel_mul_mv_q6_K_f32_impl(
     const int row = 2*r0 + sgitg;
 
+    if (row >= args.ne0) {
+        return;
+    }
+
     const uint i12 = im%args.ne12;
     const uint i13 = im/args.ne12;
@@ -7246,7 +7299,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;
@@ -7364,7 +7417,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;
@@ -7474,7 +7527,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.5f;
@@ -7586,7 +7639,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
@@ -7699,7 +7752,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum * 0.25f;
@@ -7799,7 +7852,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
@@ -7894,7 +7947,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
@@ -7984,7 +8037,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
@@ -8073,7 +8126,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
     device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 
-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst_f32[first_row + row] = all_sum;
...
@@ -19,7 +19,17 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#define UNUSED(x) (void)(x)
+#ifndef TARGET_OS_VISION
+#define TARGET_OS_VISION 0
+#endif
+
+// create residency sets only on macOS >= 15.0
+#if !TARGET_CPU_X86_64 && TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
+    TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \
+    TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
+#define GGML_METAL_HAS_RESIDENCY_SETS 1
+#endif
 // globals
@@ -39,6 +49,7 @@ static struct ggml_backend_metal_device_context {
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
+    bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;
@@ -48,6 +59,7 @@ static struct ggml_backend_metal_device_context {
     /*.mtl_device_ref_count =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm =*/ false,
+    /*.has_residency_sets =*/ false,
     /*.has_bfloat =*/ false,
     /*.use_bfloat =*/ false,
     /*.name =*/ "",
@@ -59,12 +71,18 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
+    }
 
+    if (ctx->mtl_device) {
         ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
         ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
         ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
+#endif
+
         ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
         ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
@@ -90,8 +108,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     ctx->mtl_device_ref_count--;
 
     if (ctx->mtl_device_ref_count == 0) {
-        [ctx->mtl_device release];
-        ctx->mtl_device = nil;
+        if (ctx->mtl_device) {
+            [ctx->mtl_device release];
+            ctx->mtl_device = nil;
+        }
     }
 }
@@ -388,6 +408,16 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
     GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16,
+    GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16,
     GGML_METAL_KERNEL_TYPE_CONCAT,
    GGML_METAL_KERNEL_TYPE_SQR,
     GGML_METAL_KERNEL_TYPE_SQRT,
@@ -484,6 +514,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->queue = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     id<MTLLibrary> metal_library;
@@ -650,6 +685,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
+    GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
     GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
     GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
     GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -988,6 +1024,16 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32, cpy_q4_0_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16, cpy_q4_0_f16, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32, cpy_q4_1_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16, cpy_q4_1_f16, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32, cpy_q5_0_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16, cpy_q5_0_f16, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32, cpy_q5_1_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16, cpy_q5_1_f16, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32, cpy_q8_0_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16, cpy_q8_0_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
@@ -1037,8 +1083,70 @@ struct ggml_backend_metal_buffer_context {
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // optional MTLResidencySet
+    id rset;
 };
 
+// rset init
+static bool ggml_backend_metal_buffer_rset_init(
+        struct ggml_backend_metal_buffer_context * ctx,
+        struct ggml_backend_metal_device_context * ctx_dev,
+        id<MTLDevice> device) {
+    ctx->rset = nil;
+
+    if (!ctx_dev->has_residency_sets) {
+        return true;
+    }
+
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
+        desc.label = @"ggml_backend_metal";
+        desc.initialCapacity = ctx->n_buffers;
+
+        NSError * error;
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
+        if (error) {
+            GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            [desc release];
+            return false;
+        }
+
+        [desc release];
+
+        for (int i = 0; i < ctx->n_buffers; i++) {
+            [ctx->rset addAllocation:ctx->buffers[i].metal];
+        }
+
+        [ctx->rset commit];
+        [ctx->rset requestResidency];
+
+        return true;
+    }
+#else
+    GGML_UNUSED(ctx_dev);
+    GGML_UNUSED(device);
+#endif
+
+    return true;
+}
+
+// rset free
+static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+        if (ctx->rset) {
+            [ctx->rset endResidency];
+            [ctx->rset removeAllAllocations];
+            [ctx->rset release];
+        }
+    }
+#else
+    GGML_UNUSED(ctx);
+#endif
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -1122,12 +1230,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
         case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_GROUP_NORM:
-            return has_simdgroup_reduction;
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
         case GGML_OP_RMS_NORM:
-            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
         case GGML_OP_ARGMAX:
-        case GGML_OP_NORM:
             return true;
+        case GGML_OP_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
         case GGML_OP_ROPE:
             {
                 const int mode = ((const int32_t *) op->op_params)[2];
@@ -1201,6 +1310,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                 default:
                     return false;
             }
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            switch (op->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
         default:
             return false;
     };
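Read as a plain decision table, the new case block says: a GGML_OP_CPY whose source is Q4_0, Q4_1, Q5_0, Q5_1 or Q8_0 is supported when the destination is F32 or F16, and rejected otherwise. A standalone C++ restatement of that rule (not part of the diff; the enum below is a local stand-in, not the real ggml_type):

#include <cstdio>

enum Type { F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0 }; // illustration only

// mirrors the quantized-source branch added to ggml_metal_supports_op for GGML_OP_CPY
static bool supports_quantized_cpy(Type src, Type dst) {
    switch (src) {
        case Q4_0: case Q4_1: case Q5_0: case Q5_1: case Q8_0:
            return dst == F32 || dst == F16;
        default:
            return false; // other source types are handled by the pre-existing cases
    }
}

int main() {
    printf("Q4_0 -> F16 : %s\n", supports_quantized_cpy(Q4_0, F16) ? "yes" : "no"); // yes
    printf("Q8_0 -> BF16: %s\n", supports_quantized_cpy(Q8_0, BF16) ? "yes" : "no"); // no
    return 0;
}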
@@ -1897,7 +2018,7 @@ static void ggml_metal_encode_node(
             const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
             // TODO: add ggml_metal_kargs struct
-            // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
+            // TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
             [encoder setComputePipelineState:pipeline];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             if (id_src1) {
@@ -3843,10 +3964,6 @@ static void ggml_metal_encode_node(
         case GGML_OP_CPY:
         case GGML_OP_CONT:
             {
-                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-
-                int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
                 id<MTLComputePipelineState> pipeline = nil;
 
                 switch (src0t) {
@@ -3880,7 +3997,47 @@ static void ggml_metal_encode_node(
                        switch (dstt) {
                            case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_F32].pipeline; break;
                            case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_BF16_BF16].pipeline; break;
-                           default: GGML_ASSERT(false && "not implemented");
+                           default: GGML_ABORT("not implemented");
+                       };
+                   } break;
+               case GGML_TYPE_Q4_0:
+                   {
+                       switch (dstt) {
+                           case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32].pipeline; break;
+                           case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16].pipeline; break;
+                           default: GGML_ABORT("not implemented");
+                       };
+                   } break;
+               case GGML_TYPE_Q4_1:
+                   {
+                       switch (dstt) {
+                           case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32].pipeline; break;
+                           case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16].pipeline; break;
+                           default: GGML_ABORT("not implemented");
+                       };
+                   } break;
+               case GGML_TYPE_Q5_0:
+                   {
+                       switch (dstt) {
+                           case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32].pipeline; break;
+                           case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16].pipeline; break;
+                           default: GGML_ABORT("not implemented");
+                       };
+                   } break;
+               case GGML_TYPE_Q5_1:
+                   {
+                       switch (dstt) {
+                           case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32].pipeline; break;
+                           case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16].pipeline; break;
+                           default: GGML_ABORT("not implemented");
+                       };
+                   } break;
+               case GGML_TYPE_Q8_0:
+                   {
+                       switch (dstt) {
+                           case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32].pipeline; break;
+                           case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16].pipeline; break;
+                           default: GGML_ABORT("not implemented");
                        };
                    } break;
                default: GGML_ABORT("not implemented");
@@ -3910,7 +4067,11 @@ static void ggml_metal_encode_node(
            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
 
+           GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+           int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+
            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
        } break;
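Moving GGML_ASSERT and the nth computation after the buffer binding does not change the sizing: each threadgroup still covers one source row, with one thread per quantization block and at most 1024 threads. A standalone C++ sketch of that arithmetic (not part of the diff; the row length is assumed, and 32 is the Q4_0 block size):

#include <algorithm>
#include <cstdio>

int main() {
    // assumed values for illustration: row length ne00 and the Q4_0 block size (QK4_0 = 32)
    const int ne00       = 4096;
    const int block_size = 32;

    // mirrors: GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
    //          int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
    if (ne00 % block_size != 0) {
        return 1;
    }
    const int nth = std::min(1024, ne00 / block_size);

    printf("threads per threadgroup: %d\n", nth); // 128 for this row length
    return 0;
}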
        case GGML_OP_SET:
            {
@@ -4209,6 +4370,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     for (int i = 0; i < ctx->n_buffers; i++) {
         [ctx->buffers[i].metal release];
     }
 
+    ggml_backend_metal_buffer_rset_free(ctx);
+
     ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
@@ -4232,19 +4395,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     memset((char *)tensor->data + offset, value, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4254,7 +4417,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     }
 
     return false;
 
-    UNUSED(buffer);
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4280,7 +4443,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4304,8 +4467,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
     }
 #endif
 #endif
 
-    UNUSED(device);
-    UNUSED(size_aligned);
+    GGML_UNUSED(device);
+    GGML_UNUSED(size_aligned);
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4318,7 +4481,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -4341,7 +4505,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(buft->device->context);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
@@ -4352,7 +4523,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4362,13 +4533,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
     return max_size;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4391,7 +4562,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal_Mapped";
 
-    UNUSED(buft);
+    GGML_UNUSED(buft);
 }
 
 static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4434,7 +4605,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+    struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -4487,6 +4659,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4495,7 +4674,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
-    UNUSED(backend);
+    GGML_UNUSED(backend);
 }
 
 static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4800,6 +4979,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
         }
     }
 
+    if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
+        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
+        free(ctx);
+        ggml_backend_metal_device_rel(ctx_dev);
+        return NULL;
+    }
+
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -4813,7 +4999,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
     return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
            buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
 
-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
...
@@ -373,24 +373,33 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
 template <typename type4x4>
 void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
     const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const uint16_t * ql = (device const uint16_t *)xb->ql;
+    device const uint16_t * qh = (device const uint16_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;
 
-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
+    ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1);
+    qh = qh + 16*(il/8) + 8*(il&1);
     float sc = scales[(il%2) + 2 * ((il/2))];
     il = (il/2) & 3;
 
-    const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
-    const float coef = il>1 ? 1.f/16.f : 1.f;
+    const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303);
+    const uint32_t kmask2 = il>1 ? 0xF0F0F0F0 : 0x0F0F0F0F;
     const float ml = d_all * sc * 32.f;
-    const float dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
+    const float dl0 = d_all * sc;
+    const float dl1 = dl0 / 256.f;
+    const float dl2 = dl0 / (256.f * 256.f);
+    const float dl3 = dl0 / (256.f * 256.f * 256.f);
+
+    const uint8_t shr_h = il>2 ? 2 : 0;
+    const uint8_t shl_h = il>1 ? 0 : (il>0 ? 2 : 4);
+    const uint8_t shr_l = il>1 ? 4 : 0;
+
+    for (int i = 0; i < 4; ++i) {
+        const uint32_t low  = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2;
+        const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1;
+        const uint32_t q = ((high << shl_h) >> shr_h) | (low >> shr_l);
+
+        reg[i][0] = dl0 * ((half)(q & 0xFF)) - ml;
+        reg[i][1] = dl1 * ((float)(q & 0xFF00)) - ml;
+        reg[i][2] = dl2 * ((float)(q & 0xFF0000)) - ml;
+        reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml;
    }
 }
@@ -1058,7 +1067,7 @@ kernel void kernel_soft_max(
     }
 
     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);
 
     float sum = simd_sum(lsum);
@@ -1163,7 +1172,7 @@ kernel void kernel_soft_max_4(
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
 
     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);
 
     float sum = simd_sum(lsum);
@@ -4377,6 +4386,49 @@ kernel void kernel_cpy_f32_iq4_nl(
     }
 }
 
+template<typename T4x4, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
+kernel void kernel_cpy_q_f32(
+        constant ggml_metal_kargs_cpy & args,
+        device const char * src0,
+        device       char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+
+    device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
+    device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < args.ne00/16; i00 += ntg.x) {
+        T4x4 temp;
+        dequantize_func(src_data + i00/nl, i00%nl, temp);
+        dst_data[i00] = temp;
+    }
+}
+
+typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
+
+template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
+
+template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
 kernel void kernel_concat(
     constant ggml_metal_kargs_concat & args,
     device const char * src0,
@@ -4461,7 +4513,6 @@ void kernel_mul_mv_q2_K_f32_impl(
         device const half * dh = &x[ib].d;
 
         for (int row = 0; row < N_DST; row++) {
             float4 acc1 = {0.f, 0.f, 0.f, 0.f};
             float4 acc2 = {0.f, 0.f, 0.f, 0.f};
             for (int i = 0; i < 8; i += 2) {
...@@ -4492,7 +4543,7 @@ void kernel_mul_mv_q2_K_f32_impl( ...@@ -4492,7 +4543,7 @@ void kernel_mul_mv_q2_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
...@@ -4658,7 +4709,7 @@ void kernel_mul_mv_q3_K_f32_impl( ...@@ -4658,7 +4709,7 @@ void kernel_mul_mv_q3_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
if (tiisg == 0) { if (tiisg == 0) {
for (int row = 0; row < 2; ++row) { for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
dst_f32[first_row + row] = sumf1[row]; dst_f32[first_row + row] = sumf1[row];
} }
} }
...@@ -4774,7 +4825,7 @@ void kernel_mul_mv_q4_K_f32_impl( ...@@ -4774,7 +4825,7 @@ void kernel_mul_mv_q4_K_f32_impl(
device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
...@@ -4906,7 +4957,7 @@ void kernel_mul_mv_q5_K_f32_impl( ...@@ -4906,7 +4957,7 @@ void kernel_mul_mv_q5_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < 2; ++row) { for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
const float tot = simd_sum(sumf[row]); const float tot = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = tot; dst_f32[first_row + row] = tot;
...@@ -4951,6 +5002,10 @@ void kernel_mul_mv_q6_K_f32_impl( ...@@ -4951,6 +5002,10 @@ void kernel_mul_mv_q6_K_f32_impl(
const int row = 2*r0 + sgitg; const int row = 2*r0 + sgitg;
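// rows are processed in pairs (row = 2*r0 + sgitg), so the final threadgroup can point one past the last output row; the guard below skips that case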
if (row >= args.ne0) {
return;
}
const uint i12 = im%args.ne12; const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12; const uint i13 = im/args.ne12;
...@@ -5106,7 +5161,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl( ...@@ -5106,7 +5161,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f; dst_f32[first_row + row] = all_sum * 0.25f;
...@@ -5224,7 +5279,7 @@ void kernel_mul_mv_iq2_xs_f32_impl( ...@@ -5224,7 +5279,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f; dst_f32[first_row + row] = all_sum * 0.25f;
...@@ -5334,7 +5389,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl( ...@@ -5334,7 +5389,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.5f; dst_f32[first_row + row] = all_sum * 0.5f;
...@@ -5446,7 +5501,7 @@ void kernel_mul_mv_iq3_s_f32_impl( ...@@ -5446,7 +5501,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
...@@ -5559,7 +5614,7 @@ void kernel_mul_mv_iq2_s_f32_impl( ...@@ -5559,7 +5614,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f; dst_f32[first_row + row] = all_sum * 0.25f;
...@@ -5659,7 +5714,7 @@ void kernel_mul_mv_iq1_s_f32_impl( ...@@ -5659,7 +5714,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
...@@ -5754,7 +5809,7 @@ void kernel_mul_mv_iq1_m_f32_impl( ...@@ -5754,7 +5809,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST; ++row) { for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
...@@ -5844,7 +5899,7 @@ void kernel_mul_mv_iq4_nl_f32_impl( ...@@ -5844,7 +5899,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) { for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
...@@ -5933,7 +5988,7 @@ void kernel_mul_mv_iq4_xs_f32_impl( ...@@ -5933,7 +5988,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < 2; ++row) { for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
all_sum = simd_sum(sumf[row]); all_sum = simd_sum(sumf[row]);
if (tiisg == 0) { if (tiisg == 0) {
dst_f32[first_row + row] = all_sum; dst_f32[first_row + row] = all_sum;
......
...@@ -128,6 +128,10 @@ static void ggml_print_backtrace_symbols(void) { ...@@ -128,6 +128,10 @@ static void ggml_print_backtrace_symbols(void) {
#endif #endif
static void ggml_print_backtrace(void) { static void ggml_print_backtrace(void) {
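// setting GGML_NO_BACKTRACE in the environment (to any value) skips the debugger attach below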
const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
if (GGML_NO_BACKTRACE) {
return;
}
char attach[32]; char attach[32];
snprintf(attach, sizeof(attach), "attach %d", getpid()); snprintf(attach, sizeof(attach), "attach %d", getpid());
int pid = fork(); int pid = fork();
...@@ -236,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi ...@@ -236,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
void * ggml_aligned_malloc(size_t size) { void * ggml_aligned_malloc(size_t size) {
#if defined(__s390x__)
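// s390x cache lines are 256 bytes wide, so use a larger allocation alignment there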
const int alignment = 256;
#else
const int alignment = 64; const int alignment = 64;
#endif
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
return _aligned_malloc(size, alignment); return _aligned_malloc(size, alignment);
...@@ -969,6 +977,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { ...@@ -969,6 +977,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GET_REL_POS", "GET_REL_POS",
"ADD_REL_POS", "ADD_REL_POS",
"RWKV_WKV6", "RWKV_WKV6",
"GATED_LINEAR_ATTN",
"UNARY", "UNARY",
...@@ -988,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { ...@@ -988,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
...@@ -1066,6 +1075,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { ...@@ -1066,6 +1075,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"get_rel_pos(x)", "get_rel_pos(x)",
"add_rel_pos(x)", "add_rel_pos(x)",
"rwkv_wkv6(k, v, r, tf, td, s)", "rwkv_wkv6(k, v, r, tf, td, s)",
"gated_linear_attn(k, v, q, gate, s)",
"unary(x)", "unary(x)",
...@@ -1085,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { ...@@ -1085,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
...@@ -1375,7 +1385,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso ...@@ -1375,7 +1385,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso
(t0->nb[3] == t1->nb[3]); (t0->nb[3] == t1->nb[3]);
} }
// check if t1 can be represented as a repeatition of t0 // check if t1 can be represented as a repetition of t0
bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
...@@ -1590,15 +1600,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( ...@@ -1590,15 +1600,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
#ifdef __clang__
// temporary until ggml_tensor::backend is removed
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
*result = (struct ggml_tensor) { *result = (struct ggml_tensor) {
/*.type =*/ type, /*.type =*/ type,
/*.backend =*/ GGML_BACKEND_TYPE_CPU,
/*.buffer =*/ NULL, /*.buffer =*/ NULL,
/*.ne =*/ { 1, 1, 1, 1 }, /*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 }, /*.nb =*/ { 0, 0, 0, 0 },
...@@ -1614,10 +1617,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( ...@@ -1614,10 +1617,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.padding =*/ { 0 }, /*.padding =*/ { 0 },
}; };
#ifdef __clang__
#pragma clang diagnostic pop
#endif
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
//GGML_ASSERT_ALIGNED(result->data); //GGML_ASSERT_ALIGNED(result->data);
...@@ -3461,12 +3460,14 @@ struct ggml_tensor * ggml_soft_max_ext( ...@@ -3461,12 +3460,14 @@ struct ggml_tensor * ggml_soft_max_ext(
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false); return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
} }
// ggml_soft_max_back // ggml_soft_max_ext_back
static struct ggml_tensor * ggml_soft_max_back_impl( static struct ggml_tensor * ggml_soft_max_ext_back_impl(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
float scale,
float max_bias,
bool inplace) { bool inplace) {
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
...@@ -3474,21 +3475,28 @@ static struct ggml_tensor * ggml_soft_max_back_impl( ...@@ -3474,21 +3475,28 @@ static struct ggml_tensor * ggml_soft_max_back_impl(
result->src[0] = a; result->src[0] = a;
result->src[1] = b; result->src[1] = b;
memcpy((float *) result->op_params + 0, &scale, sizeof(float));
memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
return result; return result;
} }
struct ggml_tensor * ggml_soft_max_back( struct ggml_tensor * ggml_soft_max_ext_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b) { struct ggml_tensor * b,
return ggml_soft_max_back_impl(ctx, a, b, false); float scale,
float max_bias) {
return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
} }
struct ggml_tensor * ggml_soft_max_back_inplace( struct ggml_tensor * ggml_soft_max_ext_back_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b) { struct ggml_tensor * b,
return ggml_soft_max_back_impl(ctx, a, b, true); float scale,
float max_bias) {
return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
} }
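// Illustrative sketch (not the vendored implementation): the math encoded by ggml_soft_max_ext_back.
// For y = softmax(scale * x), the gradient w.r.t. x is the usual Jacobian-vector product,
// dx_i = scale * y_i * (dy_i - sum_j dy_j * y_j). The backward pass reads scale and max_bias back out
// of the forward node's op_params (see the GGML_OP_SOFT_MAX case in ggml_compute_backward below) and
// forwards them to this op.
static void soft_max_back_row_sketch(float * dx, const float * dy, const float * y, int n, float scale) {
    float dot = 0.0f;
    for (int i = 0; i < n; ++i) {
        dot += dy[i]*y[i];                     // sum_j dy_j * y_j
    }
    for (int i = 0; i < n; ++i) {
        dx[i] = scale * y[i] * (dy[i] - dot);
    }
}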
// ggml_rope // ggml_rope
...@@ -3706,7 +3714,7 @@ void ggml_rope_yarn_corr_dims( ...@@ -3706,7 +3714,7 @@ void ggml_rope_yarn_corr_dims(
// ggml_rope_back // ggml_rope_back
struct ggml_tensor * ggml_rope_back( struct ggml_tensor * ggml_rope_ext_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
...@@ -3720,29 +3728,32 @@ struct ggml_tensor * ggml_rope_back( ...@@ -3720,29 +3728,32 @@ struct ggml_tensor * ggml_rope_back(
float attn_factor, float attn_factor,
float beta_fast, float beta_fast,
float beta_slow) { float beta_slow) {
GGML_ASSERT(ggml_is_vector(b)); struct ggml_tensor * result = ggml_rope_ext(
GGML_ASSERT(b->type == GGML_TYPE_I32); ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
GGML_ASSERT(a->ne[2] == b->ne[0]); result->op = GGML_OP_ROPE_BACK;
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result; return result;
} }
struct ggml_tensor * ggml_rope_multi_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
struct ggml_tensor * result = ggml_rope_multi(
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
result->op = GGML_OP_ROPE_BACK;
return result;
}
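// Conceptual sketch (not the vendored implementation; assumes <math.h>): the backward of a rotary
// embedding is the same rotation applied with the angle negated, which is why ggml_rope_ext_back and
// ggml_rope_multi_back can reuse the forward constructors and only flip result->op to
// GGML_OP_ROPE_BACK for the backends to dispatch on.
static void rope_pair_sketch(float * x0, float * x1, float theta) {
    const float c = cosf(theta);
    const float s = sinf(theta);
    const float v0 = *x0;
    const float v1 = *x1;
    *x0 = v0*c - v1*s; // forward rotation; calling this again with -theta undoes it (up to rounding)
    *x1 = v0*s + v1*c;
}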
// ggml_clamp // ggml_clamp
struct ggml_tensor * ggml_clamp( struct ggml_tensor * ggml_clamp(
...@@ -4661,15 +4672,13 @@ struct ggml_tensor * ggml_rwkv_wkv6( ...@@ -4661,15 +4672,13 @@ struct ggml_tensor * ggml_rwkv_wkv6(
GGML_ASSERT(ggml_is_contiguous(state)); GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S = k->ne[0]; const int64_t S = k->ne[0];
const int64_t H = k->ne[2]; const int64_t H = k->ne[1];
const int64_t n_tokens = k->ne[3]; const int64_t n_tokens = k->ne[2];
const int64_t n_seqs = state->ne[1]; const int64_t n_seqs = state->ne[1];
{ {
GGML_ASSERT(k->ne[1] == 1); GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens); GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens);
GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens); GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens);
// TODO: RWKV v4 and v5
GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
} }
...@@ -4688,6 +4697,49 @@ struct ggml_tensor * ggml_rwkv_wkv6( ...@@ -4688,6 +4697,49 @@ struct ggml_tensor * ggml_rwkv_wkv6(
return result; return result;
} }
// ggml_gated_linear_attn
struct ggml_tensor * ggml_gated_linear_attn(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * q,
struct ggml_tensor * g,
struct ggml_tensor * state,
float scale) {
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S = k->ne[0];
const int64_t H = k->ne[1];
const int64_t n_tokens = k->ne[2];
const int64_t n_seqs = state->ne[1];
{
GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens);
GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
}
// concat output and new_state
const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_f32(result, 0, scale);
result->op = GGML_OP_GATED_LINEAR_ATTN;
result->src[0] = k;
result->src[1] = v;
result->src[2] = q;
result->src[3] = g;
result->src[4] = state;
return result;
}
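// Hypothetical caller-side sketch (assumes ggml.h): building a gated linear attention node.
// Shapes follow the asserts above: k, v, q and g are [S, H, n_tokens] and state holds S*S*H
// elements per sequence; the result concatenates the per-token outputs (n_tokens rows of S*H)
// with the flattened updated state (S*n_seqs further rows). The scale value is a made-up example.
static struct ggml_tensor * build_gla_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * q,
        struct ggml_tensor  * g,
        struct ggml_tensor  * state) {
    const float scale = 1.0f; // assumed head scaling factor, not prescribed by the op itself
    return ggml_gated_linear_attn(ctx, k, v, q, g, state, scale);
}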
// ggml_unary // ggml_unary
static struct ggml_tensor * ggml_unary_impl( static struct ggml_tensor * ggml_unary_impl(
...@@ -5062,10 +5114,10 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( ...@@ -5062,10 +5114,10 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
struct ggml_tensor * c) { struct ggml_tensor * c) {
GGML_ASSERT(ggml_are_same_shape(a, b)); GGML_ASSERT(ggml_is_scalar(a));
GGML_ASSERT(ggml_is_scalar(c)); GGML_ASSERT(ggml_are_same_shape(b, c));
struct ggml_tensor * result = ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_dup_tensor(ctx, b);
result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
result->src[0] = a; result->src[0] = a;
...@@ -5244,7 +5296,7 @@ static void ggml_sub_or_set( ...@@ -5244,7 +5296,7 @@ static void ggml_sub_or_set(
} }
static void ggml_compute_backward( static void ggml_compute_backward(
struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) { struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, const bool * grads_needed) {
struct ggml_tensor * tensor = cgraph->nodes[i]; struct ggml_tensor * tensor = cgraph->nodes[i];
struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, tensor); struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, tensor);
...@@ -5316,7 +5368,7 @@ static void ggml_compute_backward( ...@@ -5316,7 +5368,7 @@ static void ggml_compute_backward(
} break; } break;
case GGML_OP_MUL: { case GGML_OP_MUL: {
if (src0_needs_grads) { if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad)); ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
} }
if (src1_needs_grads) { if (src1_needs_grads) {
struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad); struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
...@@ -5388,7 +5440,7 @@ static void ggml_compute_backward( ...@@ -5388,7 +5440,7 @@ static void ggml_compute_backward(
if (src0_needs_grads) { if (src0_needs_grads) {
float eps; float eps;
memcpy(&eps, tensor->op_params, sizeof(float)); memcpy(&eps, tensor->op_params, sizeof(float));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps)); ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, grad, src0, eps));
} }
} break; } break;
case GGML_OP_MUL_MAT: { case GGML_OP_MUL_MAT: {
...@@ -5408,21 +5460,25 @@ static void ggml_compute_backward( ...@@ -5408,21 +5460,25 @@ static void ggml_compute_backward(
// src1.shape [n,p,qq,rr] // src1.shape [n,p,qq,rr]
if (src0_needs_grads) { if (src0_needs_grads) {
struct ggml_tensor * s1_tg = GGML_ASSERT(grad->ne[2] == src1->ne[2]);
GGML_ASSERT(grad->ne[3] == src1->ne[3]);
struct ggml_tensor * tmp =
ggml_out_prod(ctx, // [n,m,qq,rr] ggml_out_prod(ctx, // [n,m,qq,rr]
src1, // [n,p,qq,rr] src1, // [n,p,qq,rr]
grad); // [m,p,qq,rr] grad); // [m,p,qq,rr]
const int64_t qq = s1_tg->ne[2]; if (!ggml_are_same_shape(tmp, src0)) {
const int64_t rr = s1_tg->ne[3]; GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
const int64_t q1 = src0->ne[2]; GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
const int64_t r1 = src0->ne[3]; GGML_ASSERT(tmp->ne[3] == 1);
const bool ne2_broadcasted = qq > q1;
const bool ne3_broadcasted = rr > r1; const int64_t nr2 = tmp->ne[2] / src0->ne[2];
if (ne2_broadcasted || ne3_broadcasted) { const size_t nb2 = tmp->nb[2] * nr2;
// sum broadcast repetitions of s1_tg into shape of src0 const size_t nb3 = tmp->nb[2];
s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
tmp = ggml_repeat_back(ctx, tmp, src0);
} }
ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/); ggml_add_or_set(ctx, cgraph, isrc0, tmp);
} }
if (src1_needs_grads) { if (src1_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc1, ggml_add_or_set(ctx, cgraph, isrc1,
...@@ -5491,7 +5547,9 @@ static void ggml_compute_backward( ...@@ -5491,7 +5547,9 @@ static void ggml_compute_backward(
if (src0_needs_grads) { if (src0_needs_grads) {
GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0])); GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
GGML_ASSERT(ggml_is_contiguous(grad)); GGML_ASSERT(ggml_is_contiguous(grad));
ggml_add_or_set(ctx, cgraph, isrc0, grad); GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
ggml_add_or_set(ctx, cgraph, isrc0,
ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
} }
} break; } break;
case GGML_OP_RESHAPE: { case GGML_OP_RESHAPE: {
...@@ -5571,7 +5629,13 @@ static void ggml_compute_backward( ...@@ -5571,7 +5629,13 @@ static void ggml_compute_backward(
} break; } break;
case GGML_OP_SOFT_MAX: { case GGML_OP_SOFT_MAX: {
if (src0_needs_grads) { if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor)); float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) tensor->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias));
} }
GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented"); GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
} break; } break;
...@@ -5583,6 +5647,7 @@ static void ggml_compute_backward( ...@@ -5583,6 +5647,7 @@ static void ggml_compute_backward(
//const int n_ctx = ((int32_t *) tensor->op_params)[3]; //const int n_ctx = ((int32_t *) tensor->op_params)[3];
const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4]; const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4] = {0, 0, 0, 0};
memcpy(&freq_base, (const float *) tensor->op_params + 5, sizeof(float)); memcpy(&freq_base, (const float *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_scale, (const float *) tensor->op_params + 6, sizeof(float)); memcpy(&freq_scale, (const float *) tensor->op_params + 6, sizeof(float));
...@@ -5590,10 +5655,14 @@ static void ggml_compute_backward( ...@@ -5590,10 +5655,14 @@ static void ggml_compute_backward(
memcpy(&attn_factor, (const float *) tensor->op_params + 8, sizeof(float)); memcpy(&attn_factor, (const float *) tensor->op_params + 8, sizeof(float));
memcpy(&beta_fast, (const float *) tensor->op_params + 9, sizeof(float)); memcpy(&beta_fast, (const float *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (const float *) tensor->op_params + 10, sizeof(float)); memcpy(&beta_slow, (const float *) tensor->op_params + 10, sizeof(float));
memcpy(&sections, tensor->op_params + 11, sizeof(sections));
ggml_add_or_set(ctx, cgraph, isrc0,
ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base, struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow)); ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
} }
GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented"); GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
} break; } break;
...@@ -5607,7 +5676,7 @@ static void ggml_compute_backward( ...@@ -5607,7 +5676,7 @@ static void ggml_compute_backward(
const int32_t d1 = ggml_get_op_params_i32(tensor, 5); const int32_t d1 = ggml_get_op_params_i32(tensor, 5);
const bool is_2D = ggml_get_op_params_i32(tensor, 6) == 1; const bool is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D)); ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
} }
} break; } break;
case GGML_OP_POOL_2D: { case GGML_OP_POOL_2D: {
...@@ -5650,7 +5719,7 @@ static void ggml_compute_backward( ...@@ -5650,7 +5719,7 @@ static void ggml_compute_backward(
} break; } break;
case GGML_UNARY_OP_SILU: { case GGML_UNARY_OP_SILU: {
if (src0_needs_grads) { if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad)); ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, grad, src0));
} }
} break; } break;
case GGML_UNARY_OP_EXP: { case GGML_UNARY_OP_EXP: {
...@@ -5667,7 +5736,7 @@ static void ggml_compute_backward( ...@@ -5667,7 +5736,7 @@ static void ggml_compute_backward(
} break; } break;
case GGML_OP_CROSS_ENTROPY_LOSS: { case GGML_OP_CROSS_ENTROPY_LOSS: {
if (src0_needs_grads) { if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad)); ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, grad, src0, src1));
} }
GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented"); GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
} break; } break;
...@@ -6438,1271 +6507,6 @@ size_t ggml_quantize_chunk( ...@@ -6438,1271 +6507,6 @@ size_t ggml_quantize_chunk(
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
struct gguf_str {
uint64_t n; // GGUFv2
char * data;
};
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
[GGUF_TYPE_INT8] = sizeof(int8_t),
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
[GGUF_TYPE_INT16] = sizeof(int16_t),
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
[GGUF_TYPE_INT32] = sizeof(int32_t),
[GGUF_TYPE_FLOAT32] = sizeof(float),
[GGUF_TYPE_BOOL] = sizeof(bool),
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
[GGUF_TYPE_INT64] = sizeof(int64_t),
[GGUF_TYPE_FLOAT64] = sizeof(double),
[GGUF_TYPE_ARRAY] = 0, // undefined
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
[GGUF_TYPE_UINT64] = "u64",
[GGUF_TYPE_INT64] = "i64",
[GGUF_TYPE_FLOAT64] = "f64",
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct {
enum gguf_type type;
uint64_t n; // GGUFv2
void * data;
} arr;
};
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
};
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void * data;
size_t size;
};
struct gguf_context {
struct gguf_header header;
struct gguf_kv * kv;
struct gguf_tensor_info * infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
//uint8_t * padding;
void * data;
};
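// Layout recap derived from the reader below (an illustrative note, not normative):
//   header : magic[4], version (u32), n_tensors (u64), n_kv (u64)
//   kv     : key (gguf_str), type (i32), then a scalar, a gguf_str, or a typed array
//   infos  : name, n_dims, ne[0..n_dims), type, offset (relative to the start of the data section)
//   data   : starts at the next multiple of `alignment` ("general.alignment" key, defaulting to
//            GGUF_DEFAULT_ALIGNMENT); each tensor is padded to that alignment as well
// The padding rule used throughout, as a standalone helper (matches GGML_PAD for power-of-two alignments):
static size_t gguf_pad_sketch(size_t offset, size_t alignment) {
    return (offset + alignment - 1) / alignment * alignment;
}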
size_t gguf_type_size(enum gguf_type type) {
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
return GGUF_TYPE_SIZE[type];
}
static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
if (info->n_dims > GGML_MAX_DIMS) {
fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
return false;
}
if (info->type < 0 || info->type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
return false;
}
if (strlen(info->name.data) >= GGML_MAX_NAME) {
fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
return false;
}
for (uint32_t i = 0; i < info->n_dims; ++i) {
if (info->ne[i] <= 0) {
fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
return false;
}
}
// prevent overflow for total number of elements
if (INT64_MAX/info->ne[1] <= info->ne[0]) {
fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
return false;
}
if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
return false;
}
if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
return false;
}
return true;
}
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
const size_t n = fread(dst, 1, size, file);
*offset += n;
return n == size;
}
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
p->n = 0;
p->data = NULL;
bool ok = true;
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
// early exit if the string length is invalid; this prevents integer overflow
if (p->n == SIZE_MAX) {
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
return false;
}
p->data = calloc(p->n + 1, 1);
if (!p->data) {
fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n);
return false;
}
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
return ok;
}
static void gguf_free_kv(struct gguf_kv * kv) {
if (kv->key.data) {
GGML_FREE(kv->key.data);
}
if (kv->type == GGUF_TYPE_STRING) {
if (kv->value.str.data) {
GGML_FREE(kv->value.str.data);
}
}
if (kv->type == GGUF_TYPE_ARRAY) {
if (kv->value.arr.data) {
if (kv->value.arr.type == GGUF_TYPE_STRING) {
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
if (str->data) {
GGML_FREE(str->data);
}
}
}
GGML_FREE(kv->value.arr.data);
}
}
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
if (!ctx) {
fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
return NULL;
}
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
ctx->kv = NULL;
ctx->infos = NULL;
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
ctx->offset = 0;
ctx->size = 0;
ctx->data = NULL;
return ctx;
}
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
// offset from start of file
size_t offset = 0;
char magic[4];
// check the magic before making allocations
{
gguf_fread_el(file, &magic, sizeof(magic), &offset);
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
return NULL;
}
}
}
bool ok = true;
struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
if (!ctx) {
fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
return NULL;
}
// read the header
{
strncpy(ctx->header.magic, magic, 4);
ctx->kv = NULL;
ctx->infos = NULL;
ctx->data = NULL;
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
if (ctx->header.version == 1) {
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
gguf_free(ctx);
return NULL;
}
// sanity checks to prevent integer/buffer overflows
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
if (!ok) {
fprintf(stderr, "%s: failed to read header\n", __func__);
gguf_free(ctx);
return NULL;
}
}
// read the kv pairs
{
const uint64_t n_kv = ctx->header.n_kv;
if (n_kv > 0) {
ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
if (!ctx->kv) {
fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
gguf_free(ctx);
return NULL;
}
}
for (uint64_t i = 0; i < n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
ok = ok && gguf_fread_str(file, &kv->key, &offset);
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
switch (kv->type) {
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
case GGUF_TYPE_ARRAY:
{
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
case GGUF_TYPE_BOOL:
{
// prevent integer overflow in the allocation below
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
gguf_free(ctx);
return NULL;
}
kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
if (!kv->value.arr.data) {
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
gguf_free(ctx);
return NULL;
}
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
} break;
case GGUF_TYPE_STRING:
{
// prevent integer overflow in the allocation below
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
gguf_free(ctx);
return NULL;
}
kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
if (!kv->value.arr.data) {
fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
gguf_free(ctx);
return NULL;
}
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
}
} break;
case GGUF_TYPE_ARRAY:
default:
{
fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type);
ok = false;
} break;
}
} break;
default:
{
fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type);
ok = false;
} break;
}
if (!ok) {
break;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
gguf_free(ctx);
return NULL;
}
}
// read the tensor infos
if (ctx->header.n_tensors > 0) {
ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
if (!ctx->infos) {
fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
gguf_free(ctx);
return NULL;
}
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
info->ne[j] = 1;
}
ok = ok && gguf_fread_str(file, &info->name, &offset);
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
ok = ok && (info->n_dims <= GGML_MAX_DIMS);
for (uint32_t j = 0; j < info->n_dims; ++j) {
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
}
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
ok = ok && gguf_tensor_info_sanitize(info);
// make sure there are no duplicated tensor names
for (uint64_t j = 0; j < i && ok; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
gguf_free(ctx);
return NULL;
}
}
}
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
int alignment_idx = gguf_find_key(ctx, "general.alignment");
if (alignment_idx != -1) {
ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
}
// we require the data section to be aligned, so take into account any padding
{
const size_t offset_pad = offset % ctx->alignment;
if (offset_pad != 0) {
offset += ctx->alignment - offset_pad;
fseek(file, offset, SEEK_SET);
}
}
// store the current file offset - this is where the data section starts
ctx->offset = offset;
// compute the total size of the data section, taking into account the alignment
{
ctx->size = 0;
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const int64_t ne =
(int64_t) info->ne[0] *
(int64_t) info->ne[1] *
(int64_t) info->ne[2] *
(int64_t) info->ne[3];
if (ggml_blck_size(info->type) == 0 ) {
// support for this tensor type has been removed:
fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
__func__, info->name.data, (int) info->type, ggml_type_name(info->type));
gguf_free(ctx);
return NULL;
}
if (ne % ggml_blck_size(info->type) != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
gguf_free(ctx);
return NULL;
}
const size_t size_cur = ggml_row_size(info->type, ne);
ctx->size += GGML_PAD(size_cur, ctx->alignment);
}
}
// load the tensor data only if requested
if (params.ctx != NULL) {
// if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
// otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
// the ggml_tensor structs to the appropriate locations in the binary blob
// compute the exact size needed for the new ggml_context
const size_t mem_size =
params.no_alloc ?
(ctx->header.n_tensors )*ggml_tensor_overhead() :
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
struct ggml_init_params pdata = {
.mem_size = mem_size,
.mem_buffer = NULL,
.no_alloc = params.no_alloc,
};
*params.ctx = ggml_init(pdata);
if (*params.ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__);
gguf_free(ctx);
return NULL;
}
struct ggml_context * ctx_data = *params.ctx;
struct ggml_tensor * data = NULL;
if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL;
// read the binary blob with the tensor data
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
if (!ok) {
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
}
ctx->data = data->data;
}
ggml_set_no_alloc(ctx_data, true);
// create the tensors
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0],
ctx->infos[i].ne[1],
ctx->infos[i].ne[2],
ctx->infos[i].ne[3],
};
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
ok = ok && cur != NULL;
if (!ok) {
break;
}
ggml_set_name(cur, ctx->infos[i].name.data);
// point the data member to the appropriate location in the binary blob using the tensor infos
if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
}
ggml_set_no_alloc(ctx_data, params.no_alloc);
}
return ctx;
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
return NULL;
}
struct gguf_context * result = gguf_init_from_file_impl(file, params);
fclose(file);
return result;
}
void gguf_free(struct gguf_context * ctx) {
if (ctx == NULL) {
return;
}
if (ctx->kv) {
// free string memory - not great..
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
gguf_free_kv(&ctx->kv[i]);
}
GGML_FREE(ctx->kv);
}
if (ctx->infos) {
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
if (info->name.data) {
GGML_FREE(info->name.data);
}
}
GGML_FREE(ctx->infos);
}
GGML_FREE(ctx);
}
const char * gguf_type_name(enum gguf_type type) {
return GGUF_TYPE_NAME[type];
}
int gguf_get_version(const struct gguf_context * ctx) {
return ctx->header.version;
}
size_t gguf_get_alignment(const struct gguf_context * ctx) {
return ctx->alignment;
}
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
return ctx->offset;
}
void * gguf_get_data(const struct gguf_context * ctx) {
return ctx->data;
}
int gguf_get_n_kv(const struct gguf_context * ctx) {
return ctx->header.n_kv;
}
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
// return -1 if key not found
int keyfound = -1;
const int n_kv = gguf_get_n_kv(ctx);
for (int i = 0; i < n_kv; ++i) {
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
keyfound = i;
break;
}
}
return keyfound;
}
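// Hypothetical caller-side sketch: looking up an optional u32 key with a fallback. Note that
// gguf_get_val_u32 asserts the key really is of type GGUF_TYPE_UINT32.
static uint32_t gguf_get_u32_or_sketch(const struct gguf_context * ctx, const char * key, uint32_t def) {
    const int key_id = gguf_find_key(ctx, key);
    return key_id < 0 ? def : gguf_get_val_u32(ctx, key_id);
}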
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].key.data;
}
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].type;
}
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.type;
}
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.data;
}
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
struct gguf_kv * kv = &ctx->kv[key_id];
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
return str->data;
}
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.n;
}
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
return ctx->kv[key_id].value.uint8;
}
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
return ctx->kv[key_id].value.int8;
}
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
return ctx->kv[key_id].value.uint16;
}
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
return ctx->kv[key_id].value.int16;
}
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
return ctx->kv[key_id].value.uint32;
}
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
return ctx->kv[key_id].value.int32;
}
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
return ctx->kv[key_id].value.float32;
}
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
return ctx->kv[key_id].value.uint64;
}
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
return ctx->kv[key_id].value.int64;
}
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
return ctx->kv[key_id].value.float64;
}
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
return ctx->kv[key_id].value.bool_;
}
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
return ctx->kv[key_id].value.str.data;
}
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
return &ctx->kv[key_id].value;
}
int gguf_get_n_tensors(const struct gguf_context * ctx) {
return ctx->header.n_tensors;
}
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found
int tensorfound = -1;
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
tensorfound = i;
break;
}
}
return tensorfound;
}
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
return ctx->infos[i].offset;
}
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
return ctx->infos[i].name.data;
}
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
return ctx->infos[i].type;
}
// returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);
if (idx >= 0) {
return idx;
}
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key);
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
return n_kv;
}
void gguf_remove_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);
if (idx >= 0) {
const int n_kv = gguf_get_n_kv(ctx);
gguf_free_kv(&ctx->kv[idx]);
for (int i = idx; i < n_kv-1; ++i) {
ctx->kv[i] = ctx->kv[i+1];
}
ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
ctx->header.n_kv--;
}
}
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT8;
ctx->kv[idx].value.uint8 = val;
}
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT8;
ctx->kv[idx].value.int8 = val;
}
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT16;
ctx->kv[idx].value.uint16 = val;
}
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT16;
ctx->kv[idx].value.int16 = val;
}
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT32;
ctx->kv[idx].value.uint32 = val;
}
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT32;
ctx->kv[idx].value.int32 = val;
}
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
ctx->kv[idx].value.float32 = val;
}
void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT64;
ctx->kv[idx].value.uint64 = val;
}
void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT64;
ctx->kv[idx].value.int64 = val;
}
void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
ctx->kv[idx].value.float64 = val;
}
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_BOOL;
ctx->kv[idx].value.bool_ = val;
}
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_STRING;
ctx->kv[idx].value.str.n = strlen(val);
ctx->kv[idx].value.str.data = strdup(val);
}
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
}
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]);
str->data = strdup(data[i]);
}
}
// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
for (uint32_t i = 0; i < src->header.n_kv; i++) {
switch (src->kv[i].type) {
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
GGML_FREE((void *)data);
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
GGML_ABORT("nested arrays not supported");
} else {
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
}
} break;
default: GGML_ABORT("invalid type");
}
}
}
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
GGML_ASSERT(tensor);
if (gguf_find_tensor(ctx, tensor->name) != -1) {
GGML_ABORT("duplicated tensor name");
}
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name);
ctx->infos[idx].name.data = strdup(tensor->name);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
ctx->infos[idx].ne[i] = 1;
}
ctx->infos[idx].n_dims = ggml_n_dims(tensor);
for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
ctx->infos[idx].ne[i] = tensor->ne[i];
}
ctx->infos[idx].type = tensor->type;
ctx->infos[idx].offset = 0;
ctx->infos[idx].data = tensor->data;
ctx->infos[idx].size = ggml_nbytes(tensor);
if (ctx->header.n_tensors > 0) {
ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
}
ctx->header.n_tensors++;
}
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ABORT("tensor not found");
}
ctx->infos[idx].type = type;
}
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ABORT("tensor not found");
}
ctx->infos[idx].data = data;
ctx->infos[idx].size = size;
// update offsets
for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
}
}
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
// fwrite(&val->n, sizeof(val->n), 1, file);
// fwrite(val->data, sizeof(char), val->n, file);
//}
//
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
// fwrite(val, sizeof(char), size, file);
//}
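// growable in-memory buffer used by gguf_write_to_buf when serializing a GGUF file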
struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
/*buf.size =*/ size,
/*buf.offset =*/ 0,
};
return buf;
}
void gguf_buf_free(struct gguf_buf buf) {
if (buf.data) {
GGML_FREE(buf.data);
}
}
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
if (buf->offset + size > buf->size) {
buf->size = 1.5*(buf->offset + size);
if (buf->data) {
buf->data = realloc(buf->data, buf->size);
}
}
}
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
gguf_buf_grow(buf, sizeof(val->n) + val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
}
buf->offset += sizeof(val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val->data, val->n);
}
buf->offset += val->n;
}
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
gguf_buf_grow(buf, el_size);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val, el_size);
}
buf->offset += el_size;
}
void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
// write key-value pairs
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
gguf_bwrite_str(buf, &kv->key);
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
switch (kv->type) {
case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
case GGUF_TYPE_ARRAY:
{
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
case GGUF_TYPE_BOOL:
{
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
} break;
case GGUF_TYPE_STRING:
{
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
}
} break;
default: GGML_ABORT("invalid type");
}
}
// write tensor infos
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
gguf_bwrite_str(buf, &info->name);
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
for (uint32_t j = 0; j < info->n_dims; ++j) {
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
}
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
}
// we require the data section to be aligned, so take into account any padding
{
const size_t offset = buf->offset;
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
if (offset_pad != offset) {
uint8_t pad = 0;
for (size_t i = 0; i < offset_pad - offset; ++i) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
}
if (only_meta) {
return;
}
size_t offset = 0;
// write tensor data
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const size_t size = info->size;
const size_t size_pad = GGML_PAD(size, ctx->alignment);
gguf_bwrite_el(buf, info->data, size);
if (size_pad != size) {
uint8_t pad = 0;
for (size_t j = 0; j < size_pad - size; ++j) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
GGML_ASSERT(offset == info->offset);
offset += size_pad;
}
}
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = ggml_fopen(fname, "wb");
if (!file) {
GGML_ABORT("failed to open file for writing");
}
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, only_meta);
fwrite(buf.data, 1, buf.offset, file);
gguf_buf_free(buf);
fclose(file);
}
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
// no allocs - only compute size
struct gguf_buf buf = gguf_buf_init(0);
gguf_write_to_buf(ctx, &buf, true);
return buf.offset;
}
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, true);
memcpy(data, buf.data, buf.offset);
gguf_buf_free(buf);
}
void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
    g_logger_state.log_callback_user_data = user_data;
......
package ggml
// #cgo CPPFLAGS: -DGGML_USE_METAL -DGGML_USE_BLAS
// #cgo CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_BLAS
// #cgo LDFLAGS: -framework Foundation
import "C"
......
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "gguf.h"
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <new>
#include <stdexcept>
#include <string>
#include <vector>
template <typename T>
struct type_to_gguf_type;
template <>
struct type_to_gguf_type<uint8_t> {
static constexpr enum gguf_type value = GGUF_TYPE_UINT8;
};
template <>
struct type_to_gguf_type<int8_t> {
static constexpr enum gguf_type value = GGUF_TYPE_INT8;
};
template <>
struct type_to_gguf_type<uint16_t> {
static constexpr enum gguf_type value = GGUF_TYPE_UINT16;
};
template <>
struct type_to_gguf_type<int16_t> {
static constexpr enum gguf_type value = GGUF_TYPE_INT16;
};
template <>
struct type_to_gguf_type<uint32_t> {
static constexpr enum gguf_type value = GGUF_TYPE_UINT32;
};
template <>
struct type_to_gguf_type<int32_t> {
static constexpr enum gguf_type value = GGUF_TYPE_INT32;
};
template <>
struct type_to_gguf_type<float> {
static constexpr enum gguf_type value = GGUF_TYPE_FLOAT32;
};
template <>
struct type_to_gguf_type<bool> {
static constexpr enum gguf_type value = GGUF_TYPE_BOOL;
};
template <>
struct type_to_gguf_type<std::string> {
static constexpr enum gguf_type value = GGUF_TYPE_STRING;
};
template <>
struct type_to_gguf_type<uint64_t> {
static constexpr enum gguf_type value = GGUF_TYPE_UINT64;
};
template <>
struct type_to_gguf_type<int64_t> {
static constexpr enum gguf_type value = GGUF_TYPE_INT64;
};
template <>
struct type_to_gguf_type<double> {
static constexpr enum gguf_type value = GGUF_TYPE_FLOAT64;
};
static const std::map<gguf_type, size_t> GGUF_TYPE_SIZE = {
{GGUF_TYPE_UINT8, sizeof(uint8_t)},
{GGUF_TYPE_INT8, sizeof(int8_t)},
{GGUF_TYPE_UINT16, sizeof(uint16_t)},
{GGUF_TYPE_INT16, sizeof(int16_t)},
{GGUF_TYPE_UINT32, sizeof(uint32_t)},
{GGUF_TYPE_INT32, sizeof(int32_t)},
{GGUF_TYPE_FLOAT32, sizeof(float)},
{GGUF_TYPE_BOOL, sizeof(int8_t)},
{GGUF_TYPE_STRING, 0}, // undefined
{GGUF_TYPE_ARRAY, 0}, // undefined
{GGUF_TYPE_UINT64, sizeof(uint64_t)},
{GGUF_TYPE_INT64, sizeof(int64_t)},
{GGUF_TYPE_FLOAT64, sizeof(double)},
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
static const std::map<gguf_type, const char *> GGUF_TYPE_NAME = {
{GGUF_TYPE_UINT8, "u8"},
{GGUF_TYPE_INT8, "i8"},
{GGUF_TYPE_UINT16, "u16"},
{GGUF_TYPE_INT16, "i16"},
{GGUF_TYPE_UINT32, "u32"},
{GGUF_TYPE_INT32, "i32"},
{GGUF_TYPE_FLOAT32, "f32"},
{GGUF_TYPE_BOOL, "bool"},
{GGUF_TYPE_STRING, "str"},
{GGUF_TYPE_ARRAY, "arr"},
{GGUF_TYPE_UINT64, "u64"},
{GGUF_TYPE_INT64, "i64"},
{GGUF_TYPE_FLOAT64, "f64"},
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
size_t gguf_type_size(enum gguf_type type) {
auto it = GGUF_TYPE_SIZE.find(type);
return it == GGUF_TYPE_SIZE.end() ? 0 : it->second;
}
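// in-memory representation of a single key-value pair: scalar and array values are
// stored as raw bytes in `data`, strings are kept separately in `data_string`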
struct gguf_kv {
std::string key;
bool is_array;
enum gguf_type type;
std::vector<int8_t> data;
std::vector<std::string> data_string;
template <typename T>
gguf_kv(const std::string & key, const T value)
: key(key), is_array(false), type(type_to_gguf_type<T>::value) {
GGML_ASSERT(!key.empty());
data.resize(sizeof(T));
memcpy(data.data(), &value, sizeof(T));
}
template <typename T>
gguf_kv(const std::string & key, const std::vector<T> & value)
: key(key), is_array(true), type(type_to_gguf_type<T>::value) {
GGML_ASSERT(!key.empty());
data.resize(value.size()*sizeof(T));
for (size_t i = 0; i < value.size(); ++i) {
const T tmp = value[i];
memcpy(data.data() + i*sizeof(T), &tmp, sizeof(T));
}
}
gguf_kv(const std::string & key, const std::string & value)
: key(key), is_array(false), type(GGUF_TYPE_STRING) {
GGML_ASSERT(!key.empty());
data_string.push_back(value);
}
gguf_kv(const std::string & key, const std::vector<std::string> & value)
: key(key), is_array(true), type(GGUF_TYPE_STRING) {
GGML_ASSERT(!key.empty());
data_string = value;
}
const std::string & get_key() const {
return key;
}
const enum gguf_type & get_type() const {
return type;
}
size_t get_ne() const {
if (type == GGUF_TYPE_STRING) {
const size_t ne = data_string.size();
GGML_ASSERT(is_array || ne == 1);
return ne;
}
const size_t type_size = gguf_type_size(type);
GGML_ASSERT(data.size() % type_size == 0);
const size_t ne = data.size() / type_size;
GGML_ASSERT(is_array || ne == 1);
return ne;
}
template <typename T>
const T & get_val(const size_t i = 0) const {
GGML_ASSERT(type_to_gguf_type<T>::value == type);
if constexpr (std::is_same<T, std::string>::value) {
GGML_ASSERT(data_string.size() >= i+1);
return data_string[i];
}
const size_t type_size = gguf_type_size(type);
GGML_ASSERT(data.size() % type_size == 0);
GGML_ASSERT(data.size() >= (i+1)*type_size);
return reinterpret_cast<const T *>(data.data())[i];
}
void cast(const enum gguf_type new_type) {
const size_t new_type_size = gguf_type_size(new_type);
GGML_ASSERT(data.size() % new_type_size == 0);
type = new_type;
}
};
struct gguf_tensor_info {
struct ggml_tensor t; // for holding the equivalent info
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
};
struct gguf_context {
uint32_t version = GGUF_VERSION;
std::vector<struct gguf_kv> kv;
std::vector<struct gguf_tensor_info> info;
size_t alignment = GGUF_DEFAULT_ALIGNMENT;
size_t offset = 0; // offset of `data` from beginning of file
size_t size = 0; // size of `data` in bytes
void * data = nullptr;
};
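// thin wrapper around a FILE * with typed read helpers; every helper returns false on a short or failed read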
struct gguf_reader {
FILE * file;
gguf_reader(FILE * file) : file(file) {}
template <typename T>
bool read(T & dst) const {
return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
}
template <typename T>
bool read(std::vector<T> & dst, const size_t n) const {
dst.resize(n);
for (size_t i = 0; i < dst.size(); ++i) {
if constexpr (std::is_same<T, bool>::value) {
bool tmp;
if (!read(tmp)) {
return false;
}
dst[i] = tmp;
} else {
if (!read(dst[i])) {
return false;
}
}
}
return true;
}
bool read(bool & dst) const {
int8_t tmp = -1;
if (!read(tmp)) {
return false;
}
dst = tmp != 0;
return true;
}
bool read(enum ggml_type & dst) const {
int32_t tmp = -1;
if (!read(tmp)) {
return false;
}
dst = ggml_type(tmp);
return true;
}
bool read(enum gguf_type & dst) const {
int32_t tmp = -1;
if (!read(tmp)) {
return false;
}
dst = gguf_type(tmp);
return true;
}
bool read(std::string & dst) const {
uint64_t size = -1;
if (!read(size)) {
return false;
}
dst.resize(size);
return fread(dst.data(), 1, dst.length(), file) == dst.length();
}
bool read(void * dst, const size_t size) const {
return fread(dst, 1, size, file) == size;
}
};
struct gguf_context * gguf_init_empty(void) {
return new gguf_context;
}
template<typename T>
bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct gguf_kv> & kv, const std::string & key, const bool is_array, const size_t n) {
if (is_array) {
std::vector<T> value;
try {
if (!gr.read(value, n)) {
return false;
}
} catch (std::length_error &) {
fprintf(stderr, "%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
return false;
} catch (std::bad_alloc &) {
fprintf(stderr, "%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
return false;
}
kv.emplace_back(key, value);
} else {
T value;
if (!gr.read(value)) {
return false;
}
kv.emplace_back(key, value);
}
return true;
}
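// parse a GGUF stream: magic, header (version, tensor count, KV count), KV pairs and
// tensor infos; if params.ctx is set, also create the tensors in a new ggml_context
// and (unless no_alloc) load the binary tensor data blob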
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
const struct gguf_reader gr(file);
struct gguf_context * ctx = new gguf_context;
bool ok = true;
// file magic
{
std::vector<char> magic;
ok = ok && gr.read(magic, 4);
if (!ok) {
fprintf(stderr, "%s: failed to read magic\n", __func__);
gguf_free(ctx);
return nullptr;
}
for (uint32_t i = 0; i < magic.size(); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
gguf_free(ctx);
return nullptr;
}
}
}
// header
int64_t n_kv = 0;
int64_t n_tensors = 0;
if (ok && gr.read(ctx->version)) {
if (ctx->version == 1) {
fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
ok = false;
}
if (ctx->version > GGUF_VERSION) {
fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
__func__, ctx->version, GGUF_VERSION);
ok = false;
}
} else {
ok = false;
}
if (ok && gr.read(n_tensors)) {
static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
__func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
ok = false;
}
} else {
ok = false;
}
if (ok && gr.read(n_kv)) {
static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
__func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
ok = false;
}
} else {
ok = false;
}
if (!ok) {
fprintf(stderr, "%s: failed to read header\n", __func__);
gguf_free(ctx);
return nullptr;
}
// KV pairs
{
for (int64_t i = 0; ok && i < n_kv; ++i) {
std::string key;
gguf_type type = gguf_type(-1);
bool is_array = false;
uint64_t n = 1;
try {
ok = ok && gr.read(key);
} catch (std::length_error &) {
fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
ok = false;
} catch (std::bad_alloc &) {
fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
ok = false;
}
for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
if (key == ctx->kv[j].key) {
fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
ok = false;
}
}
if (!ok) {
break;
}
ok = ok && gr.read(type);
if (type == GGUF_TYPE_ARRAY) {
is_array = true;
ok = ok && gr.read(type);
ok = ok && gr.read(n);
}
if (!ok) {
break;
}
switch (type) {
case GGUF_TYPE_UINT8: ok = ok && gguf_read_emplace_helper<uint8_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_INT8: ok = ok && gguf_read_emplace_helper<int8_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_UINT16: ok = ok && gguf_read_emplace_helper<uint16_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_INT16: ok = ok && gguf_read_emplace_helper<int16_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_UINT32: ok = ok && gguf_read_emplace_helper<uint32_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_INT32: ok = ok && gguf_read_emplace_helper<int32_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_FLOAT32: ok = ok && gguf_read_emplace_helper<float> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_BOOL: ok = ok && gguf_read_emplace_helper<bool> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_STRING: ok = ok && gguf_read_emplace_helper<std::string>(gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_UINT64: ok = ok && gguf_read_emplace_helper<uint64_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_INT64: ok = ok && gguf_read_emplace_helper<int64_t> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_FLOAT64: ok = ok && gguf_read_emplace_helper<double> (gr, ctx->kv, key, is_array, n); break;
case GGUF_TYPE_ARRAY:
default:
{
fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
ok = false;
} break;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
gguf_free(ctx);
return nullptr;
}
GGML_ASSERT(int64_t(ctx->kv.size()) == n_kv);
const int alignment_idx = gguf_find_key(ctx, GGUF_KEY_GENERAL_ALIGNMENT);
ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);
if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
gguf_free(ctx);
return nullptr;
}
}
// read the tensor info
for (int64_t i = 0; ok && i < n_tensors; ++i) {
struct gguf_tensor_info info;
// tensor name
{
std::string name;
try {
ok = ok && gr.read(name);
} catch (std::length_error &) {
fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
ok = false;
} catch (std::bad_alloc &) {
fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
ok = false;
}
if (name.length() >= GGML_MAX_NAME) {
fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
ok = false;
break;
}
ggml_set_name(&info.t, name.c_str());
// make sure there are no duplicate tensor names
for (int64_t j = 0; ok && j < i; ++j) {
if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
ok = false;
break;
}
}
}
if (!ok) {
break;
}
// tensor shape
{
uint32_t n_dims = -1;
ok = ok && gr.read(n_dims);
if (n_dims > GGML_MAX_DIMS) {
fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
__func__, info.t.name, n_dims, GGML_MAX_DIMS);
ok = false;
break;
}
for (uint32_t j = 0; ok && j < GGML_MAX_DIMS; ++j) {
info.t.ne[j] = 1;
if (j < n_dims) {
ok = ok && gr.read(info.t.ne[j]);
}
// check that all ne are non-negative
if (info.t.ne[j] < 0) {
fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
__func__, info.t.name, j, info.t.ne[j]);
ok = false;
break;
}
}
// check that the total number of elements is representable
if (ok && ((INT64_MAX/info.t.ne[1] <= info.t.ne[0]) ||
(INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
(INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {
fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
"(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
__func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
ok = false;
break;
}
}
if (!ok) {
break;
}
// tensor type
{
ok = ok && gr.read(info.t.type);
// check that tensor type is within defined range
if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
__func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
ok = false;
break;
}
const size_t type_size = ggml_type_size(info.t.type);
const int64_t blck_size = ggml_blck_size(info.t.type);
// check that row size is divisible by block size
if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
"not a multiple of block size (%" PRId64 ")\n",
__func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
ok = false;
break;
}
// calculate byte offsets given the tensor shape and type
info.t.nb[0] = type_size;
info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
for (int j = 2; j < GGML_MAX_DIMS; ++j) {
info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
}
}
if (!ok) {
break;
}
// tensor data offset within buffer
ok = ok && gr.read(info.offset);
ctx->info.push_back(info);
}
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
gguf_free(ctx);
return nullptr;
}
GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
// we require the data section to be aligned, so take into account any padding
if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
gguf_free(ctx);
return nullptr;
}
// store the current file offset - this is where the data section starts
ctx->offset = ftell(file);
// compute the total size of the data section, taking into account the alignment
{
ctx->size = 0;
for (size_t i = 0; i < ctx->info.size(); ++i) {
const gguf_tensor_info & ti = ctx->info[i];
if (ti.offset != ctx->size) {
fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
__func__, ti.t.name, ti.offset, ctx->size);
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
gguf_free(ctx);
return nullptr;
}
ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
}
}
// load the tensor data only if requested
if (params.ctx != nullptr) {
// if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
// otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
// the ggml_tensor structs to the appropriate locations in the binary blob
// compute the exact size needed for the new ggml_context
const size_t mem_size =
params.no_alloc ?
(n_tensors )*ggml_tensor_overhead() :
(n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
struct ggml_init_params pdata = {
/*mem_size =*/ mem_size,
/*mem_buffer =*/ nullptr,
/*no_alloc =*/ params.no_alloc,
};
*params.ctx = ggml_init(pdata);
if (*params.ctx == nullptr) {
fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
gguf_free(ctx);
return nullptr;
}
struct ggml_context * ctx_data = *params.ctx;
struct ggml_tensor * data = nullptr;
if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != nullptr;
if (ok) {
ggml_set_name(data, "GGUF tensor data binary blob");
}
// read the binary blob with the tensor data
ok = ok && gr.read(data->data, ctx->size);
if (!ok) {
fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
ggml_free(ctx_data);
*params.ctx = nullptr;
gguf_free(ctx);
return nullptr;
}
ctx->data = data->data;
}
ggml_set_no_alloc(ctx_data, true);
// create the tensors
for (size_t i = 0; i < ctx->info.size(); ++i) {
const struct gguf_tensor_info & info = ctx->info[i];
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, info.t.type, GGML_MAX_DIMS, info.t.ne);
ok = ok && cur != nullptr;
if (!ok) {
break;
}
ggml_set_name(cur, info.t.name);
// point the data member to the appropriate location in the binary blob using the tensor info
if (!params.no_alloc) {
cur->data = (char *) data->data + info.offset;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to create tensors\n", __func__);
ggml_free(ctx_data);
*params.ctx = nullptr;
gguf_free(ctx);
return nullptr;
}
ggml_set_no_alloc(ctx_data, params.no_alloc);
}
return ctx;
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
return nullptr;
}
struct gguf_context * result = gguf_init_from_file_impl(file, params);
fclose(file);
return result;
}
void gguf_free(struct gguf_context * ctx) {
if (ctx == nullptr) {
return;
}
delete ctx;
}
const char * gguf_type_name(enum gguf_type type) {
auto it = GGUF_TYPE_NAME.find(type);
return it == GGUF_TYPE_NAME.end() ? nullptr : it->second;
}
uint32_t gguf_get_version(const struct gguf_context * ctx) {
return ctx->version;
}
size_t gguf_get_alignment(const struct gguf_context * ctx) {
return ctx->alignment;
}
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
return ctx->offset;
}
int64_t gguf_get_n_kv(const struct gguf_context * ctx) {
return ctx->kv.size();
}
int64_t gguf_find_key(const struct gguf_context * ctx, const char * key) {
// return -1 if key not found
int64_t keyfound = -1;
const int64_t n_kv = gguf_get_n_kv(ctx);
for (int64_t i = 0; i < n_kv; ++i) {
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
keyfound = i;
break;
}
}
return keyfound;
}
const char * gguf_get_key(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].get_key().c_str();
}
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].is_array ? GGUF_TYPE_ARRAY : ctx->kv[key_id].get_type();
}
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].is_array);
return ctx->kv[key_id].get_type();
}
const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
return ctx->kv[key_id].data.data();
}
const char * gguf_get_arr_str(const struct gguf_context * ctx, int64_t key_id, size_t i) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_type() == GGUF_TYPE_STRING);
return ctx->kv[key_id].data_string[i].c_str();
}
size_t gguf_get_arr_n(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
if (ctx->kv[key_id].type == GGUF_TYPE_STRING) {
return ctx->kv[key_id].data_string.size();
}
const size_t type_size = gguf_type_size(ctx->kv[key_id].type);
GGML_ASSERT(ctx->kv[key_id].data.size() % type_size == 0);
return ctx->kv[key_id].data.size() / type_size;
}
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<uint8_t>();
}
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<int8_t>();
}
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<uint16_t>();
}
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<int16_t>();
}
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<uint32_t>();
}
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<int32_t>();
}
float gguf_get_val_f32(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<float>();
}
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<uint64_t>();
}
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<int64_t>();
}
double gguf_get_val_f64(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<double>();
}
bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<bool>();
}
const char * gguf_get_val_str(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
return ctx->kv[key_id].get_val<std::string>().c_str();
}
const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].get_ne() == 1);
GGML_ASSERT(ctx->kv[key_id].get_type() != GGUF_TYPE_STRING);
return ctx->kv[key_id].data.data();
}
int64_t gguf_get_n_tensors(const struct gguf_context * ctx) {
return ctx->info.size();
}
int64_t gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found
int64_t tensor_id = -1;
const int64_t n_tensors = gguf_get_n_tensors(ctx);
for (int64_t i = 0; i < n_tensors; ++i) {
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
tensor_id = i;
break;
}
}
return tensor_id;
}
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id) {
GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
return ctx->info[tensor_id].offset;
}
const char * gguf_get_tensor_name(const struct gguf_context * ctx, int64_t tensor_id) {
GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
return ctx->info[tensor_id].t.name;
}
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int64_t tensor_id) {
GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
return ctx->info[tensor_id].t.type;
}
size_t gguf_get_tensor_size(const struct gguf_context * ctx, int64_t tensor_id) {
GGML_ASSERT(tensor_id >= 0 && tensor_id < gguf_get_n_tensors(ctx));
return ggml_nbytes(&ctx->info[tensor_id].t);
}
int64_t gguf_remove_key(struct gguf_context * ctx, const char * key) {
const int64_t key_id = gguf_find_key(ctx, key);
if (key_id >= 0) {
ctx->kv.erase(ctx->kv.begin() + key_id);
}
return key_id;
}
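// reserved keys get extra validation: GGUF_KEY_GENERAL_ALIGNMENT must be a power-of-two u32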
template<typename T>
static void gguf_check_reserved_keys(const std::string & key, const T val) {
if (key == GGUF_KEY_GENERAL_ALIGNMENT) {
if constexpr (std::is_same<T, uint32_t>::value) {
GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
} else {
GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
}
}
}
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, val);
}
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
gguf_check_reserved_keys(key, val);
gguf_remove_key(ctx, key);
ctx->kv.emplace_back(key, std::string(val));
}
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n) {
gguf_check_reserved_keys(key, data);
gguf_remove_key(ctx, key);
const size_t nbytes = n*gguf_type_size(type);
std::vector<int8_t> tmp(nbytes);
if (!tmp.empty()) {
memcpy(tmp.data(), data, nbytes);
}
ctx->kv.emplace_back(key, tmp);
ctx->kv.back().cast(type);
}
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, size_t n) {
gguf_check_reserved_keys(key, data);
gguf_remove_key(ctx, key);
std::vector<std::string> tmp(n);
for (size_t i = 0; i < n; ++i) {
tmp[i] = data[i];
}
ctx->kv.emplace_back(key, tmp);
}
// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src) {
const int64_t n_kv = gguf_get_n_kv(src);
for (int64_t i = 0; i < n_kv; ++i) {
const struct gguf_kv & kv = src->kv[i];
if (!kv.is_array) {
switch (kv.get_type()) {
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, kv.get_key().c_str(), kv.get_val<uint8_t>()); break;
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, kv.get_key().c_str(), kv.get_val<int8_t>()); break;
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, kv.get_key().c_str(), kv.get_val<uint16_t>()); break;
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, kv.get_key().c_str(), kv.get_val<int16_t>()); break;
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, kv.get_key().c_str(), kv.get_val<uint32_t>()); break;
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, kv.get_key().c_str(), kv.get_val<int32_t>()); break;
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, kv.get_key().c_str(), kv.get_val<float>()); break;
case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, kv.get_key().c_str(), kv.get_val<uint64_t>()); break;
case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, kv.get_key().c_str(), kv.get_val<int64_t>()); break;
case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, kv.get_key().c_str(), kv.get_val<double>()); break;
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, kv.get_key().c_str(), kv.get_val<bool>()); break;
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, kv.get_key().c_str(), kv.get_val<std::string>().c_str()); break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
}
continue;
}
const size_t ne = kv.get_ne();
switch (kv.get_type()) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
case GGUF_TYPE_BOOL: {
gguf_set_arr_data(ctx, kv.get_key().c_str(), kv.get_type(), kv.data.data(), ne);
} break;
case GGUF_TYPE_STRING: {
std::vector<const char *> tmp(ne);
for (size_t j = 0; j < ne; ++j) {
tmp[j] = kv.data_string[j].c_str();
}
gguf_set_arr_str(ctx, kv.get_key().c_str(), tmp.data(), ne);
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
}
}
}
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
GGML_ASSERT(tensor);
if (gguf_find_tensor(ctx, tensor->name) != -1) {
GGML_ABORT("duplicate tensor name: %s", tensor->name);
}
struct gguf_tensor_info ti;
ti.t = *tensor;
ti.offset = ctx->info.empty() ? 0 :
ctx->info.back().offset + GGML_PAD(ggml_nbytes(&ctx->info.back().t), ctx->alignment);
ctx->info.push_back(ti);
}
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int64_t tensor_id = gguf_find_tensor(ctx, name);
if (tensor_id < 0) {
GGML_ABORT("tensor not found: %s", name);
}
struct ggml_tensor * tensor = &ctx->info[tensor_id].t;
const size_t type_size = ggml_type_size(type);
const int64_t blck_size = ggml_blck_size(type);
tensor->type = type;
GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
tensor->nb[0] = type_size;
tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
for (int i = 2; i < GGML_MAX_DIMS; i++) {
tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
}
// update offsets
const int64_t n_tensors = gguf_get_n_tensors(ctx);
for (int64_t i = tensor_id + 1; i < n_tensors; ++i) {
ctx->info[i].offset = ctx->info[i - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[i - 1].t), ctx->alignment);
}
}
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data) {
const int64_t tensor_id = gguf_find_tensor(ctx, name);
if (tensor_id < 0) {
GGML_ABORT("tensor not found: %s", name);
}
ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
}
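// typed append helpers for serializing GGUF values, KV pairs and tensor metadata into a byte buffer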
struct gguf_writer {
std::vector<int8_t> & buf;
gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}
template <typename T>
void write(const T & val) const {
for (size_t i = 0; i < sizeof(val); ++i) {
buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
}
}
void write(const std::vector<int8_t> & val) const {
buf.insert(buf.end(), val.begin(), val.end());
}
void write(const bool & val) const {
const int8_t val8 = val ? 1 : 0;
write(val8);
}
void write(const std::string & val) const {
{
const uint64_t n = val.length();
write(n);
}
for (size_t i = 0; i < val.length(); ++i) {
buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
}
}
void write(const char * val) const {
write(std::string(val));
}
void write(const enum ggml_type & val) const {
write(int32_t(val));
}
void write(const enum gguf_type & val) const {
write(int32_t(val));
}
void write(const struct gguf_kv & kv) const {
const uint64_t ne = kv.get_ne();
write(kv.get_key());
if (kv.is_array) {
write(GGUF_TYPE_ARRAY);
write(kv.get_type());
write(ne);
} else {
write(kv.get_type());
}
switch (kv.get_type()) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
write(kv.data);
} break;
case GGUF_TYPE_BOOL: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<bool>(i));
}
} break;
case GGUF_TYPE_STRING: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<std::string>(i));
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
}
}
void write_tensor_meta(const struct gguf_tensor_info & info) const {
write(info.t.name);
const uint32_t n_dims = ggml_n_dims(&info.t);
write(n_dims);
for (uint32_t j = 0; j < n_dims; ++j) {
write(info.t.ne[j]);
}
write(info.t.type);
write(info.offset);
}
void pad(const size_t alignment) const {
while (buf.size() % alignment != 0) {
const int8_t zero = 0;
write(zero);
}
}
void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) const {
GGML_ASSERT(buf.size() - offset_data == info.offset);
GGML_ASSERT(ggml_is_contiguous(&info.t));
const size_t offset = buf.size();
const size_t nbytes = ggml_nbytes(&info.t);
buf.resize(offset + nbytes);
if (info.t.buffer) {
ggml_backend_tensor_get(&info.t, buf.data() + offset, 0, nbytes);
} else {
GGML_ASSERT(info.t.data);
memcpy(buf.data() + offset, info.t.data, nbytes);
}
pad(alignment);
}
};
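// serialize the full GGUF image: header, KV pairs, tensor metadata, alignment padding
// and, unless only_meta is set, the tensor data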
void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
const struct gguf_writer gw(buf);
const int64_t n_kv = gguf_get_n_kv(ctx);
const int64_t n_tensors = gguf_get_n_tensors(ctx);
// write header
gw.write(GGUF_MAGIC[0]);
gw.write(GGUF_MAGIC[1]);
gw.write(GGUF_MAGIC[2]);
gw.write(GGUF_MAGIC[3]);
gw.write(ctx->version);
gw.write(n_tensors);
gw.write(n_kv);
// write key-value pairs
for (int64_t i = 0; i < n_kv; ++i) {
gw.write(ctx->kv[i]);
}
// write tensor info
for (int64_t i = 0; i < n_tensors; ++i) {
gw.write_tensor_meta(ctx->info[i]);
}
// we require the data section to be aligned
gw.pad(ctx->alignment);
if (only_meta) {
return;
}
const size_t offset_data = gw.buf.size();
// write tensor data
for (int64_t i = 0; i < n_tensors; ++i) {
gw.write_tensor_data(ctx->info[i], offset_data, ctx->alignment);
}
}
bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = ggml_fopen(fname, "wb");
if (!file) {
fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
return false;
}
std::vector<int8_t> buf;
gguf_write_to_buf(ctx, buf, only_meta);
const bool ok = fwrite(buf.data(), 1, buf.size(), file) == buf.size();
fclose(file);
return ok;
}
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
// only return size
std::vector<int8_t> buf;
gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
return buf.size();
}
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
std::vector<int8_t> buf;
gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
memcpy(data, buf.data(), buf.size());
}
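A minimal usage sketch of the public API above (not part of the vendored sources): the file names and the "general.comment" key are placeholders, and error handling is reduced to early returns.

#include "ggml.h"
#include "gguf.h"
#include <cstdio>

int main() {
    // read only the metadata: no ggml_context is requested, so tensor data is not loaded
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // placeholder file name
    if (!ctx) {
        return 1;
    }
    printf("version: %u, tensors: %lld, kv pairs: %lld\n",
           gguf_get_version(ctx), (long long) gguf_get_n_tensors(ctx), (long long) gguf_get_n_kv(ctx));

    // add (or overwrite) a string key and write a metadata-only copy back out
    gguf_set_val_str(ctx, "general.comment", "edited copy"); // placeholder key/value
    const bool ok = gguf_write_to_file(ctx, "model-edited.gguf", /*only_meta =*/ true);

    gguf_free(ctx);
    return ok ? 0 : 1;
}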