Unverified Commit 7a81daf0 authored by Jeffrey Morgan's avatar Jeffrey Morgan Committed by GitHub
Browse files

llama: update vendor code to commit ba1cb19c (#8101)

parent 60f75560
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml-cpu-traits.h"
#if defined(__gnu_linux__) #if defined(__gnu_linux__)
#include <sys/syscall.h> #include <sys/syscall.h>
...@@ -43,31 +44,65 @@ ...@@ -43,31 +44,65 @@
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX type_trais
namespace ggml::cpu::amx {
class tensor_traits : public ggml::cpu::tensor_traits {
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
size = ggml_backend_amx_desired_wsize(op);
return true;
}
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT) {
ggml_backend_amx_mul_mat(params, op);
return true;
}
return false;
}
};
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
static tensor_traits traits;
return &traits;
}
} // namespace ggml::cpu::amx
// AMX buffer interface // AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context); free(buffer->context);
} }
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context); return (void *) (buffer->context);
} }
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
memset((char *)tensor->data + offset, value, size); tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
uint8_t value, size_t offset, size_t size) {
memset((char *) tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) { if (qtype_has_amx_kernels(tensor->type)) {
GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
ggml_backend_amx_convert_weight(tensor, data, offset, size); ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else { } else {
memcpy((char *)tensor->data + offset, data, size); memcpy((char *) tensor->data + offset, data, size);
} }
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
/*
// need to figure what we need to do with buffer->extra.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size); memcpy(data, (const char *)tensor->data + offset, size);
...@@ -88,6 +123,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con ...@@ -88,6 +123,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
*/
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size); memset(buffer->context, value, buffer->size);
...@@ -96,13 +132,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ...@@ -96,13 +132,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base, /* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, /* .get_tensor = */ nullptr,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, /* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear, /* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL, /* .reset = */ nullptr,
}; };
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
...@@ -112,7 +148,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty ...@@ -112,7 +148,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
} }
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size); void * data = ggml_aligned_malloc(size);
if (data == NULL) { if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL; return NULL;
...@@ -127,14 +163,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ ...@@ -127,14 +163,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { namespace ggml::cpu::amx {
return ggml_backend_amx_get_alloc_size(tensor); class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
}
return false;
}
GGML_UNUSED(buft); ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
} if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return nullptr;
return false; }
};
} // namespace ggml::cpu::amx
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft); GGML_UNUSED(buft);
} }
...@@ -155,68 +221,26 @@ static bool ggml_amx_init() { ...@@ -155,68 +221,26 @@ static bool ggml_amx_init() {
return true; return true;
#endif #endif
} }
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ { /* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name, /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host, /* .is_host = */ nullptr,
}, },
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL, /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
}; };
if (!ggml_amx_init()) { if (!ggml_amx_init()) {
return NULL; return nullptr;
} }
return &ggml_backend_buffer_type_amx; return &ggml_backend_buffer_type_amx;
} }
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) { #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -27,20 +27,8 @@ ...@@ -27,20 +27,8 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu-impl.h" #include "ggml-cpu-impl.h"
#ifdef __cplusplus // GGML internal header
extern "C" {
#endif
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
#endif
#ifdef __cplusplus
}
#endif #endif
int LLAMA_BUILD_NUMBER = 0; int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "40c6d79fb52f995f47507fedfeaae2ac05d9b35c"; char const *LLAMA_COMMIT = "ba1cb19cdd0d92e012e0f6e009e0620f854b6afd";
char const *LLAMA_COMPILER = ""; char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = ""; char const *LLAMA_BUILD_TARGET = "";
This diff is collapsed.
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -71,6 +71,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity ...@@ -71,6 +71,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
...@@ -81,11 +82,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); ...@@ -81,11 +82,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
...@@ -112,6 +115,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons ...@@ -112,6 +115,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
#ifdef __cplusplus #ifdef __cplusplus
} }
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -1041,38 +1041,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { ...@@ -1041,38 +1041,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
return mparams; return mparams;
} }
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Unsupported cache type: " + s);
}
struct llama_context_params common_context_params_to_llama(const common_params & params) { struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params(); auto cparams = llama_context_default_params();
...@@ -1107,8 +1075,8 @@ struct llama_context_params common_context_params_to_llama(const common_params & ...@@ -1107,8 +1075,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK; cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
} }
cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_k = params.cache_type_k;
cparams.type_v = kv_cache_type_from_str(params.cache_type_v); cparams.type_v = params.cache_type_v;
return cparams; return cparams;
} }
...@@ -1134,12 +1102,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p ...@@ -1134,12 +1102,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3 #define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2 #define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts; int remaining_attempts = max_attempts;
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -63,9 +63,9 @@ using llama_tokens = std::vector<llama_token>; ...@@ -63,9 +63,9 @@ using llama_tokens = std::vector<llama_token>;
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT; extern const char * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER; extern const char * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET; extern const char * LLAMA_BUILD_TARGET;
struct common_control_vector_load_info; struct common_control_vector_load_info;
...@@ -241,7 +241,7 @@ struct common_params { ...@@ -241,7 +241,7 @@ struct common_params {
struct common_params_speculative speculative; struct common_params_speculative speculative;
std::string model = ""; // model path // NOLINT std::string model = ""; // model path // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT std::string hf_repo = ""; // HF repo // NOLINT
...@@ -312,8 +312,8 @@ struct common_params { ...@@ -312,8 +312,8 @@ struct common_params {
bool warmup = true; // warmup run bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT std::string mmproj = ""; // path to multimodal projector // NOLINT
...@@ -463,6 +463,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch ...@@ -463,6 +463,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts; return parts;
} }
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include <assert.h>
#define UNUSED GGML_UNUSED
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_0 * 2 / blck_size_interleave;
if (blck_size_interleave == 8) {
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
// Using memcpy to avoid unaligned memory accesses
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
const uint32_t xor_mask = 0x88888888;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint32_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
}
} else {
GGML_ASSERT(false);
}
return out;
}
// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x8 out;
for (int i = 0; i < 8; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_0 * 4 / blck_size_interleave;
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
return out;
}
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;
void * out_ptr = NULL;
if (nrows_interleaved == 8) {
out_ptr = (block_q4_0x8 *) dst;
}
else if (nrows_interleaved == 4) {
out_ptr = (block_q4_0x4 *) dst;
}
assert(nrows_interleaved <= 8);
block_q4_0 dst_tmp[8];
for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
for (int64_t x = 0; x < nb; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
}
if (nrows_interleaved == 8) {
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
}
return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -72,6 +72,10 @@ ...@@ -72,6 +72,10 @@
#include "ggml-vulkan.h" #include "ggml-vulkan.h"
#endif #endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_BLAS #ifdef GGML_USE_BLAS
#include "ggml-blas.h" #include "ggml-blas.h"
#endif #endif
...@@ -172,6 +176,9 @@ struct ggml_backend_registry { ...@@ -172,6 +176,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg()); register_backend(ggml_backend_vk_reg());
#endif #endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
...@@ -475,11 +482,21 @@ static std::string backend_filename_suffix() { ...@@ -475,11 +482,21 @@ static std::string backend_filename_suffix() {
#endif #endif
} }
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths // TODO: search system paths
std::vector<std::string> search_paths = { "./", get_executable_path() };
std::string file_prefix = backend_filename_prefix() + name + "-"; std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
}
int best_score = 0; int best_score = 0;
std::string best_path; std::string best_path;
...@@ -489,7 +506,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) ...@@ -489,7 +506,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if (!fs::exists(search_path)) { if (!fs::exists(search_path)) {
continue; continue;
} }
for (const auto & entry : fs::directory_iterator(search_path)) { fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string(); std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string(); std::string ext = entry.path().extension().string();
...@@ -509,6 +527,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) ...@@ -509,6 +527,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
best_score = s; best_score = s;
best_path = entry.path().string(); best_path = entry.path().string();
} }
} else {
if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
}
} }
} }
} }
...@@ -531,15 +553,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) ...@@ -531,15 +553,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
} }
void ggml_backend_load_all() { void ggml_backend_load_all() {
ggml_backend_load_best("blas", true); ggml_backend_load_all_from_path(nullptr);
ggml_backend_load_best("cann", true); }
ggml_backend_load_best("cuda", true);
ggml_backend_load_best("hip", true); void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("kompute", true); #ifdef NDEBUG
ggml_backend_load_best("metal", true); bool silent = true;
ggml_backend_load_best("rpc", true); #else
ggml_backend_load_best("sycl", true); bool silent = false;
ggml_backend_load_best("vulkan", true); #endif
ggml_backend_load_best("musa", true);
ggml_backend_load_best("cpu", true); ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
} }
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -254,6 +254,7 @@ extern "C" { ...@@ -254,6 +254,7 @@ extern "C" {
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries // Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void); GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
// //
// Backend scheduler // Backend scheduler
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -32,7 +32,20 @@ ...@@ -32,7 +32,20 @@
typedef uint16_t ggml_half; typedef uint16_t ggml_half;
typedef uint32_t ggml_half2; typedef uint32_t ggml_half2;
#define GGML_COMMON_AGGR #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
// std-c++ allow anonymous unions but some compiler warn on it
#define GGML_COMMON_AGGR_U data
// std-c++ do not allow it.
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL) #elif defined(GGML_COMMON_DECL_METAL)
...@@ -41,7 +54,8 @@ typedef uint32_t ggml_half2; ...@@ -41,7 +54,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half; typedef half ggml_half;
typedef half2 ggml_half2; typedef half2 ggml_half2;
#define GGML_COMMON_AGGR #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA) #elif defined(GGML_COMMON_DECL_CUDA)
...@@ -55,7 +69,8 @@ typedef half2 ggml_half2; ...@@ -55,7 +69,8 @@ typedef half2 ggml_half2;
typedef half ggml_half; typedef half ggml_half;
typedef half2 ggml_half2; typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP) #elif defined(GGML_COMMON_DECL_HIP)
...@@ -65,7 +80,8 @@ typedef half2 ggml_half2; ...@@ -65,7 +80,8 @@ typedef half2 ggml_half2;
typedef half ggml_half; typedef half ggml_half;
typedef half2 ggml_half2; typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL) #elif defined(GGML_COMMON_DECL_SYCL)
...@@ -75,7 +91,8 @@ typedef half2 ggml_half2; ...@@ -75,7 +91,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half; typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2; typedef sycl::half2 ggml_half2;
#define GGML_COMMON_AGGR data #define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#endif #endif
...@@ -180,9 +197,9 @@ typedef struct { ...@@ -180,9 +197,9 @@ typedef struct {
struct { struct {
ggml_half d; // delta ggml_half d; // delta
ggml_half m; // min ggml_half m; // min
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1; } block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
...@@ -201,9 +218,9 @@ typedef struct { ...@@ -201,9 +218,9 @@ typedef struct {
struct { struct {
ggml_half d; // delta ggml_half d; // delta
ggml_half m; // min ggml_half m; // min
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1; } block_q5_1;
...@@ -222,37 +239,13 @@ typedef struct { ...@@ -222,37 +239,13 @@ typedef struct {
struct { struct {
ggml_half d; // delta ggml_half d; // delta
ggml_half s; // d * sum(qs[i]) ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 ds; ggml_half2 ds;
}; } GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants int8_t qs[QK8_1]; // quants
} block_q8_1; } block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
// //
// Ternary quantization // Ternary quantization
// //
...@@ -287,9 +280,9 @@ typedef struct { ...@@ -287,9 +280,9 @@ typedef struct {
struct { struct {
ggml_half d; // super-block scale for quantized scales ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
} block_q2_K; } block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
...@@ -314,9 +307,9 @@ typedef struct { ...@@ -314,9 +307,9 @@ typedef struct {
struct { struct {
ggml_half d; // super-block scale for quantized scales ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K; } block_q4_K;
...@@ -331,9 +324,9 @@ typedef struct { ...@@ -331,9 +324,9 @@ typedef struct {
struct { struct {
ggml_half d; // super-block scale for quantized scales ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR; } GGML_COMMON_AGGR_S;
ggml_half2 dm; ggml_half2 dm;
}; } GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits uint8_t qs[QK_K/2]; // quants, low 4 bits
...@@ -444,12 +437,6 @@ typedef struct { ...@@ -444,12 +437,6 @@ typedef struct {
} block_iq4_xs; } block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL
...@@ -463,6 +450,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro ...@@ -463,6 +450,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() }; #define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL #define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL) #elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib> #include <metal_stdlib>
...@@ -505,7 +499,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128) ...@@ -505,7 +499,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END() GGML_TABLE_END()
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics //#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128) GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -26,33 +26,9 @@ ...@@ -26,33 +26,9 @@
#pragma once #pragma once
#include "ggml-cpu-traits.h"
#include "ggml.h" #include "ggml.h"
// GGML internal header // GGML internal header
#ifdef __cplusplus ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
extern "C" {
#endif
// Quantization
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
#ifdef __cplusplus
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment