Unverified Commit 1deafd82 authored by Jeffrey Morgan, committed by GitHub

llama: update vendored code to commit 46e3556 (#8308)

parent 57f038ec
@@ -225,7 +225,6 @@ type Options struct {
 	Mirostat    int      `json:"mirostat,omitempty"`
 	MirostatTau float32  `json:"mirostat_tau,omitempty"`
 	MirostatEta float32  `json:"mirostat_eta,omitempty"`
-	PenalizeNewline bool `json:"penalize_newline,omitempty"`
 	Stop        []string `json:"stop,omitempty"`
 }
@@ -606,7 +605,6 @@ func DefaultOptions() Options {
 		Mirostat:    0,
 		MirostatTau: 5.0,
 		MirostatEta: 0.1,
-		PenalizeNewline: true,
 		Seed:        -1,
 		Runner: Runner{
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -935,7 +935,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
         mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
         // stride = 1, padding = 1, bias is nullptr
-        block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+        block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
         // layer norm
         // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -983,7 +983,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // block_2
         {
             // stride = 2
-            block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+            block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
             // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
             // layer norm
@@ -1044,7 +1044,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // mlp_2 ne [24, 24, 2048, 1]
         mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
         // weight ne = [3, 3, 2048, 1]
-        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+        struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
         peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
         peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
         mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
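Aside: the only change in these three hunks is the rename ggml_conv_depthwise_2d -> ggml_conv_2d_dw. Judging from the call sites, the argument order is unchanged; a sketch of the assumed signature (inferred from the calls above, not copied from ggml.h):

    // assumed signature, inferred from the call sites in this hunk
    struct ggml_tensor * ggml_conv_2d_dw(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,    // depthwise kernel, e.g. model.mm_model_peg_0_w
            struct ggml_tensor  * b,    // input, e.g. mlp_2
            int s0, int s1,             // stride
            int p0, int p1,             // padding
            int d0, int d1);            // dilation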
@@ -1262,28 +1262,28 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 #ifdef GGML_USE_CUDA
     new_clip->backend = ggml_backend_cuda_init(0);
     LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 #ifdef GGML_USE_METAL
     new_clip->backend = ggml_backend_metal_init();
     LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 #ifdef GGML_USE_CANN
     new_clip->backend = ggml_backend_cann_init(0);
     LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif
 #ifdef GGML_USE_VULKAN
     new_clip->backend = ggml_backend_vk_init(0);
     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 #ifdef GGML_USE_SYCL
     new_clip->backend = ggml_backend_sycl_init(0);
     LOG_INF("%s: CLIP using SYCL backend\n", __func__);
 #endif
     if (!new_clip->backend) {
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -44,6 +44,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -88,7 +89,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+# if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+# endif
 #else
 #include <sys/syslimits.h>
 #endif
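The guard matters because some Windows toolchains (MinGW, for one) already define PATH_MAX in their headers, so the old unconditional #define could trigger a macro redefinition. A minimal sketch of the pattern, assuming such a toolchain:

    #include <windows.h>
    #include <limits.h>            // may or may not define PATH_MAX
    #if !defined(PATH_MAX)
    #    define PATH_MAX MAX_PATH  // define only when the headers did not
    #endif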
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
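Why a -INFINITY logit bias is a hard ban on a token: after biasing, softmax assigns it exp(-inf) = 0 probability, so it can never be sampled. A minimal sketch (illustrative only, not the llama.cpp sampler code):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // softmax over biased logits: a token biased to -INFINITY ends up with
    // exactly zero probability regardless of temperature
    std::vector<float> softmax(std::vector<float> logits) {
        float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (float & l : logits) { l = std::exp(l - mx); sum += l; }
        for (float & l : logits) { l /= sum; }
        return logits;
    }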
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
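The new common_get_builtin_chat_template above uses the standard GGUF metadata idiom: call llama_model_meta_val_str with a NULL buffer to learn the required length, then call again to fill the buffer. A hedged caller sketch (the fallback value is hypothetical, not from this commit):

    std::string tmpl = common_get_builtin_chat_template(model);
    if (tmpl.empty()) {
        tmpl = "chatml"; // hypothetical fallback, pick whatever the caller needs
    }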
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -28,7 +28,7 @@
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -53,10 +53,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -106,6 +104,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -121,6 +120,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -156,7 +156,6 @@ struct common_params_sampling {
     int32_t mirostat     = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau = 5.00f; // target entropy
     float   mirostat_eta = 0.10f; // learning rate
-    bool    penalize_nl  = false; // consider newlines as a repeatable token
     bool    ignore_eos   = false;
     bool    no_perf      = false; // disable performance metrics
     bool    timing_per_token = false;
@@ -165,6 +164,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -184,6 +184,7 @@ struct common_params_sampling {
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx = 0;  // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5;  // minimum number of draft tokens to use for speculative decoding
@@ -197,6 +198,14 @@ struct common_params_speculative {
     std::string model = ""; // draft model for speculative decoding // NOLINT
 };
 
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model     = ""; // model path // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1;   // new tokens to predict
     int32_t n_ctx     = 4096; // context size
@@ -219,11 +228,13 @@ struct common_params {
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -237,8 +248,9 @@ struct common_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder vocoder;
 
     std::string model = "";       // model path // NOLINT
     std::string model_alias = ""; // model alias // NOLINT
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
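llama_model_ptr and llama_context_ptr come from the new llama-cpp.h include; presumably they are std::unique_ptr aliases with custom deleters, along these lines (a sketch of the assumed shape, not the verbatim header):

    #include <memory>

    // assumed shape: the model is freed automatically when common_init_result
    // goes out of scope, which is why the struct now "defines object's lifetime"
    struct llama_model_deleter {
        void operator()(llama_model * model) { llama_free_model(model); }
    };
    typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;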
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -583,6 +597,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
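Moving the constants from file-scope static into an unnamed namespace is purely stylistic: both forms give internal linkage. A minimal equivalence sketch:

    static const int k_old = 1; // internal linkage, C-style spelling

    namespace {
    const int k_new = 1;        // same internal linkage, idiomatic C++
    }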
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
             size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
             hn->buffer_id = buffer_id;
             hn->offset = offset;
-            return;
         }
     }
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -92,6 +92,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
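std::wstring_convert and std::codecvt_utf8_utf16 are deprecated since C++17 (hence the clang pragma guards) but still available; the helpers centralize the UTF-8/UTF-16 conversion so backend paths can be handled as wide strings on Windows. A round-trip usage sketch:

    std::wstring wpath = utf8_to_utf16("ggml-cuda.dll"); // for the Win32 W APIs
    std::string  upath = utf16_to_utf8(wpath);           // back to UTF-8 for logging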
@@ -114,11 +134,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -140,8 +155,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -182,9 +197,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+//        register_backend(ggml_backend_blas_reg());
+// #endif
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
@@ -228,11 +243,11 @@ struct ggml_backend_registry {
         devices.push_back(device);
     }
 
-    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -240,7 +255,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -248,7 +263,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -257,16 +272,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
@@ -402,14 +417,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 // Dynamic loading
 
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -427,13 +442,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -449,57 +468,63 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_prefix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L"ggml-";
 #else
-    return "libggml-";
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return ".dll";
+    return L".dll";
 #else
-    return ".so";
+    return L".so";
 #endif
 }
 
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
+#endif
+}
+
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -509,27 +534,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -541,15 +566,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void ggml_backend_load_all() {
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -826,9 +826,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
                 GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -220,9 +220,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
 }
 
 static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
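Both branches compute the same thing; _mm256_dpbusd_avx_epi32 is merely the VEX-encoded intrinsic for CPUs with AVX-VNNI but without AVX512VL, where the EVEX-encoded _mm256_dpbusd_epi32 is unavailable. A scalar model of one 32-bit lane:

    #include <stdint.h>

    // per 32-bit lane of vpdpbusd: four unsigned bytes of a times four
    // signed bytes of b, summed into the accumulator
    static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
        for (int k = 0; k < 4; k++) {
            acc += (int32_t) a[k] * (int32_t) b[k];
        }
        return acc;
    }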
@@ -590,21 +593,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
@@ -673,72 +676,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
+
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                                vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
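The intrinsics keep the nibble trick from the removed assembly: b << 4 extracts the low 4-bit quant and b & 0xf0 the high one, each pre-scaled by 16, and vcvtq_n_f32_s32(ret, 4) divides that factor of 16 back out during the fixed-point-to-float conversion. A scalar sketch of one packed byte (illustration only):

    #include <stdint.h>

    // one packed Q4 byte holds two 4-bit weights, both extracted times 16;
    // the caller compensates with a divide-by-16 (4 fractional bits)
    static int32_t q4_byte_dot(uint8_t q, int8_t a_lo, int8_t a_hi) {
        int8_t lo = (int8_t) (q << 4);   // low nibble * 16, sign from bit 3
        int8_t hi = (int8_t) (q & 0xf0); // high nibble * 16, sign from bit 7
        return (int32_t) lo * a_lo + (int32_t) hi * a_hi;
    }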
......
/** /**
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
* *
* MIT License * MIT License
* *
......