Unverified Commit 1deafd82 authored by Jeffrey Morgan, committed by GitHub

llama: update vendored code to commit 46e3556 (#8308)

parent 57f038ec
@@ -225,7 +225,6 @@ type Options struct {
 	Mirostat    int      `json:"mirostat,omitempty"`
 	MirostatTau float32  `json:"mirostat_tau,omitempty"`
 	MirostatEta float32  `json:"mirostat_eta,omitempty"`
-	PenalizeNewline bool `json:"penalize_newline,omitempty"`
 	Stop        []string `json:"stop,omitempty"`
 }
@@ -606,7 +605,6 @@ func DefaultOptions() Options {
 		Mirostat:    0,
 		MirostatTau: 5.0,
 		MirostatEta: 0.1,
-		PenalizeNewline: true,
 		Seed:        -1,
 		Runner: Runner{
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -935,7 +935,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
         mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
         // stride = 1, padding = 1, bias is nullptr
-        block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+        block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
         // layer norm
         // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -983,7 +983,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // block_2
         {
             // stride = 2
-            block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+            block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
             // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
             // layer norm
@@ -1044,7 +1044,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // mlp_2 ne [24, 24, 2048, 1]
         mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
         // weight ne = [3, 3, 2048, 1]
-        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+        struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
         peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
         peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
         mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
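Aside: the only change in these three hunks is the rename ggml_conv_depthwise_2d -> ggml_conv_2d_dw. Judging from the call sites, the argument order is unchanged; a sketch of the assumed signature (inferred from the calls above, not copied from ggml.h):

    // assumed signature, inferred from the call sites in this hunk
    struct ggml_tensor * ggml_conv_2d_dw(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,    // depthwise kernel, e.g. model.mm_model_peg_0_w
            struct ggml_tensor  * b,    // input, e.g. mlp_2
            int s0, int s1,             // stride
            int p0, int p1,             // padding
            int d0, int d1);            // dilation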
@@ -1262,28 +1262,28 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 #ifdef GGML_USE_CUDA
     new_clip->backend = ggml_backend_cuda_init(0);
     LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 #ifdef GGML_USE_METAL
     new_clip->backend = ggml_backend_metal_init();
     LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 #ifdef GGML_USE_CANN
     new_clip->backend = ggml_backend_cann_init(0);
     LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif
 #ifdef GGML_USE_VULKAN
     new_clip->backend = ggml_backend_vk_init(0);
     LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 #ifdef GGML_USE_SYCL
     new_clip->backend = ggml_backend_sycl_init(0);
     LOG_INF("%s: CLIP using SYCL backend\n", __func__);
 #endif
     if (!new_clip->backend) {
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -44,6 +44,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -88,7 +89,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+# if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+# endif
 #else
 #include <sys/syslimits.h>
 #endif
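The guard matters because some Windows toolchains (MinGW, for one) already define PATH_MAX in their headers, so the old unconditional #define could trigger a macro redefinition. A minimal sketch of the pattern, assuming such a toolchain:

    #include <windows.h>
    #include <limits.h>            // may or may not define PATH_MAX
    #if !defined(PATH_MAX)
    #    define PATH_MAX MAX_PATH  // define only when the headers did not
    #endif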
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
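Why a -INFINITY logit bias is a hard ban on a token: after biasing, softmax assigns it exp(-inf) = 0 probability, so it can never be sampled. A minimal sketch (illustrative only, not the llama.cpp sampler code):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // softmax over biased logits: a token biased to -INFINITY ends up with
    // exactly zero probability regardless of temperature
    std::vector<float> softmax(std::vector<float> logits) {
        float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (float & l : logits) { l = std::exp(l - mx); sum += l; }
        for (float & l : logits) { l /= sum; }
        return logits;
    }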
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
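The new common_get_builtin_chat_template above uses the standard GGUF metadata idiom: call llama_model_meta_val_str with a NULL buffer to learn the required length, then call again to fill the buffer. A hedged caller sketch (the fallback value is hypothetical, not from this commit):

    std::string tmpl = common_get_builtin_chat_template(model);
    if (tmpl.empty()) {
        tmpl = "chatml"; // hypothetical fallback, pick whatever the caller needs
    }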
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -28,7 +28,7 @@
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -53,10 +53,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -106,6 +104,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -121,6 +120,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -156,7 +156,6 @@ struct common_params_sampling {
     int32_t mirostat     = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau = 5.00f; // target entropy
     float   mirostat_eta = 0.10f; // learning rate
-    bool    penalize_nl  = false; // consider newlines as a repeatable token
     bool    ignore_eos   = false;
     bool    no_perf      = false; // disable performance metrics
     bool    timing_per_token = false;
@@ -165,6 +164,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -184,6 +184,7 @@ struct common_params_sampling {
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx = 0;  // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5;  // minimum number of draft tokens to use for speculative decoding
@@ -197,6 +198,14 @@ struct common_params_speculative {
     std::string model = ""; // draft model for speculative decoding // NOLINT
 };
 
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model     = ""; // model path // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1;   // new tokens to predict
     int32_t n_ctx     = 4096; // context size
@@ -219,11 +228,13 @@ struct common_params {
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -237,8 +248,9 @@ struct common_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder vocoder;
 
     std::string model = "";       // model path // NOLINT
     std::string model_alias = ""; // model alias // NOLINT
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
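llama_model_ptr and llama_context_ptr come from the new llama-cpp.h include; presumably they are std::unique_ptr aliases with custom deleters, along these lines (a sketch of the assumed shape, not the verbatim header):

    #include <memory>

    // assumed shape: the model is freed automatically when common_init_result
    // goes out of scope, which is why the struct now "defines object's lifetime"
    struct llama_model_deleter {
        void operator()(llama_model * model) { llama_free_model(model); }
    };
    typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;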
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -583,6 +597,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
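Moving the constants from file-scope static into an unnamed namespace is purely stylistic: both forms give internal linkage. A minimal equivalence sketch:

    static const int k_old = 1; // internal linkage, C-style spelling

    namespace {
    const int k_new = 1;        // same internal linkage, idiomatic C++
    }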
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
             size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
             hn->buffer_id = buffer_id;
             hn->offset = offset;
-            return;
         }
     }
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -92,6 +92,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
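std::wstring_convert and std::codecvt_utf8_utf16 are deprecated since C++17 (hence the clang pragma guards) but still available; the helpers centralize the UTF-8/UTF-16 conversion so backend paths can be handled as wide strings on Windows. A round-trip usage sketch:

    std::wstring wpath = utf8_to_utf16("ggml-cuda.dll"); // for the Win32 W APIs
    std::string  upath = utf16_to_utf8(wpath);           // back to UTF-8 for logging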
@@ -114,11 +134,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -140,8 +155,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -182,9 +197,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+//        register_backend(ggml_backend_blas_reg());
+// #endif
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
@@ -228,11 +243,11 @@ struct ggml_backend_registry {
         devices.push_back(device);
     }
 
-    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -240,7 +255,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -248,7 +263,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -257,16 +272,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
@@ -402,14 +417,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 // Dynamic loading
 
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -427,13 +442,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -449,57 +468,63 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_prefix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L"ggml-";
 #else
-    return "libggml-";
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return ".dll";
+    return L".dll";
 #else
-    return ".so";
+    return L".so";
 #endif
 }
 
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
+#endif
+}
+
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -509,27 +534,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -541,15 +566,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void ggml_backend_load_all() {
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -826,9 +826,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
                 GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
......
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -220,9 +220,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
 }
 
 static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
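Both branches compute the same thing; _mm256_dpbusd_avx_epi32 is merely the VEX-encoded intrinsic for CPUs with AVX-VNNI but without AVX512VL, where the EVEX-encoded _mm256_dpbusd_epi32 is unavailable. A scalar model of one 32-bit lane:

    #include <stdint.h>

    // per 32-bit lane of vpdpbusd: four unsigned bytes of a times four
    // signed bytes of b, summed into the accumulator
    static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
        for (int k = 0; k < 4; k++) {
            acc += (int32_t) a[k] * (int32_t) b[k];
        }
        return acc;
    }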
@@ -590,21 +593,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
@@ -673,72 +676,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
-
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
+
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                                vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
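The intrinsics keep the nibble trick from the removed assembly: b << 4 extracts the low 4-bit quant and b & 0xf0 the high one, each pre-scaled by 16, and vcvtq_n_f32_s32(ret, 4) divides that factor of 16 back out during the fixed-point-to-float conversion. A scalar sketch of one packed byte (illustration only):

    #include <stdint.h>

    // one packed Q4 byte holds two 4-bit weights, both extracted times 16;
    // the caller compensates with a divide-by-16 (4 fractional bits)
    static int32_t q4_byte_dot(uint8_t q, int8_t a_lo, int8_t a_hi) {
        int8_t lo = (int8_t) (q << 4);   // low nibble * 16, sign from bit 3
        int8_t hi = (int8_t) (q & 0xf0); // high nibble * 16, sign from bit 7
        return (int32_t) lo * a_lo + (int32_t) hi * a_hi;
    }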
......
/** /**
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
* *
* MIT License * MIT License
* *
......