Unverified Commit 8dd12c87 authored by Jeffrey Morgan, committed by GitHub

llama: update to commit e1e8e099 (#10513)

parent e6d2d041
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
 
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
...
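The first hunk above changes llama_sampler_top_k_impl so that k <= 0 now means "keep every candidate" and returns before any sorting, where the old code clamped k to the full candidate count and still fell through to the sort. A minimal standalone sketch of the new control flow, with simplified types (the real implementation's bucket sort is omitted):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct token_data { int id; float logit; };

    // Sketch of the post-change top-k behavior: k <= 0 keeps everything and
    // skips the sort entirely; otherwise candidates are sorted by logit and
    // the list is truncated to k entries (cur_p->size = k in the real code).
    static void top_k_sketch(std::vector<token_data> & cur, int32_t k, bool & sorted) {
        if (k <= 0) {
            return; // previously: k = cur.size(), which still triggered the sort below
        }
        k = std::min(k, (int32_t) cur.size());
        if (!sorted) {
            std::sort(cur.begin(), cur.end(),
                      [](const token_data & a, const token_data & b) { return a.logit > b.logit; });
            sorted = true;
        }
        cur.resize(k);
    }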
@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 tokenizer_pre == "llama3" ||
 tokenizer_pre == "llama-v3" ||
 tokenizer_pre == "llama-bpe"||
-tokenizer_pre == "falcon3") {
+tokenizer_pre == "falcon3" ||
+tokenizer_pre == "pixtral") {
 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
 ignore_merges = true;
 add_bos = true;
...
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index a7febef7..31750b6f 100644
+index 9fb2134f..04ce764e 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 266d8af4..12886cd3 100644
+index d92392ed..425524d0 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 }
 free(ctx);
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index a0667b7d..bd83adc5 100644
+index 140a775f..e33c4ba0 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
 GGML_ASSERT(status);
 delete ctx;
@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 1de34c96..4600f61e 100644
+index 66b6f2cc..e3e6deae 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
 ggml_sycl_set_device(ctx->device);
 delete ctx;
@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
 }
 catch (sycl::exception const &exc) {
 std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
 delete ctx;
@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
 }
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 39f3cd34..c569a8a5 100644
+index c0bdb9e1..03d03064 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 ggml_vk_destroy_buffer(ctx->dev_buffer);
 delete ctx;
@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
 }
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
 ggml_vk_host_free(vk_instance.devices[0], buffer->context);
...
@@ -10,7 +10,7 @@ logs instead of throwing an error
 1 file changed, 3 insertions(+), 11 deletions(-)
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 48060517..a35b498c 100644
+index 50ded286..a9ee9f03 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 } else if (
 tokenizer_pre == "llama3" ||
-@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
 clean_spaces = false;
 } else {
...
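Together with the pixtral hunk earlier in the commit, the pre-tokenizer dispatch this patch touches behaves roughly like the sketch below: known tokenizer.ggml.pre names map to a concrete pre-tokenizer type, and unknown names now fall back to the default with a warning instead of throwing. The function shape and warning text are illustrative; the string names, the enum values, and the log-don't-throw behavior come from the diffs above.

    #include <cstdio>
    #include <string>

    enum llama_vocab_pre_type_sketch {
        LLAMA_VOCAB_PRE_TYPE_DEFAULT,
        LLAMA_VOCAB_PRE_TYPE_LLAMA3,
        // ... one value per known pre-tokenizer
    };

    // Illustrative dispatch: recognized names get a specific pre-tokenizer;
    // anything else logs and uses the default (previously a fatal error).
    static llama_vocab_pre_type_sketch pre_type_from_name(const std::string & name) {
        if (name == "llama3" || name == "llama-v3" || name == "llama-bpe" ||
            name == "falcon3" || name == "pixtral") {
            return LLAMA_VOCAB_PRE_TYPE_LLAMA3;
        }
        fprintf(stderr, "warning: unrecognized pre-tokenizer type '%s', using default\n", name.c_str());
        return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }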
@@ -11,10 +11,10 @@ instead of forcing one or the error
 1 file changed, 3 insertions(+), 3 deletions(-)
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 983385f8..32f59819 100644
+index 5a2eef9b..9c1fe93f 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 int64_t n_outputs_all = 0;
 // count outputs
@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
 for (uint32_t i = 0; i < n_tokens_all; ++i) {
 n_outputs_all += batch.logits[i] != 0;
 }
-@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 // ggml_graph_dump_dot(gf, NULL, "llama.dot");
 //}
@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
 auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
 if (t_embd && res->get_embd_pooled()) {
-@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
 const auto n_embd = hparams.n_embd;
 // TODO: use a per-batch flag for logits presence instead
...
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
 1 file changed, 39 insertions(+)
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 75970615..d57b4bd6 100644
+index ad3e7df1..b3218c78 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -29,6 +29,19 @@
+@@ -30,6 +30,19 @@
-#include <limits>
 #include <array>
+#include <numeric>
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 //#define CLIP_DEBUG_FUNCTIONS
-@@ -1430,7 +1443,29 @@ struct clip_model_loader {
+@@ -1971,7 +1984,29 @@ struct clip_model_loader {
 {
 std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
 if (!fin) {
 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
 }
-@@ -1457,7 +1492,11 @@ struct clip_model_loader {
+@@ -1998,7 +2033,11 @@ struct clip_model_loader {
 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
 }
 }
...
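The clip.cpp hunks above guard the model-file open with Windows-specific code so UTF-8 paths containing wide characters load correctly; only the hunk positions changed in this rebase. A self-contained sketch of the standard technique (the helper name is illustrative; the real patch inlines this logic into clip_model_loader):

    #include <fstream>
    #include <string>

    #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>

    // Convert a UTF-8 path to UTF-16 so the MSVC std::ifstream overload taking
    // a wide string can open names not representable in the ANSI code page.
    static std::wstring utf8_to_utf16(const std::string & s) {
        const int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), nullptr, 0);
        std::wstring w(n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
        return w;
    }
    #endif

    static std::ifstream open_model_file(const std::string & fname) {
    #if defined(_WIN32)
        return std::ifstream(utf8_to_utf16(fname), std::ios::binary);
    #else
        return std::ifstream(fname, std::ios::binary);
    #endif
    }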
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
 7 files changed, 248 insertions(+)
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 62e1480b..f754bc8f 100644
+index f2bc8ca7..5ab3f572 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_GRANITE, "granite" },
 { LLM_ARCH_GRANITE_MOE, "granitemoe" },
 { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
 { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
 { LLM_ARCH_PLM, "plm" },
 { LLM_ARCH_BAILINGMOE, "bailingmoe" },
-@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
 { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
 { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
 { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
 { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
 },
 },
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
 {
 LLM_ARCH_WAVTOKENIZER_DEC,
 {
-@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
 {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
 // this tensor is loaded for T5, but never used
 {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 98ca00a1..439aaeab 100644
+index 41a023da..525c1b7d 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -72,6 +72,7 @@ enum llm_arch {
+@@ -73,6 +73,7 @@ enum llm_arch {
 LLM_ARCH_GRANITE,
 LLM_ARCH_GRANITE_MOE,
 LLM_ARCH_CHAMELEON,
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
 LLM_ARCH_WAVTOKENIZER_DEC,
 LLM_ARCH_PLM,
 LLM_ARCH_BAILINGMOE,
-@@ -144,6 +145,7 @@ enum llm_kv {
+@@ -146,6 +147,7 @@ enum llm_kv {
 LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
 LLM_KV_ATTENTION_SLIDING_WINDOW,
 LLM_KV_ATTENTION_SCALE,
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
 LLM_KV_ATTENTION_KEY_LENGTH_MLA,
 LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-@@ -344,6 +346,7 @@ enum llm_tensor {
+@@ -346,6 +348,7 @@ enum llm_tensor {
 LLM_TENSOR_ENC_OUTPUT_NORM,
 LLM_TENSOR_CLS,
 LLM_TENSOR_CLS_OUT,
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
 if (il < n_layer) {
 return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 80fcd65d..6e278945 100644
+index 7ee6a5b7..48dce407 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
 @@ -55,6 +55,8 @@ struct llama_hparams {
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
 uint32_t n_layer_dense_lead = 0;
 uint32_t n_lora_q = 0;
 uint32_t n_lora_kv = 0;
-@@ -153,6 +155,9 @@ struct llama_hparams {
+@@ -154,6 +156,9 @@ struct llama_hparams {
 // dimension of the recurrent state embeddings
 uint32_t n_embd_v_s() const;
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
 llama_model_loader::llama_model_loader(
 const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 6b7bfecf..aba42819 100644
+index 822e2bb2..572378c9 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
 case LLM_ARCH_WAVTOKENIZER_DEC:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
 layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
 layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
 }
 };
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
 struct llm_build_wavtokenizer_dec : public llm_graph_context {
 llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 ggml_tensor * cur;
-@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
 {
 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
 } break;
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
 case LLM_ARCH_WAVTOKENIZER_DEC:
 {
 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_GRANITE:
 case LLM_ARCH_GRANITE_MOE:
 case LLM_ARCH_CHAMELEON:
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
 return LLAMA_ROPE_TYPE_NORM;
 diff --git a/src/llama-model.h b/src/llama-model.h
-index fd82d106..5865d5e9 100644
+index 95eca002..856e6042 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -62,6 +62,7 @@ enum llm_type {
+@@ -64,6 +64,7 @@ enum llm_type {
 LLM_TYPE_15B,
 LLM_TYPE_16B,
 LLM_TYPE_20B,
 + LLM_TYPE_22B,
- LLM_TYPE_27B,
 LLM_TYPE_30B,
 LLM_TYPE_32B,
- LLM_TYPE_34B,
-@@ -307,6 +308,8 @@ struct llama_layer {
+@@ -311,6 +312,8 @@ struct llama_layer {
 struct ggml_tensor * ffn_up_scale = nullptr;
 struct ggml_tensor * ffn_down_scale = nullptr;
...
This diff is collapsed.
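The Solar Pro hunks above are spread across llama-arch.cpp/.h, llama-hparams.h, and llama-model.cpp because adding an architecture in llama.cpp always follows the same registration recipe: a new enum value, a name-table entry, hyperparameter loading, tensor creation, and a graph builder. A compressed, illustrative sketch of that pattern (abbreviated names; the real tables are the ones being patched above):

    #include <cstdio>
    #include <map>

    // 1. the enum value (llama-arch.h) and its GGUF name (llama-arch.cpp)
    enum llm_arch_sketch { ARCH_CHAMELEON, ARCH_SOLAR, ARCH_UNKNOWN };

    static const std::map<llm_arch_sketch, const char *> ARCH_NAMES = {
        { ARCH_CHAMELEON, "chameleon" },
        { ARCH_SOLAR,     "solar"     }, // the entry this patch adds
        { ARCH_UNKNOWN,   "(unknown)" },
    };

    int main() {
        // 2. a loader keyed on the GGUF architecture string then dispatches to
        //    load_hparams / load_tensors / build_graph cases for the new arch,
        //    mirroring the llama-model.cpp hunks above.
        for (const auto & kv : ARCH_NAMES) {
            printf("%d -> %s\n", (int) kv.first, kv.second);
        }
        return 0;
    }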
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
 10 files changed, 223 insertions(+), 2 deletions(-)
 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index 8fcc16df..d19fc167 100644
+index 1b8603e7..53ef31b2 100644
 --- a/ggml/include/ggml.h
 +++ b/ggml/include/ggml.h
-@@ -488,6 +488,7 @@ extern "C" {
+@@ -489,6 +489,7 @@ extern "C" {
 GGML_OP_UPSCALE, // nearest interpolate
 GGML_OP_PAD,
 GGML_OP_PAD_REFLECT_1D,
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
 GGML_OP_ARANGE,
 GGML_OP_TIMESTEP_EMBEDDING,
 GGML_OP_ARGSORT,
-@@ -1757,6 +1758,15 @@ extern "C" {
+@@ -1777,6 +1778,15 @@ extern "C" {
 int p0,
 int p1);
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
 // timesteps: [N,]
 // return: [N, dim]
 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 50400328..432942bf 100644
+index 64405449..34624cca 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
+@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 {
 ggml_compute_forward_pad_reflect_1d(params, tensor);
 } break;
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
 case GGML_OP_ARANGE:
 {
 ggml_compute_forward_arange(params, tensor);
-@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 case GGML_OP_UPSCALE:
 case GGML_OP_PAD:
 case GGML_OP_PAD_REFLECT_1D:
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
 case GGML_OP_TIMESTEP_EMBEDDING:
 case GGML_OP_ARGSORT:
 diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 6050147b..66b8da68 100644
+index 7413192b..becdae07 100644
 --- a/ggml/src/ggml-cpu/ops.cpp
 +++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
+@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
 }
 }
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
 static void ggml_compute_forward_arange_f32(
 diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
-index 410a3720..3eca1cf8 100644
+index dc081b9e..a7125555 100644
 --- a/ggml/src/ggml-cpu/ops.h
 +++ b/ggml/src/ggml-cpu/ops.h
-@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
+@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
 void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 31750b6f..0fef9522 100644
+index 04ce764e..491acccb 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 case GGML_OP_PAD:
 ggml_cuda_op_pad(ctx, dst);
 break;
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
 case GGML_OP_ARANGE:
 ggml_cuda_op_arange(ctx, dst);
 break;
-@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 case GGML_OP_UPSCALE:
 return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
 case GGML_OP_PAD:
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 12886cd3..b2e95a66 100644
+index 425524d0..112abef6 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
 GGML_METAL_KERNEL_TYPE_ARANGE_F32,
 GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
 GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
+@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
-@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
 case GGML_OP_POOL_2D:
 case GGML_OP_PAD:
 case GGML_OP_PAD_REFLECT_1D:
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
 case GGML_OP_TIMESTEP_EMBEDDING:
 case GGML_OP_ARGSORT:
 case GGML_OP_LEAKY_RELU:
-@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
+@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
 const int nth = MIN(1024, ne0);
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
 } break;
 case GGML_OP_ARANGE:
 diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 8d6e99e6..71f0f97f 100644
+index 9f4147e9..6ceb3cef 100644
 --- a/ggml/src/ggml-metal/ggml-metal.metal
 +++ b/ggml/src/ggml-metal/ggml-metal.metal
 @@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
 device char * dst,
 constant ggml_metal_kargs_arange & args,
 diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index 950772c7..2276b631 100644
+index 7654ae17..3c57aff8 100644
 --- a/ggml/src/ggml.c
 +++ b/ggml/src/ggml.c
-@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 "UPSCALE",
 "PAD",
 "PAD_REFLECT_1D",
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
 "ARANGE",
 "TIMESTEP_EMBEDDING",
 "ARGSORT",
-@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 "OPT_STEP_ADAMW",
 };
--static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "none",
-@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "upscale(x)",
 "pad(x)",
 "pad_reflect_1d(x)",
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
 "arange(start, stop, step)",
 "timestep_embedding(timesteps, dim, max_period)",
 "argsort(x)",
-@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "adamw(x)",
 };
--static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
-@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
+@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
 return result;
 }
...
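GGML_OP_UNPAD, added above across the CPU, CUDA, and Metal backends, is the inverse of GGML_OP_PAD: it narrows a tensor by dropping elements from the end of each dimension rather than appending zeros. A standalone sketch of the forward computation for a row-major 2-D float case (the real kernels handle four dimensions and arbitrary strides):

    #include <vector>

    // Sketch: "unpad" an [h, w] row-major tensor to [h - p1, w - p0] by
    // copying only the kept top-left region; the CUDA/Metal kernels added by
    // this patch perform the same indexing per thread on the GPU.
    static std::vector<float> unpad_2d(const std::vector<float> & src,
                                       int w, int h, int p0, int p1) {
        const int nw = w - p0;
        const int nh = h - p1;
        std::vector<float> dst((size_t) nw * nh);
        for (int y = 0; y < nh; ++y) {
            for (int x = 0; x < nw; ++x) {
                dst[(size_t) y * nw + x] = src[(size_t) y * w + x];
            }
        }
        return dst;
    }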
@@ -12,7 +12,7 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index a35b498c..032019c9 100644
+index a9ee9f03..1306864e 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index 90679822..56043678 100644
+index 5b3059c2..656b3eca 100644
 --- a/common/json-schema-to-grammar.cpp
 +++ b/common/json-schema-to-grammar.cpp
-@@ -346,7 +346,7 @@ private:
+@@ -349,7 +349,7 @@ private:
 friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
 std::function<json(const std::string &)> _fetch_json;
 bool _dotall;
...
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
 4 files changed, 51 insertions(+), 106 deletions(-)
 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 0343ba8a..4b3e6a83 100644
+index cd06ad91..77177c5e 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
+@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
 llm_graph_result_ptr llama_context::build_kv_self_defrag(
 ggml_context * ctx0,
@@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
 #if 0
 // CPU defrag
 //
-@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
 ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
 }
 #else
@@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
 ggml_tensor * view_v_src;
 ggml_tensor * view_v_dst;
-@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
 if (cparams.flash_attn) {
 // NOTE: the V cache is not transposed when using flash attention
 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
@@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
 #endif
 return res;
-@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
 void llama_context::kv_self_update() {
 auto & kv = kv_self;
@@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
 if (kv->has_shift) {
 if (!kv->get_can_shift()) {
 GGML_ABORT("The current context does not support K-shift");
-@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
+@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
 res->set_inputs(nullptr);
 graph_compute(gf, false);
@@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
 }
 {
-@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
+@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
 // defragment the KV cache if needed
 if (kv->do_defrag) {
 LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
 }
 enum llama_pooling_type llama_context::pooling_type() const {
-@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 // find KV slot
 {
 if (!kv_self->find_slot(ubatch)) {
@@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
 if (!kv_self->recurrent) {
 diff --git a/src/llama-context.h b/src/llama-context.h
-index baa03276..a59ff8fd 100644
+index a50c4afa..30f84bfd 100644
 --- a/src/llama-context.h
 +++ b/src/llama-context.h
 @@ -5,6 +5,7 @@
@@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
 #include "ggml-cpp.h"
-@@ -180,7 +181,8 @@ private:
+@@ -179,7 +180,8 @@ private:
 llm_graph_result_ptr build_kv_self_defrag(
 ggml_context * ctx0,
...
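Per its description, the patch above reworks KV-cache defragmentation so the work is split over multiple batches of processing until everything is complete, since a single ggml graph can only hold a bounded number of copy nodes. The loop shape is roughly the following sketch (names illustrative; the real code builds a graph from kv_self move lists and calls graph_compute per pass):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct kv_move { int src; int dst; };

    // Illustrative batching loop: apply the defrag move list in chunks, one
    // built-and-computed graph per chunk, until every move has been applied.
    template <typename BuildAndCompute>
    static void defrag_in_batches(const std::vector<kv_move> & moves,
                                  size_t max_moves_per_graph,
                                  BuildAndCompute build_and_compute) {
        for (size_t i = 0; i < moves.size(); i += max_moves_per_graph) {
            const size_t n = std::min(max_moves_per_graph, moves.size() - i);
            build_and_compute(&moves[i], n); // one graph per batch
        }
    }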
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
 1 file changed, 2 insertions(+)
 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index f00700da..91d6a7d5 100644
+index 43d9fc4f..4c0d3824 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
-@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
 endforeach()
 ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644
 endfunction()
 ggml_add_backend(CPU)
-@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
 if (NOT GGML_BACKEND_DL)
 message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
 endif()
 + add_custom_target(ggml-cpu)
-ggml_add_cpu_backend_variant(sandybridge AVX)
-ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
-ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
+ggml_add_cpu_backend_variant(x64)
+ggml_add_cpu_backend_variant(sse42 SSE42)
+ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
-Date: Tue, 8 Apr 2025 20:33:01 -0700
+Date: Thu, 1 May 2025 15:05:08 -0700
 Subject: [PATCH] remove amx
 
 disable amx as it reduces performance on some systems
@@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
 1 file changed, 4 deletions(-)
 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 91d6a7d5..d6b393a2 100644
+index 4c0d3824..79c26312 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
-@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
-ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
+ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
-ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
 - if (NOT MSVC)
 - # MSVC doesn't support AMX
-- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
 - endif()
 elseif (GGML_CPU)
 ggml_add_cpu_backend_variant_impl("")
...
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
 }
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 032019c9..ba37df35 100644
+index 1306864e..d6515ff6 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
 @@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)
 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 432942bf..6d4abe4c 100644
+index 34624cca..59bd3c62 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
 @@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 ggml_compute_forward(&params, node);
...
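The hunk above only shows the insertion point: a hook placed directly after ggml_compute_forward in the CPU graph loop, so every node can be inspected once it has been computed. The body of ollama's actual helper is not shown in this view; the sketch below is only the general shape such an environment-gated hook tends to take, not the real implementation:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    struct tensor_view { const char * name; const float * data; size_t n; };

    // Hypothetical debug hook: when OLLAMA_DEBUG is set, print each computed
    // node's name and its first few values.
    static void debug_tensor_hook(const tensor_view & t) {
        static const bool enabled = std::getenv("OLLAMA_DEBUG") != nullptr;
        if (!enabled) {
            return;
        }
        std::fprintf(stderr, "%s:", t.name);
        for (size_t i = 0; i < t.n && i < 4; ++i) {
            std::fprintf(stderr, " %f", t.data[i]);
        }
        std::fprintf(stderr, "\n");
    }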
@@ -13,10 +13,10 @@ models not supported in llama.cpp
 4 files changed, 24 insertions(+)
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 0568565f..dd01df60 100644
+index eb7b5325..df42d1a5 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
 { LLM_ARCH_PLM, "plm" },
 { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -24,7 +24,7 @@ index 0568565f..dd01df60 100644
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
-@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
 { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
 },
 },
@@ -48,10 +48,10 @@ index 0568565f..dd01df60 100644
 LLM_ARCH_UNKNOWN,
 {
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 6a989034..b6227eeb 100644
+index bc8a4f0b..bda9d071 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -75,6 +75,7 @@ enum llm_arch {
+@@ -76,6 +76,7 @@ enum llm_arch {
 LLM_ARCH_CHAMELEON,
 LLM_ARCH_SOLAR,
 LLM_ARCH_WAVTOKENIZER_DEC,
@@ -60,10 +60,10 @@ index 6a989034..b6227eeb 100644
 LLM_ARCH_BAILINGMOE,
 LLM_ARCH_UNKNOWN,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index d051696c..c8374159 100644
+index 9d099f11..ef70486d 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
@@ -71,7 +71,7 @@ index d051696c..c8374159 100644
 default: throw std::runtime_error("unsupported model architecture");
 }
-@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_CHAMELEON:
 case LLM_ARCH_SOLAR:
 case LLM_ARCH_BAILINGMOE:
...
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
 const char * grammar_root,
 bool lazy,
 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index d1497985..b1a9dca3 100644
+index c0a5f934..75731053 100644
 --- a/src/llama-sampling.cpp
 +++ b/src/llama-sampling.cpp
-@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
+@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
 }
@@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644
 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
-@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
+@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
 /* .vocab = */ vocab,
 /* .grammar_str = */ grammar_str,
 /* .grammar_root = */ grammar_root,
...
@@ -133,6 +133,11 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
 
 #ifdef __cplusplus
 }
 #endif
@@ -7,7 +7,7 @@
 extern "C" {
 #endif
 
-#define RPC_PROTO_MAJOR_VERSION 1
+#define RPC_PROTO_MAJOR_VERSION 2
 #define RPC_PROTO_MINOR_VERSION 0
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16
...
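Bumping RPC_PROTO_MAJOR_VERSION from 1 to 2 above marks the RPC wire format as incompatible with older peers. A guard using these constants might look like the following sketch; the handshake message struct is illustrative, only the version macros come from the header:

    #include <cstdint>
    #include <cstdio>

    #define RPC_PROTO_MAJOR_VERSION 2

    // Illustrative compatibility check: a major-version mismatch is a hard
    // error, while minor/patch differences remain compatible.
    struct rpc_msg_hello_sketch { uint8_t major, minor, patch; };

    static bool rpc_version_compatible(const rpc_msg_hello_sketch & peer) {
        if (peer.major != RPC_PROTO_MAJOR_VERSION) {
            std::fprintf(stderr, "RPC protocol mismatch: peer %d.%d.%d, expected %d.x.x\n",
                         peer.major, peer.minor, peer.patch, RPC_PROTO_MAJOR_VERSION);
            return false;
        }
        return true;
    }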