Unverified commit 49a9c9ba, authored by Daniel Hiltgen and committed by GitHub
Browse files

GGML update to ec98e2002 (#13451)

* Revert "add support for NVIDIA Nemotron 3 Nano"

This reverts commit e7d2ae9d69421012e9a8765c06a3fdf0e45b12f3.

* GGML update to 380b4c984

Remove MaskBatchPadding as GGML_KQ_MASK_PAD is no longer present (no
padding required)

* update to c45f89d55

* ec98e2002

Solar Pro needed further adjustments; the result still needs verification

* review comments
parent 1c094038
...@@ -32,6 +32,10 @@ ...@@ -32,6 +32,10 @@
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h" #include "stb/stb_image.h"
#ifdef MTMD_INTERNAL_HEADER
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
#endif
// //
// internal logging functions // internal logging functions
// //
......
...@@ -161,8 +161,7 @@ struct mtmd_context { ...@@ -161,8 +161,7 @@ struct mtmd_context {
// string template for slice image delimiters with row/col (idefics3) // string template for slice image delimiters with row/col (idefics3)
std::string sli_img_start_tmpl; std::string sli_img_start_tmpl;
// for whisper, we pre-calculate the mel filter bank std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
whisper_preprocessor::whisper_filters w_filters;
// TODO @ngxson : add timings // TODO @ngxson : add timings
...@@ -228,7 +227,7 @@ struct mtmd_context { ...@@ -228,7 +227,7 @@ struct mtmd_context {
void init_vision() { void init_vision() {
GGML_ASSERT(ctx_v != nullptr); GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_qwen2vl(ctx_v); use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v); projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v); int minicpmv_version = clip_is_minicpmv(ctx_v);
...@@ -320,6 +319,10 @@ struct mtmd_context { ...@@ -320,6 +319,10 @@ struct mtmd_context {
img_beg = "<|image_start|>"; img_beg = "<|image_start|>";
img_end = "<|image_end|>"; img_end = "<|image_end|>";
} else if (proj == PROJECTOR_TYPE_GLM4V) {
img_beg = "<|begin_of_image|>";
img_end = "<|end_of_image|>";
} }
} }
...@@ -327,14 +330,25 @@ struct mtmd_context { ...@@ -327,14 +330,25 @@ struct mtmd_context {
GGML_ASSERT(ctx_a != nullptr); GGML_ASSERT(ctx_a != nullptr);
projector_type proj = clip_get_projector_type(ctx_a); projector_type proj = clip_get_projector_type(ctx_a);
if (clip_has_whisper_encoder(ctx_a)) {
// TODO @ngxson : check if model n_mel is 128 or 80
w_filters = whisper_precalc_filters::get_128_bins();
}
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n" LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__); " https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
// set preprocessor
switch (proj) {
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_QWEN25O:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break;
default:
GGML_ABORT("unsupported audio projector type");
}
// initialize audio preprocessor
audio_preproc->initialize();
// set special tokens
if (proj == PROJECTOR_TYPE_QWEN2A) { if (proj == PROJECTOR_TYPE_QWEN2A) {
// <|audio_bos|> ... (embeddings) ... <|audio_eos|> // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
aud_beg = "<|audio_bos|>"; aud_beg = "<|audio_bos|>";
...@@ -663,11 +677,10 @@ struct mtmd_tokenizer { ...@@ -663,11 +677,10 @@ struct mtmd_tokenizer {
} }
// preprocess audio // preprocess audio
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded std::vector<mtmd_audio_mel> mel_spec_chunks;
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
const float * samples = (const float *)bitmap->data.data(); const float * samples = (const float *)bitmap->data.data();
size_t n_samples = bitmap->data.size() / sizeof(float); size_t n_samples = bitmap->data.size() / sizeof(float);
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks); bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
if (!ok) { if (!ok) {
LOG_ERR("Unable to preprocess audio\n"); LOG_ERR("Unable to preprocess audio\n");
return 2; return 2;
...@@ -873,8 +886,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) { ...@@ -873,8 +886,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
if (!ctx->ctx_a) { if (!ctx->ctx_a) {
return -1; return -1;
} }
// for now, we assume that all audio models have the same bitrate return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
return 16000; // 16kHz
} }
// //
......
...@@ -22,6 +22,11 @@ ...@@ -22,6 +22,11 @@
* Issues related to API usage may receive lower priority support. * Issues related to API usage may receive lower priority support.
* *
* For the usage, see an example in mtmd-cli.cpp * For the usage, see an example in mtmd-cli.cpp
*
* For contributors:
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
* - Keep the API minimal, do not expose internal details unless necessary
*/ */
#ifdef LLAMA_SHARED #ifdef LLAMA_SHARED
......
...@@ -42,6 +42,7 @@ import ( ...@@ -42,6 +42,7 @@ import (
_ "github.com/ollama/ollama/llama/llama.cpp/common" _ "github.com/ollama/ollama/llama/llama.cpp/common"
_ "github.com/ollama/ollama/llama/llama.cpp/src" _ "github.com/ollama/ollama/llama/llama.cpp/src"
_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd" _ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd/models"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
) )
......
...@@ -23,10 +23,10 @@ problem. ...@@ -23,10 +23,10 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-) 8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 08681f35e..afde2f0b7 100644 index 8547ecc84..9f37ca70c 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) { if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer); buffer->iface.free_buffer(buffer);
} }
...@@ -34,7 +34,7 @@ index 08681f35e..afde2f0b7 100644 ...@@ -34,7 +34,7 @@ index 08681f35e..afde2f0b7 100644
} }
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -591,6 +590,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
free(ctx->buffers); free(ctx->buffers);
free(ctx); free(ctx);
...@@ -42,7 +42,7 @@ index 08681f35e..afde2f0b7 100644 ...@@ -42,7 +42,7 @@ index 08681f35e..afde2f0b7 100644
} }
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2106,6 +2106,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -2125,6 +2125,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer); GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size); ggml_aligned_free(buffer->context, buffer->size);
...@@ -54,7 +54,7 @@ index 08681f35e..afde2f0b7 100644 ...@@ -54,7 +54,7 @@ index 08681f35e..afde2f0b7 100644
} }
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2158,7 +2163,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { @@ -2177,7 +2182,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
}; };
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
...@@ -64,7 +64,7 @@ index 08681f35e..afde2f0b7 100644 ...@@ -64,7 +64,7 @@ index 08681f35e..afde2f0b7 100644
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 81288464c..866758782 100644 index da624c587..efc63e092 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp --- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) { @@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
...@@ -84,7 +84,7 @@ index 81288464c..866758782 100644 ...@@ -84,7 +84,7 @@ index 81288464c..866758782 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 279679a4e..5145c1e88 100644 index ab0f6fe9c..6519af435 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context { @@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
...@@ -156,10 +156,10 @@ index 18a45d2d9..89041805e 100644 ...@@ -156,10 +156,10 @@ index 18a45d2d9..89041805e 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 7449a9160..e69a1ff5f 100644 index e996d98be..84b679315 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device); ggml_sycl_set_device(ctx->device);
delete ctx; delete ctx;
...@@ -167,7 +167,7 @@ index 7449a9160..e69a1ff5f 100644 ...@@ -167,7 +167,7 @@ index 7449a9160..e69a1ff5f 100644
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -816,6 +817,7 @@ struct ggml_backend_sycl_split_buffer_context { @@ -817,6 +818,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -175,7 +175,7 @@ index 7449a9160..e69a1ff5f 100644 ...@@ -175,7 +175,7 @@ index 7449a9160..e69a1ff5f 100644
} }
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1158,6 +1160,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ @@ -1159,6 +1161,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context); ggml_sycl_host_free(buffer->context);
...@@ -184,10 +184,10 @@ index 7449a9160..e69a1ff5f 100644 ...@@ -184,10 +184,10 @@ index 7449a9160..e69a1ff5f 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c6f5809cc..c801d2fd2 100644 index 34ec09d40..120191ca0 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12271,6 +12271,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
...@@ -195,7 +195,7 @@ index c6f5809cc..c801d2fd2 100644 ...@@ -195,7 +195,7 @@ index c6f5809cc..c801d2fd2 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -12414,6 +12415,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);
......
...@@ -10,7 +10,7 @@ logs instead of throwing an error ...@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index e2cca66e4..8246a0a14 100644 index 7b01a2edf..63250cdf1 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...@@ -31,7 +31,7 @@ index e2cca66e4..8246a0a14 100644 ...@@ -31,7 +31,7 @@ index e2cca66e4..8246a0a14 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -2014,7 +2005,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false; clean_spaces = false;
} else { } else {
......
...@@ -10,7 +10,7 @@ filesystems for paths that include wide characters ...@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+) 1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3ed08a0fe..6be1470ad 100644 index 35e3aef0a..84a3796b5 100644
--- a/tools/mtmd/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -24,6 +24,19 @@ @@ -24,6 +24,19 @@
...@@ -32,8 +32,8 @@ index 3ed08a0fe..6be1470ad 100644 ...@@ -32,8 +32,8 @@ index 3ed08a0fe..6be1470ad 100644
+ +
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
enum ffn_op_type { //#define CLIP_DEBUG_FUNCTIONS
@@ -3257,7 +3270,29 @@ struct clip_model_loader { @@ -1619,7 +1632,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
...@@ -63,7 +63,7 @@ index 3ed08a0fe..6be1470ad 100644 ...@@ -63,7 +63,7 @@ index 3ed08a0fe..6be1470ad 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -3284,7 +3319,11 @@ struct clip_model_loader { @@ -1646,7 +1681,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
......
...@@ -6,7 +6,7 @@ Subject: [PATCH] solar-pro ...@@ -6,7 +6,7 @@ Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture adds support for the Solar Pro architecture
--- ---
src/CMakeLists.txt | 1 + src/CMakeLists.txt | 1 +
src/llama-arch.cpp | 21 +++++ src/llama-arch.cpp | 20 +++++
src/llama-arch.h | 3 + src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++ src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 ++ src/llama-hparams.h | 5 ++
...@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture ...@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
src/llama-model.h | 3 + src/llama-model.h | 3 +
src/models/models.h | 5 ++ src/models/models.h | 5 ++
src/models/solar.cpp | 158 +++++++++++++++++++++++++++++++++++++ src/models/solar.cpp | 158 +++++++++++++++++++++++++++++++++++++
10 files changed, 253 insertions(+), 1 deletion(-) 10 files changed, 252 insertions(+), 1 deletion(-)
create mode 100644 src/models/solar.cpp create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
...@@ -31,10 +31,10 @@ index 4192af7c0..bd44d73e7 100644 ...@@ -31,10 +31,10 @@ index 4192af7c0..bd44d73e7 100644
models/starcoder.cpp models/starcoder.cpp
models/starcoder2.cpp models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 64ad1b776..a5fe4f66c 100644 index 8caf80afc..2ce8ffec0 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -87,6 +87,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" }, { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
...@@ -42,7 +42,7 @@ index 64ad1b776..a5fe4f66c 100644 ...@@ -42,7 +42,7 @@ index 64ad1b776..a5fe4f66c 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -206,6 +207,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -208,6 +209,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" }, { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
...@@ -50,32 +50,38 @@ index 64ad1b776..a5fe4f66c 100644 ...@@ -50,32 +50,38 @@ index 64ad1b776..a5fe4f66c 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -2025,6 +2027,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -339,6 +341,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
}, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
}, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { + { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
+ LLM_ARCH_SOLAR, { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, @@ -2176,6 +2179,22 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
+ { LLM_TENSOR_OUTPUT, "output" }, return {
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, LLM_TENSOR_TOKEN_EMBD,
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, };
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + case LLM_ARCH_SOLAR:
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + return {
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + LLM_TENSOR_TOKEN_EMBD,
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + LLM_TENSOR_OUTPUT_NORM,
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + LLM_TENSOR_OUTPUT,
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + LLM_TENSOR_ATTN_NORM,
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + LLM_TENSOR_ATTN_Q,
+ { LLM_TENSOR_BSKCN_TV, "bskcn_tv" }, + LLM_TENSOR_ATTN_K,
+ }, + LLM_TENSOR_ATTN_V,
+ }, + LLM_TENSOR_ATTN_OUT,
{ + LLM_TENSOR_FFN_NORM,
LLM_ARCH_WAVTOKENIZER_DEC, + LLM_TENSOR_FFN_GATE,
{ + LLM_TENSOR_FFN_DOWN,
@@ -2710,6 +2730,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { + LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_BSKCN_TV,
+ };
default:
GGML_ABORT("unknown architecture for tensor mapping");
}
@@ -2344,6 +2363,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -84,10 +90,10 @@ index 64ad1b776..a5fe4f66c 100644 ...@@ -84,10 +90,10 @@ index 64ad1b776..a5fe4f66c 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index e11318002..ec9e3a6df 100644 index 6cbf9b1f8..14d461c76 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -89,6 +89,7 @@ enum llm_arch { @@ -91,6 +91,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID, LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
...@@ -95,7 +101,7 @@ index e11318002..ec9e3a6df 100644 ...@@ -95,7 +101,7 @@ index e11318002..ec9e3a6df 100644
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM, LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE,
@@ -210,6 +211,7 @@ enum llm_kv { @@ -212,6 +213,7 @@ enum llm_kv {
LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE, LLM_KV_ATTENTION_TEMPERATURE_SCALE,
...@@ -103,7 +109,7 @@ index e11318002..ec9e3a6df 100644 ...@@ -103,7 +109,7 @@ index e11318002..ec9e3a6df 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -462,6 +464,7 @@ enum llm_tensor { @@ -465,6 +467,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
...@@ -112,10 +118,10 @@ index e11318002..ec9e3a6df 100644 ...@@ -112,10 +118,10 @@ index e11318002..ec9e3a6df 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 8cdbaf69f..41127bf91 100644 index fe1fa4341..aabff2f06 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -161,6 +161,14 @@ uint32_t llama_hparams::n_pos_per_embd() const { @@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1; return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
} }
...@@ -131,10 +137,10 @@ index 8cdbaf69f..41127bf91 100644 ...@@ -131,10 +137,10 @@ index 8cdbaf69f..41127bf91 100644
if (il < n_layer) { if (il < n_layer) {
return swa_layers[il]; return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6eff334a5..a778fc3cf 100644 index f6e95b5d2..c6e673276 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -64,6 +64,8 @@ struct llama_hparams { @@ -65,6 +65,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...@@ -143,7 +149,7 @@ index 6eff334a5..a778fc3cf 100644 ...@@ -143,7 +149,7 @@ index 6eff334a5..a778fc3cf 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -256,6 +258,9 @@ struct llama_hparams { @@ -259,6 +261,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const; uint32_t n_pos_per_embd() const;
...@@ -154,7 +160,7 @@ index 6eff334a5..a778fc3cf 100644 ...@@ -154,7 +160,7 @@ index 6eff334a5..a778fc3cf 100644
bool has_kv(uint32_t il) const; bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87..ee303bd58 100644 index ca2ea2461..8916a6242 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -466,7 +466,7 @@ namespace GGUFMeta { @@ -466,7 +466,7 @@ namespace GGUFMeta {
...@@ -167,10 +173,10 @@ index aa3a65f87..ee303bd58 100644 ...@@ -167,10 +173,10 @@ index aa3a65f87..ee303bd58 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 04fccc979..3c503b424 100644 index ae8207ee1..00cd579e0 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1975,6 +1975,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1995,6 +1995,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -192,7 +198,7 @@ index 04fccc979..3c503b424 100644 ...@@ -192,7 +198,7 @@ index 04fccc979..3c503b424 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5401,6 +5416,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -5429,6 +5444,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -227,7 +233,7 @@ index 04fccc979..3c503b424 100644 ...@@ -227,7 +233,7 @@ index 04fccc979..3c503b424 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -7480,6 +7523,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { @@ -7534,6 +7577,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params); llm = std::make_unique<llm_build_chameleon>(*this, params);
} break; } break;
...@@ -238,7 +244,7 @@ index 04fccc979..3c503b424 100644 ...@@ -238,7 +244,7 @@ index 04fccc979..3c503b424 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -7743,6 +7790,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -7798,6 +7845,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
...@@ -247,7 +253,7 @@ index 04fccc979..3c503b424 100644 ...@@ -247,7 +253,7 @@ index 04fccc979..3c503b424 100644
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3: case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index f8342cf2c..cbf4e1bfa 100644 index c6eb95318..b378b23ec 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type { @@ -76,6 +76,7 @@ enum llm_type {
...@@ -258,7 +264,7 @@ index f8342cf2c..cbf4e1bfa 100644 ...@@ -258,7 +264,7 @@ index f8342cf2c..cbf4e1bfa 100644
LLM_TYPE_26B, LLM_TYPE_26B,
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
@@ -404,6 +405,8 @@ struct llama_layer { @@ -405,6 +406,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr; struct ggml_tensor * ffn_act_eps = nullptr;
...@@ -268,7 +274,7 @@ index f8342cf2c..cbf4e1bfa 100644 ...@@ -268,7 +274,7 @@ index f8342cf2c..cbf4e1bfa 100644
struct llama_layer_convnext convnext; struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h diff --git a/src/models/models.h b/src/models/models.h
index 6494f5450..e0aec822c 100644 index ffb36acc6..6d84a185d 100644
--- a/src/models/models.h --- a/src/models/models.h
+++ b/src/models/models.h +++ b/src/models/models.h
@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context { @@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
......
...@@ -12,7 +12,7 @@ regex ...@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8246a0a14..dfba7778b 100644 index 63250cdf1..dd86a1745 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar ...@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index c3b4e5d9d..6be552826 100644 index 2f67c74d7..acf00e2d2 100644
--- a/common/json-schema-to-grammar.cpp --- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp
@@ -310,7 +310,7 @@ private: @@ -311,7 +311,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options); friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json; std::function<json(const std::string &)> _fetch_json;
bool _dotall; bool _dotall;
......
...@@ -53,7 +53,7 @@ index b165d8bdc..f91d4faba 100644 ...@@ -53,7 +53,7 @@ index b165d8bdc..f91d4faba 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index dfba7778b..f72f321b9 100644 index dd86a1745..d63ce9c84 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
......
...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor ...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b468b115a..bb65985b4 100644 index a59b51893..53891a91f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@ @@ -15,6 +15,8 @@
...@@ -20,7 +20,7 @@ index b468b115a..bb65985b4 100644 ...@@ -20,7 +20,7 @@ index b468b115a..bb65985b4 100644
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2928,6 +2930,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2945,6 +2947,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
......
...@@ -11,10 +11,10 @@ Subject: [PATCH] graph memory reporting on failure ...@@ -11,10 +11,10 @@ Subject: [PATCH] graph memory reporting on failure
4 files changed, 40 insertions(+), 3 deletions(-) 4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd2..7ab3f0192 100644 index 78aa059dd..7fa8403b3 100644
--- a/ggml/include/ggml-alloc.h --- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n( @@ -72,6 +72,7 @@ GGML_API bool ggml_gallocr_reserve_n(
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
...@@ -23,10 +23,10 @@ index 2cb150fd2..7ab3f0192 100644 ...@@ -23,10 +23,10 @@ index 2cb150fd2..7ab3f0192 100644
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index f1b740785..c54ff98bf 100644 index 4ed5f3577..a7ebe5dcd 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -318,6 +318,7 @@ extern "C" { @@ -319,6 +319,7 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
...@@ -35,10 +35,10 @@ index f1b740785..c54ff98bf 100644 ...@@ -35,10 +35,10 @@ index f1b740785..c54ff98bf 100644
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a5995fdc2..dbfd8b5b2 100644 index 41419b617..73b39bfea 100644
--- a/ggml/src/ggml-alloc.c --- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c
@@ -494,6 +494,7 @@ struct node_alloc { @@ -485,6 +485,7 @@ struct node_alloc {
struct ggml_gallocr { struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers] ggml_backend_buffer_type_t * bufts; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers] struct vbuffer ** buffers; // [n_buffers]
...@@ -46,7 +46,7 @@ index a5995fdc2..dbfd8b5b2 100644 ...@@ -46,7 +46,7 @@ index a5995fdc2..dbfd8b5b2 100644
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers; int n_buffers;
@@ -517,6 +518,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs @@ -508,6 +509,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *)); galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL); GGML_ASSERT(galloc->buffers != NULL);
...@@ -56,7 +56,7 @@ index a5995fdc2..dbfd8b5b2 100644 ...@@ -56,7 +56,7 @@ index a5995fdc2..dbfd8b5b2 100644
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL); GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -584,6 +588,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { @@ -575,6 +579,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set); ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values); free(galloc->hash_values);
free(galloc->bufts); free(galloc->bufts);
...@@ -64,7 +64,7 @@ index a5995fdc2..dbfd8b5b2 100644 ...@@ -64,7 +64,7 @@ index a5995fdc2..dbfd8b5b2 100644
free(galloc->buffers); free(galloc->buffers);
free(galloc->buf_tallocs); free(galloc->buf_tallocs);
free(galloc->node_allocs); free(galloc->node_allocs);
@@ -899,6 +904,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c @@ -904,6 +909,8 @@ static bool ggml_gallocr_reserve_n_impl(
} }
} }
...@@ -73,18 +73,19 @@ index a5995fdc2..dbfd8b5b2 100644 ...@@ -73,18 +73,19 @@ index a5995fdc2..dbfd8b5b2 100644
// reallocate buffers if needed // reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) { for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer // if the buffer type is used multiple times, we reuse the same buffer
@@ -933,14 +940,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c @@ -940,15 +947,20 @@ static bool ggml_gallocr_reserve_n_impl(
#endif galloc->buffers[i] = NULL;
ggml_vbuffer_free(galloc->buffers[i]); } else {
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
- if (galloc->buffers[i] == NULL) { - if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) { + if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]); + galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
+ } else { + } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false; - return false;
+ galloc->buffer_sizes[i] = new_size; + galloc->buffer_sizes[i] = new_size;
+ success = false; + success = false;
}
} }
+ } else { + } else {
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]); + galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
...@@ -95,8 +96,8 @@ index a5995fdc2..dbfd8b5b2 100644 ...@@ -95,8 +96,8 @@ index a5995fdc2..dbfd8b5b2 100644
+ return success; + return success;
} }
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { void ggml_gallocr_reserve_n_size(
@@ -1095,6 +1107,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { @@ -1118,6 +1130,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_vbuffer_size(galloc->buffers[buffer_id]); return ggml_vbuffer_size(galloc->buffers[buffer_id]);
} }
...@@ -120,10 +121,10 @@ index a5995fdc2..dbfd8b5b2 100644 ...@@ -120,10 +121,10 @@ index a5995fdc2..dbfd8b5b2 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index afde2f0b7..dbf8486a0 100644 index 9f37ca70c..1459d16dd 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -1840,6 +1840,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe @@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
} }
......
...@@ -10,7 +10,7 @@ Subject: [PATCH] ggml: Export GPU UUIDs ...@@ -10,7 +10,7 @@ Subject: [PATCH] ggml: Export GPU UUIDs
3 files changed, 63 insertions(+), 6 deletions(-) 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index c54ff98bf..229bf387b 100644 index a7ebe5dcd..03557bb31 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,7 @@ extern "C" { @@ -158,6 +158,7 @@ extern "C" {
...@@ -22,7 +22,7 @@ index c54ff98bf..229bf387b 100644 ...@@ -22,7 +22,7 @@ index c54ff98bf..229bf387b 100644
size_t memory_total; size_t memory_total;
// device type // device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5145c1e88..f641c1016 100644 index 6519af435..c9d3a2b03 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) { @@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
...@@ -136,7 +136,7 @@ index 5145c1e88..f641c1016 100644 ...@@ -136,7 +136,7 @@ index 5145c1e88..f641c1016 100644
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str(); props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -4833,6 +4887,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4834,6 +4888,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
......
...@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> ...@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+) 2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index d06fa42e6..0f5712e21 100644 index 2638fe4fc..c4e905a4e 100644
--- a/tools/mtmd/mtmd.cpp --- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl { @@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
...@@ -31,10 +31,10 @@ index d06fa42e6..0f5712e21 100644 ...@@ -31,10 +31,10 @@ index d06fa42e6..0f5712e21 100644
return "<__media__>"; return "<__media__>";
} }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index b3df24c29..a6a1af3b8 100644 index 9f7e861e9..72cec1937 100644
--- a/tools/mtmd/mtmd.h --- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h
@@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk; @@ -80,6 +80,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks; typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text mtmd_input_text; typedef struct mtmd_input_text mtmd_input_text;
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc ...@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index bb65985b4..47089a62e 100644 index 53891a91f..8d4851312 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2464,7 +2464,7 @@ static bool ggml_thread_apply_priority(int32_t prio) { @@ -2479,7 +2479,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with // all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4 // n_threads > 4
......
...@@ -20,7 +20,7 @@ consistent performance. ...@@ -20,7 +20,7 @@ consistent performance.
8 files changed, 58 insertions(+), 32 deletions(-) 8 files changed, 58 insertions(+), 32 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 229bf387b..2763f2bd6 100644 index 03557bb31..93c95602d 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -98,7 +98,7 @@ extern "C" { @@ -98,7 +98,7 @@ extern "C" {
...@@ -40,8 +40,8 @@ index 229bf387b..2763f2bd6 100644 ...@@ -40,8 +40,8 @@ index 229bf387b..2763f2bd6 100644
+ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); + GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
+ +
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba986..0f5b03cef 100644 index 6792ba986..0f5b03cef 100644
--- a/ggml/src/ggml-backend-impl.h --- a/ggml/src/ggml-backend-impl.h
...@@ -58,10 +58,10 @@ index 6792ba986..0f5b03cef 100644 ...@@ -58,10 +58,10 @@ index 6792ba986..0f5b03cef 100644
// (optional) event synchronization // (optional) event synchronization
// record an event on this stream // record an event on this stream
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index dbf8486a0..312ca873c 100644 index 1459d16dd..498186a7c 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba @@ -353,14 +353,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
} }
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
...@@ -79,7 +79,7 @@ index dbf8486a0..312ca873c 100644 ...@@ -79,7 +79,7 @@ index dbf8486a0..312ca873c 100644
} }
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -722,6 +722,8 @@ struct ggml_backend_sched { @@ -727,6 +727,8 @@ struct ggml_backend_sched {
bool op_offload; bool op_offload;
...@@ -88,7 +88,7 @@ index dbf8486a0..312ca873c 100644 ...@@ -88,7 +88,7 @@ index dbf8486a0..312ca873c 100644
int debug; int debug;
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC] // used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
@@ -820,7 +822,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st @@ -825,7 +827,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op // check if a backend with higher prio wants to offload the op
...@@ -97,7 +97,7 @@ index dbf8486a0..312ca873c 100644 ...@@ -97,7 +97,7 @@ index dbf8486a0..312ca873c 100644
for (int b = 0; b < src_backend_id; b++) { for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off"); SET_CAUSE(tensor, "1.off");
@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s @@ -1577,7 +1579,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} }
if (!sched->callback_eval) { if (!sched->callback_eval) {
...@@ -106,7 +106,7 @@ index dbf8486a0..312ca873c 100644 ...@@ -106,7 +106,7 @@ index dbf8486a0..312ca873c 100644
if (ec != GGML_STATUS_SUCCESS) { if (ec != GGML_STATUS_SUCCESS) {
return ec; return ec;
} }
@@ -1594,7 +1596,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s @@ -1599,7 +1601,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
...@@ -115,7 +115,7 @@ index dbf8486a0..312ca873c 100644 ...@@ -115,7 +115,7 @@ index dbf8486a0..312ca873c 100644
if (ec != GGML_STATUS_SUCCESS) { if (ec != GGML_STATUS_SUCCESS) {
return ec; return ec;
} }
@@ -1684,6 +1686,7 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1689,6 +1691,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload; sched->op_offload = op_offload;
...@@ -123,7 +123,7 @@ index dbf8486a0..312ca873c 100644 ...@@ -123,7 +123,7 @@ index dbf8486a0..312ca873c 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
@@ -1715,6 +1718,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { @@ -1720,6 +1723,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched); free(sched);
} }
...@@ -156,7 +156,7 @@ index 5b888cdd8..88d088952 100644 ...@@ -156,7 +156,7 @@ index 5b888cdd8..88d088952 100644
static struct ggml_backend_i blas_backend_i = { static struct ggml_backend_i blas_backend_i = {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 3191faaa4..32f14c811 100644 index f4713a421..92ba577a5 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp --- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe @@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
...@@ -178,7 +178,7 @@ index 3191faaa4..32f14c811 100644 ...@@ -178,7 +178,7 @@ index 3191faaa4..32f14c811 100644
static const struct ggml_backend_i ggml_backend_cpu_i = { static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f641c1016..17062697b 100644 index c9d3a2b03..25548629d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2901,7 +2901,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { @@ -2901,7 +2901,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
...@@ -278,10 +278,10 @@ index 8fc1c2fb5..ba95b4acc 100644 ...@@ -278,10 +278,10 @@ index 8fc1c2fb5..ba95b4acc 100644
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c801d2fd2..b2c0d0cee 100644 index 120191ca0..5349bce24 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13006,7 +13006,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru @@ -13099,7 +13099,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds; return num_adds;
} }
...@@ -290,7 +290,7 @@ index c801d2fd2..b2c0d0cee 100644 ...@@ -290,7 +290,7 @@ index c801d2fd2..b2c0d0cee 100644
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -13241,6 +13241,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg @@ -13334,6 +13334,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
UNUSED(backend); UNUSED(backend);
......
...@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows ...@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 4d053895c..84bdc2777 100644 index f68829a61..2024d3d37 100644
--- a/tools/mtmd/mtmd-audio.cpp --- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
......
...@@ -10,13 +10,13 @@ must be recreated with no-alloc set to false before loading data. ...@@ -10,13 +10,13 @@ must be recreated with no-alloc set to false before loading data.
--- ---
ggml/include/ggml-backend.h | 1 + ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-backend-impl.h | 16 +++ ggml/src/ggml-backend-impl.h | 16 +++
ggml/src/ggml-backend.cpp | 72 +++++++++- ggml/src/ggml-backend.cpp | 75 ++++++++++-
ggml/src/ggml-cuda/common.cuh | 62 ++++++++- ggml/src/ggml-cuda/common.cuh | 62 ++++++++-
ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------ ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------
5 files changed, 331 insertions(+), 44 deletions(-) 5 files changed, 333 insertions(+), 45 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2763f2bd6..b3b5b356a 100644 index 93c95602d..dbbb61d9c 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -305,6 +305,7 @@ extern "C" { @@ -305,6 +305,7 @@ extern "C" {
...@@ -75,13 +75,19 @@ index 0f5b03cef..7bdf9d81f 100644 ...@@ -75,13 +75,19 @@ index 0f5b03cef..7bdf9d81f 100644
struct ggml_backend { struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 312ca873c..4092dfe8a 100644 index 498186a7c..7746e8b92 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -36,11 +36,25 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
}
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- GGML_ASSERT(buft);
if (size == 0) {
// return a dummy buffer for zero-sized allocations
return ggml_backend_buffer_init(buft, {}, NULL, 0); return ggml_backend_buffer_init(buft, {}, NULL, 0);
} }
+
+ if (buft->no_alloc) { + if (buft->no_alloc) {
+ ggml_backend_buffer_t buf; + ggml_backend_buffer_t buf;
+ +
...@@ -95,10 +101,11 @@ index 312ca873c..4092dfe8a 100644 ...@@ -95,10 +101,11 @@ index 312ca873c..4092dfe8a 100644
+ return buf; + return buf;
+ } + }
+ +
GGML_ASSERT(buft); + GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size); return buft->iface.alloc_buffer(buft, size);
} }
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft, /* .buft = */ buft,
/* .context = */ context, /* .context = */ context,
/* .size = */ size, /* .size = */ size,
...@@ -108,7 +115,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -108,7 +115,7 @@ index 312ca873c..4092dfe8a 100644
}; };
return buffer; return buffer;
@@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -126,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL; return NULL;
} }
...@@ -118,10 +125,10 @@ index 312ca873c..4092dfe8a 100644 ...@@ -118,10 +125,10 @@ index 312ca873c..4092dfe8a 100644
+ return (void *)ggml_backend_buffer_get_alignment(buffer); + return (void *)ggml_backend_buffer_get_alignment(buffer);
+ } + }
+ +
void * base = buffer->iface.get_base(buffer); // FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
// I don't know whether the above comment is correct
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); if (!buffer->iface.get_base) {
@@ -731,6 +751,12 @@ struct ggml_backend_sched { @@ -736,6 +757,12 @@ struct ggml_backend_sched {
int debug_realloc; int debug_realloc;
int debug_graph_size; int debug_graph_size;
int debug_prev_graph_size; int debug_prev_graph_size;
...@@ -134,7 +141,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -134,7 +141,7 @@ index 312ca873c..4092dfe8a 100644
}; };
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1630,6 +1656,17 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1635,6 +1662,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size, size_t graph_size,
bool parallel, bool parallel,
bool op_offload) { bool op_offload) {
...@@ -152,7 +159,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -152,7 +159,7 @@ index 312ca873c..4092dfe8a 100644
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1682,11 +1719,14 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1687,11 +1725,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device); sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
} }
} }
...@@ -167,7 +174,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -167,7 +174,7 @@ index 312ca873c..4092dfe8a 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
@@ -1701,6 +1741,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { @@ -1706,6 +1747,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]); ggml_backend_event_free(sched->events[b][c]);
} }
...@@ -178,7 +185,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -178,7 +185,7 @@ index 312ca873c..4092dfe8a 100644
} }
ggml_gallocr_free(sched->galloc); ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx); ggml_free(sched->ctx);
@@ -1746,6 +1790,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * @@ -1765,6 +1810,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false; return false;
} }
...@@ -203,7 +210,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -203,7 +210,7 @@ index 312ca873c..4092dfe8a 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
return true; return true;
@@ -1851,7 +1913,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, @@ -1870,7 +1933,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
...@@ -219,7 +226,7 @@ index 312ca873c..4092dfe8a 100644 ...@@ -219,7 +226,7 @@ index 312ca873c..4092dfe8a 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index c4529f5d9..8b0fb5d42 100644 index 9fcb2f9fd..e800ee8f6 100644
--- a/ggml/src/ggml-cuda/common.cuh --- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh
@@ -37,6 +37,41 @@ @@ -37,6 +37,41 @@
...@@ -264,7 +271,7 @@ index c4529f5d9..8b0fb5d42 100644 ...@@ -264,7 +271,7 @@ index c4529f5d9..8b0fb5d42 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -938,6 +973,9 @@ struct ggml_cuda_pool { @@ -941,6 +976,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0; virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0; virtual void free(void * ptr, size_t size) = 0;
...@@ -274,7 +281,7 @@ index c4529f5d9..8b0fb5d42 100644 ...@@ -274,7 +281,7 @@ index c4529f5d9..8b0fb5d42 100644
}; };
template<typename T> template<typename T>
@@ -1229,11 +1267,15 @@ struct ggml_backend_cuda_context { @@ -1232,11 +1270,15 @@ struct ggml_backend_cuda_context {
// pool // pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
...@@ -292,7 +299,7 @@ index c4529f5d9..8b0fb5d42 100644 ...@@ -292,7 +299,7 @@ index c4529f5d9..8b0fb5d42 100644
} }
return *pools[device][curr_stream_no]; return *pools[device][curr_stream_no];
} }
@@ -1241,6 +1283,22 @@ struct ggml_backend_cuda_context { @@ -1244,6 +1286,22 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() { ggml_cuda_pool & pool() {
return pool(device); return pool(device);
} }
...@@ -316,7 +323,7 @@ index c4529f5d9..8b0fb5d42 100644 ...@@ -316,7 +323,7 @@ index c4529f5d9..8b0fb5d42 100644
struct ggml_cuda_mm_fusion_args_host { struct ggml_cuda_mm_fusion_args_host {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 17062697b..ede1d089a 100644 index 25548629d..eeaae3fe4 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -365,6 +365,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -365,6 +365,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all ...@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-) 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 417140071..87f407f99 100644 index 8786d4ee3..9e6998272 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -999,8 +999,7 @@ int llama_context::decode(const llama_batch & batch_inp) { @@ -1051,8 +1051,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens(); const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_embd = hparams.n_embd_inp();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment