Unverified commit 0cefd46f authored by Jeffrey Morgan, committed by GitHub

llama: update to commit de4c07f93 (#10655)

parent ad035ad5
...@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters
---
-examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
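
The clip-unicode patch works by converting the UTF-8 model path to UTF-16 before opening the file, so paths containing wide characters survive on Windows filesystems. A minimal sketch of that technique follows; it is not the exact code in tools/mtmd/clip.cpp, and the helper names are illustrative only.

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <fstream>
#include <stdexcept>
#include <string>

// sketch: convert a UTF-8 path to UTF-16 so the stream can open files whose
// names contain characters outside the current ANSI code page
static std::wstring utf8_to_wide(const std::string & s) {
    const int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, nullptr, 0);
    if (n <= 0) {
        throw std::runtime_error("MultiByteToWideChar failed");
    }
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, &w[0], n);
    w.resize(n - 1); // drop the terminating null the API writes
    return w;
}

static std::ifstream open_wide(const std::string & fname) {
    // MSVC's std::ifstream has a wide-path overload, which avoids the lossy
    // narrow-path conversion that breaks these filenames
    return std::ifstream(utf8_to_wide(fname), std::ios::binary);
}
#endif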
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index ad3e7df1..b3218c78 100644 index 41ba45a7..cdd8ca44 100644
--- a/examples/llava/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -30,6 +30,19 @@ @@ -31,6 +31,19 @@
#include <array>
#include <numeric> #include <numeric>
#include <functional>
+#if defined(_WIN32) +#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN
...@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644 ...@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
+ +
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS enum ffn_op_type {
@@ -1971,7 +1984,29 @@ struct clip_model_loader { @@ -2190,7 +2203,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
...@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644 ...@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
@@ -1998,7 +2033,11 @@ struct clip_model_loader { @@ -2217,7 +2252,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
......
...@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644 ...@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
}; };
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index ea73a8a7..a012aeae 100644 index 4cce5166..7f6617fa 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -439,6 +439,7 @@ namespace GGUFMeta { @@ -439,6 +439,7 @@ namespace GGUFMeta {
...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 ...@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 822e2bb2..572378c9 100644 index 3a4e72a3..831b68c0 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644 ...@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644 ...@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context { @@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
} }
}; };
...@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644 ...@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context { struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur; ggml_tensor * cur;
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params, gf); llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break; } break;
...@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644 ...@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
...@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644 ...@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 95eca002..856e6042 100644 index 6bdec263..43746c7d 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -64,6 +64,7 @@ enum llm_type { @@ -65,6 +65,7 @@ enum llm_type {
LLM_TYPE_15B, LLM_TYPE_15B,
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
...@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644 ...@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
@@ -311,6 +312,8 @@ struct llama_layer { @@ -315,6 +316,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr;
......
...@@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
---
-examples/llava/llava.cpp | 5 +-
-examples/llava/mtmd.cpp | 6 +-
 ggml/src/ggml-backend-reg.cpp | 6 +-
 include/llama.h | 6 +
 src/llama-arch.cpp | 44 +++++
 src/llama-arch.h | 10 ++
 src/llama-batch.cpp | 3 +
-src/llama-context.cpp | 25 ++-
+src/llama-context.cpp | 23 ++-
 src/llama-context.h | 1 +
 src/llama-cparams.h | 1 +
 src/llama-graph.cpp | 25 +++
 src/llama-graph.h | 12 ++
 src/llama-hparams.cpp | 4 +
 src/llama-hparams.h | 7 +
-src/llama-kv-cache.cpp | 12 +-
+src/llama-kv-cache.cpp | 14 +-
 src/llama-model-loader.cpp | 2 +
-src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
+src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++-
 src/llama-model.h | 12 ++
 src/llama-quant.cpp | 4 +-
+tools/mtmd/llava.cpp | 5 +-
+tools/mtmd/mtmd-helper.cpp | 7 +-
-19 files changed, 473 insertions(+), 21 deletions(-)
+19 files changed, 475 insertions(+), 22 deletions(-)
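
Much of the mllama patch threads a new n_embd field through llama_batch so that image-embedding batches can carry an embedding width that differs from the text model's hidden size. Below is a sketch of how such a batch is assembled, assuming the field layout this patch adds; the helper itself is illustrative and not part of the patch.

#include <vector>
#include "llama.h"

// sketch: decode n_tokens embedding vectors of width n_embd starting at pos_0
// on sequence 0, using the llama_batch::n_embd field introduced by this patch
static int decode_embeddings(llama_context * lctx, float * embd,
                             int32_t n_embd, int32_t n_tokens, llama_pos pos_0) {
    std::vector<llama_pos>      pos(n_tokens);
    std::vector<int32_t>        n_seq_id(n_tokens, 1);
    std::vector<int8_t>         logits(n_tokens, 0);
    llama_seq_id                seq_id_0 = 0;
    std::vector<llama_seq_id *> seq_ids(n_tokens + 1, nullptr);
    for (int32_t i = 0; i < n_tokens; ++i) {
        pos[i]     = pos_0 + i;
        seq_ids[i] = &seq_id_0;
    }
    llama_batch batch = {
        /*n_tokens =*/ n_tokens,
        /*tokens   =*/ nullptr,
        /*embd     =*/ embd,
        /*n_embd   =*/ n_embd,   // new field added by this patch
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };
    return llama_decode(lctx, batch);
}

This mirrors what the llava_embd_batch and decode_embd_batch helpers in the hunks below do once they are passed the extra n_embd argument.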
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644 index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
...@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644 ...@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
diff --git a/include/llama.h b/include/llama.h diff --git a/include/llama.h b/include/llama.h
index 06c56395..f1628e88 100644 index abedebdb..41beef21 100644
--- a/include/llama.h --- a/include/llama.h
+++ b/include/llama.h +++ b/include/llama.h
@@ -256,6 +256,7 @@ extern "C" { @@ -258,6 +258,7 @@ extern "C" {
llama_token * token; llama_token * token;
float * embd; float * embd;
...@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644 ...@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
@@ -358,6 +359,7 @@ extern "C" { @@ -365,6 +366,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
+ bool cross_attn; // whether to use cross attention + bool cross_attn; // whether to use cross attention
};
// Abort callback // model quantization parameters
// if it returns true, execution of llama_decode() will be aborted @@ -464,6 +466,10 @@ extern "C" {
@@ -459,6 +461,10 @@ extern "C" {
struct llama_context_params params), struct llama_context_params params),
"use llama_init_from_model instead"); "use llama_init_from_model instead");
...@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644 ...@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644 index a88b2fe3..241b316e 100644
--- a/src/llama-batch.cpp --- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp +++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one( @@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ tokens, /*tokens =*/ tokens,
/*embd =*/ nullptr, /*embd =*/ nullptr,
...@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644 ...@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr, /*pos =*/ nullptr,
/*n_seq_id =*/ nullptr, /*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr, /*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ @@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0, /*n_tokens =*/ 0,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ nullptr, /*embd =*/ nullptr,
...@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644 ...@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr, /*pos =*/ nullptr,
/*n_seq_id =*/ nullptr, /*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr, /*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ @@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) { if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
...@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644 ...@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
} }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9c1fe93f..cd06ad91 100644 index dca22d8b..c22687e4 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) { @@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
} }
...@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644
} catch (const std::exception & err) { } catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG #ifndef NDEBUG
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) { @@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value; cparams.warmup = value;
} }
...@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644
void llama_context::set_adapter_lora( void llama_context::set_adapter_lora(
llama_adapter_lora * adapter, llama_adapter_lora * adapter,
float scale) { float scale) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) { @@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens); const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch; const llama_batch & batch = batch_allocr.batch;
...@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644
const int64_t n_tokens_all = batch.n_tokens; const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) { @@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// make the outputs have the same order they had in the user-provided batch
const bool logits_all = n_outputs_all == n_tokens_all; // note: this is mostly relevant for recurrent models atm
if (!sorted_output) {
- sbatch.from_batch(batch, n_embd, - const uint32_t n_vocab = model.vocab.n_tokens();
+ sbatch.from_batch(batch, batch.n_embd, + const uint32_t n_vocab = model.hparams.n_vocab;
/* simple_split */ !kv_self->recurrent, const uint32_t n_embd = model.hparams.n_embd;
/* logits_all */ logits_all);
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) { GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) { int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
...@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { @@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{ {
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644
io.write(&logits_size, sizeof(logits_size)); io.write(&logits_size, sizeof(logits_size));
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() { @@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false, /*.flash_attn =*/ false,
/*.no_perf =*/ true, /*.no_perf =*/ true,
/*.op_offload =*/ true,
+ /*.cross_attn =*/ false, + /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
}; };
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
return result;
@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup); ctx->set_warmup(warmup);
} }
...@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644 ...@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644
ctx->synchronize(); ctx->synchronize();
} }
diff --git a/src/llama-context.h b/src/llama-context.h diff --git a/src/llama-context.h b/src/llama-context.h
index 5457f077..a50c4afa 100644 index c0ceacb1..c4ab242a 100644
--- a/src/llama-context.h --- a/src/llama-context.h
+++ b/src/llama-context.h +++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context { @@ -71,6 +71,7 @@ struct llama_context {
void set_embeddings (bool value); void set_embeddings (bool value);
void set_causal_attn(bool value); void set_causal_attn(bool value);
void set_warmup(bool value); void set_warmup(bool value);
...@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644 ...@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644
void set_adapter_lora( void set_adapter_lora(
llama_adapter_lora * adapter, llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 30e550f0..85ad91b9 100644 index 246fa577..7a6156ce 100644
--- a/src/llama-cparams.h --- a/src/llama-cparams.h
+++ b/src/llama-cparams.h +++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams { @@ -31,6 +31,7 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf; bool no_perf;
+ bool cross_attn;
bool warmup; bool warmup;
bool op_offload;
+ bool cross_attn;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fabb9ca2..b67216a4 100644 index b0e3f635..f14869cf 100644
--- a/src/llama-graph.cpp --- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp +++ b/src/llama-graph.cpp
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { @@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
} }
} }
...@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644 ...@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644
// //
// llm_graph_context // llm_graph_context
// //
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { @@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
} }
...@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644 ...@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644
llm_graph_input_attn_cross * inp, llm_graph_input_attn_cross * inp,
ggml_cgraph * gf, ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h diff --git a/src/llama-graph.h b/src/llama-graph.h
index d0c8d321..0fe18150 100644 index 832a8c09..5a322785 100644
--- a/src/llama-graph.h --- a/src/llama-graph.h
+++ b/src/llama-graph.h +++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public: @@ -87,6 +87,7 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
...@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644 ...@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644
}; };
class llm_graph_input_pos : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i {
@@ -283,6 +284,16 @@ public: @@ -284,6 +285,16 @@ public:
const llama_cross * cross = nullptr; const llama_cross * cross = nullptr;
}; };
...@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644 ...@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644
// //
// llm_graph_result // llm_graph_result
// //
@@ -491,6 +502,7 @@ struct llm_graph_context { @@ -495,6 +506,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const; ggml_tensor * build_inp_s_mask() const;
...@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644 ...@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644
}; };
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 7c9d46d8..69f8d35a 100644 index 3dcad65b..a7b0a7eb 100644
--- a/src/llama-kv-cache.cpp --- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init( @@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
return false; throw std::runtime_error("failed to create ggml context for kv cache");
} }
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
...@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644 ...@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644
ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i); ggml_format_name(v, "cache_v_l%d", i);
k_l.push_back(k); k_l.push_back(k);
@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() {
llama_sbatch llama_kv_cache_unified::sbatch_init(
const llama_batch & batch,
bool logits_all) {
- return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+ return llama_sbatch(batch, batch.n_embd, true, logits_all);
}
llama_ubatch llama_kv_cache_unified::ubatch_next(
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index a012aeae..2e11507d 100644 index 7f6617fa..2acfd4a8 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -315,6 +315,8 @@ namespace GGUFMeta { @@ -315,6 +315,8 @@ namespace GGUFMeta {
...@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644 ...@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str()); const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 572378c9..9d099f11 100644 index 831b68c0..e8298f56 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv // get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false); ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644 ...@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644
// everything past this point is not vocab-related // everything past this point is not vocab-related
if (hparams.vocab_only) { if (hparams.vocab_only) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644 ...@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644 ...@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr; hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644 ...@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false; hparams.use_kq_norm = false;
} }
} break; } break;
...@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644 ...@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff(); const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_embd_gqa = n_embd_v_gqa;
...@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644 ...@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644
const int64_t n_token_types = vocab.n_token_types(); const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot; const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert; const int64_t n_expert = hparams.n_expert;
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} }
} break; } break;
...@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644 ...@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context { @@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
} }
}; };
...@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644 ...@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644
+ // self attention layer + // self attention layer
+ +
+ // rope freq factors for llama3; may return nullptr for llama2 and other models + // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ +
+ // compute Q and K and RoPE them + // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
...@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644 ...@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644
struct llm_build_deci : public llm_graph_context { struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph( @@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_llama>(*this, params, gf); llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break; } break;
...@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644 ...@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
llm = std::make_unique<llm_build_deci>(*this, params, gf); llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values // use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4: case LLM_ARCH_LLAMA4:
...@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644 ...@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER: case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 856e6042..6be91282 100644 index 43746c7d..9281e629 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
...@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644 ...@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644
struct llama_cparams; struct llama_cparams;
struct llama_ubatch; struct llama_ubatch;
@@ -73,6 +74,7 @@ enum llm_type { @@ -74,6 +75,7 @@ enum llm_type {
LLM_TYPE_40B, LLM_TYPE_40B,
LLM_TYPE_65B, LLM_TYPE_65B,
LLM_TYPE_70B, LLM_TYPE_70B,
...@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644 ...@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644
LLM_TYPE_236B, LLM_TYPE_236B,
LLM_TYPE_290B, LLM_TYPE_290B,
LLM_TYPE_314B, LLM_TYPE_314B,
@@ -314,6 +316,16 @@ struct llama_layer { @@ -318,6 +320,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr; struct ggml_tensor * bskcn_tv = nullptr;
...@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644 ...@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644
struct llama_layer_convnext convnext; struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc54227..223e1f3f 100644 index 820d5128..56531980 100644
--- a/src/llama-quant.cpp --- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp +++ b/src/llama-quant.cpp
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: @@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
...@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644 ...@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644
} }
size_t total_size_org = 0; size_t total_size_org = 0;
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
index ebef8b3c..b0eb79bb 100644
--- a/tools/mtmd/llava.cpp
+++ b/tools/mtmd/llava.cpp
@@ -462,7 +462,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -474,6 +474,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 7a328867..61ebdd43 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -58,7 +58,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -69,6 +69,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -131,6 +132,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
+ /*n_embd =*/ batch.n_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk(
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
...@@ -18,7 +18,7 @@ adds the unpad operator to GGML
 10 files changed, 223 insertions(+), 2 deletions(-)
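
The unpad operator added by this patch is the inverse of GGML_OP_PAD: it drops trailing elements from each dimension instead of appending zeros. The exact ggml_unpad declaration is not visible in this excerpt; the sketch below assumes it mirrors ggml_pad with one trailing amount per dimension, so treat the call as an assumption and check ggml.h in the patched tree.

#include "ggml.h"

// sketch: pad a tensor for an op that needs the larger shape, then use the
// new unpad op to crop back to the original extent (signature assumed)
static struct ggml_tensor * pad_roundtrip(struct ggml_context * ctx, struct ggml_tensor * x) {
    struct ggml_tensor * padded  = ggml_pad  (ctx, x,      2, 0, 0, 0); // ne[0] grows by 2, zero-filled
    struct ggml_tensor * cropped = ggml_unpad(ctx, padded, 2, 0, 0, 0); // ne[0] shrinks by 2 again
    return cropped; // same shape as x
}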
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1b8603e7..53ef31b2 100644 index e91dedf1..8dc107ba 100644
--- a/ggml/include/ggml.h --- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h +++ b/ggml/include/ggml.h
@@ -489,6 +489,7 @@ extern "C" { @@ -489,6 +489,7 @@ extern "C" {
...@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644 ...@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@@ -1777,6 +1778,15 @@ extern "C" { @@ -1781,6 +1782,15 @@ extern "C" {
int p0, int p0,
int p1); int p1);
...@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644 ...@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 64405449..34624cca 100644 index a30e67f2..835e6495 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm @@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad_reflect_1d(params, tensor); ggml_compute_forward_pad_reflect_1d(params, tensor);
} break; } break;
...@@ -60,7 +60,7 @@ index 64405449..34624cca 100644 ...@@ -60,7 +60,7 @@ index 64405449..34624cca 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { @@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -69,10 +69,10 @@ index 64405449..34624cca 100644 ...@@ -69,10 +69,10 @@ index 64405449..34624cca 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7413192b..becdae07 100644 index 955fec59..1868a10c 100644
--- a/ggml/src/ggml-cpu/ops.cpp --- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d( @@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
} }
} }
...@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644 ...@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 04ce764e..491acccb 100644 index cb0d8528..6fe86674 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst); ggml_cuda_op_pad(ctx, dst);
break; break;
...@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644 ...@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst); ggml_cuda_op_arange(ctx, dst);
break; break;
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g @@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD: case GGML_OP_PAD:
...@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644 ...@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 425524d0..112abef6 100644 index 1b56f858..7641247e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
...@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644 ...@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass @@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644 ...@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex @@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644 ...@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node( @@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
...@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644 ...@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9f4147e9..6ceb3cef 100644 index 9cfddf45..080a943b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
...@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644 ...@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644
device char * dst, device char * dst,
constant ggml_metal_kargs_arange & args, constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7654ae17..3c57aff8 100644 index 8a654624..6b034d35 100644
--- a/ggml/src/ggml.c --- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c +++ b/ggml/src/ggml.c
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
...@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644 ...@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( @@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
......
...@@ -12,10 +12,10 @@ regex
 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a9ee9f03..1306864e 100644 index 806c1b3d..10f34d33 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
......
...@@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
-src/llama-context.cpp | 105 +++++++++++++----------------------------
-src/llama-context.h | 4 +-
-src/llama-kv-cache.cpp | 39 +++------------
-src/llama-kv-cache.h | 9 +++-
-4 files changed, 51 insertions(+), 106 deletions(-)
+src/llama-context.h | 1 +
+src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
+src/llama-kv-cache.h | 12 ++++-
+3 files changed, 47 insertions(+), 73 deletions(-)
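
The fix this description refers to appears later in the diff as a loop in llama_kv_cache_unified::update() that splits defrag_info.moves into chunks sized to the scheduler's node budget. A self-contained sketch of that batching idea follows; the node-cost constants and names here are illustrative, not the exact code.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct kv_move { uint32_t src, dst, len; };

// run all pending KV-cache moves in batches small enough that a single
// compute graph never exceeds max_nodes
template <typename RunBatch>
static void defrag_in_batches(const std::vector<kv_move> & moves,
                              uint32_t max_nodes, uint32_t n_layer, RunBatch run_batch) {
    // roughly 6 graph nodes per move per layer, plus ~2 nodes per layer of overhead
    const std::size_t max_moves = std::max<uint32_t>(1u, (max_nodes - 2*n_layer) / (6*n_layer));
    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
        const std::size_t end = std::min(i + max_moves, moves.size());
        run_batch(std::vector<kv_move>(moves.begin() + i, moves.begin() + end)); // one graph per chunk
    }
}

In the patch itself, max_moves is derived from lctx.graph_max_nodes() and model.hparams.n_layer, and each chunk gets its own graph via build_graph_defrag, as shown in the hunks below.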
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.h b/src/llama-context.h
index cd06ad91..77177c5e 100644 index c4ab242a..9970dfc6 100644
--- a/src/llama-context.cpp --- a/src/llama-context.h
+++ b/src/llama-context.cpp +++ b/src/llama-context.h
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( @@ -5,6 +5,7 @@
#include "llama-cparams.h"
llm_graph_result_ptr llama_context::build_kv_self_defrag( #include "llama-graph.h"
ggml_context * ctx0, #include "llama-adapter.h"
- ggml_cgraph * gf) const { +#include "llama-kv-cache.h"
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const { #include "ggml-cpp.h"
auto res = std::make_unique<llm_graph_result>(); #include "ggml-opt.h"
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a7b0a7eb..1a50c034 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() {
}
bool llama_kv_cache_unified::update(llama_context & lctx) {
- bool need_reserve = false;
-
auto * sched = lctx.get_sched();
if (has_shift) {
@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return false;
+ }
+
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
- if (defrag_prepare(lctx.graph_max_nodes())) {
ggml_backend_sched_reset(sched);
auto * gf = lctx.graph_init();
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
ggml_backend_sched_alloc_graph(sched, gf);
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
do_defrag = false;
}
const auto & hparams = model.hparams; - return need_reserve;
+ // we never need to reserve a worst case graph
+ return false;
}
- const auto & ids = kv_self->defrag_info.ids; void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
- const auto & ids = defrag_info.ids;
- -
#if 0 #if 0
// CPU defrag // CPU defrag
// //
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( @@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
} }
#else #else
...@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644 ...@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm, - n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len, + n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i)); - ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src)); + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm, - n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len, + n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id)); - ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst)); + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src; ggml_tensor * view_v_src;
ggml_tensor * view_v_dst; ggml_tensor * view_v_dst;
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( @@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) { if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention // NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_src = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm, - n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len, + n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i)); - ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src)); + ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_dst = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm, - n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len, + move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id)); - ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst)); + ggml_row_size(v_l[il]->type, move.src));
} else { } else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_src = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa, - nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa, + move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size), ggml_row_size(v_l[il]->type, size),
- ggml_row_size(kv_self->v_l[il]->type, i)); - ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src)); + ggml_row_size(v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], view_v_dst = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa, - nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa, + move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size), ggml_row_size(v_l[il]->type, size),
- ggml_row_size(kv_self->v_l[il]->type, id)); - ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst)); + ggml_row_size(v_l[il]->type, move.dst));
} }
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
} }
- -
- i += nm - 1; - i += nm - 1;
} }
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@ private: //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv); assert(n_used <= n_kv);
...@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644
// determine which KV cells to move where // determine which KV cells to move where
// //
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// //
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
// //
...@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) { for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0]; const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory? // are we moving a continuous block of memory?
bool cont = false; bool cont = false;
...@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644
cont = false; cont = false;
continue; continue;
} }
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used; head = n_used;
if (!cont) { if (!cont) {
...@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644 ...@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644
} }
nf++; nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { @@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
} }
} }
...@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644 ...@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644
return false; return false;
} }
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); - LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
- -
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer); - LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); + // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true; return true;
} }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644 index bf3b4b6a..928b9712 100644
--- a/src/llama-kv-cache.h --- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private: @@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
private:
llama_kv_cache * kv; llama_kv_cache * kv;
}; };
+
+// block of KV slots to move when defragging +// block of KV slots to move when defragging
+struct llama_kv_defrag_move { +struct llama_kv_defrag_move {
+ uint32_t src; + uint32_t src;
+ uint32_t dst; + uint32_t dst;
+ uint32_t len; + uint32_t len;
+}; +};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag
//
// llama_kv_cache_unified
@@ -207,7 +214,7 @@ private:
// defrag
struct { struct {
- std::vector<uint32_t> ids; - std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves; + std::vector<llama_kv_defrag_move> moves;
} defrag_info; } defrag_info;
// return true if cells have been moved // return true if cells have been moved
@@ -249,7 +256,8 @@ private:
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<llama_kv_defrag_move> & moves) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
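As a rough worked example of the node budget behind the move chunking in this patch (numbers are illustrative, not taken from the patch): each queued llama_kv_defrag_move expands into a handful of view/copy nodes per layer, so the full move list is split into chunks small enough that every defrag graph stays under graph_max_nodes().
    // illustrative only: how many moves fit into one defrag graph
    const uint32_t n_max_nodes = 8192;                                   // assumed graph_max_nodes() value
    const uint32_t n_layer     = 32;                                     // assumed model depth
    const uint32_t max_moves   = (n_max_nodes - 2*n_layer)/(6*n_layer);  // (8192 - 64)/192 = 42 moves per graph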
...@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants ...@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 43d9fc4f..4c0d3824 100644 index ddea5ad3..45918bf6 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) @@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
......
...@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems ...@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-) 1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 4c0d3824..79c26312 100644 index 45918bf6..0beaed86 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) @@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
......
...@@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644 ...@@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1306864e..d6515ff6 100644 index 10f34d33..b098bb25 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); - const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
if (precompiled_charsmap_keyidx != -1) { + const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN #ifdef IS_BIG_ENDIAN
...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor ...@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 34624cca..59bd3c62 100644 index 835e6495..3902894b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@ @@ -15,6 +15,8 @@
...@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644 ...@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
......
...@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644 ...@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c0a5f934..75731053 100644 index 804b11e0..15a10ca8 100644
--- a/src/llama-sampling.cpp --- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { @@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
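A minimal sketch of the two-pass sequence described in the commit message above, using the public gallocr API. The graph contents, sizes, and the ggml_scale op are illustrative assumptions; this may not reproduce the exact assertion, it only shows how a tensor's data pointer can go from non-NULL to NULL between allocations on the same allocator.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

        // first graph: the leaf tensor gets its data allocated inside the context (data != NULL)
        struct ggml_init_params p1 = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx1 = ggml_init(p1);
        struct ggml_tensor  * a    = ggml_new_tensor_1d(ctx1, GGML_TYPE_F32, 8);
        struct ggml_cgraph  * g1   = ggml_new_graph(ctx1);
        ggml_build_forward_expand(g1, ggml_scale(ctx1, a, 2.0f));
        ggml_gallocr_alloc_graph(galloc, g1);   // "a" is already backed by ctx1, so nothing is planned for it

        // second graph: same shape of graph, but now the leaf has data == NULL (no_alloc = true)
        struct ggml_init_params p2 = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
        struct ggml_context * ctx2 = ggml_init(p2);
        struct ggml_tensor  * b    = ggml_new_tensor_1d(ctx2, GGML_TYPE_F32, 8);
        struct ggml_cgraph  * g2   = ggml_new_graph(ctx2);
        ggml_build_forward_expand(g2, ggml_scale(ctx2, b, 2.0f));
        ggml_gallocr_alloc_graph(galloc, g2);   // pre-patch this revalidation could assert; now it falls back to a full reallocation

        ggml_gallocr_free(galloc);
        ggml_free(ctx2);
        ggml_free(ctx1);
        return 0;
    }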
...@@ -406,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, ...@@ -406,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
C.int(len(schedBackends)), C.int(len(schedBackends)),
C.size_t(maxGraphNodes), C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)), C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
), ),
schedBackends: schedBackends, schedBackends: schedBackends,
schedBufts: schedBufts, schedBufts: schedBufts,
......
...@@ -38,7 +38,7 @@ extern "C" { ...@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
...@@ -59,7 +59,7 @@ extern "C" { ...@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
...@@ -248,7 +248,7 @@ extern "C" { ...@@ -248,7 +248,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer // preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional) // initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size); reserve_graph = build_graph(sched, max_batch_size);
...@@ -289,7 +289,7 @@ extern "C" { ...@@ -289,7 +289,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index // Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
......
...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr; ...@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr; typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend // ggml-backend
......
...@@ -37,13 +37,16 @@ extern "C" { ...@@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ====== // ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint enum ggml_type type_data, // the type for the internal data tensor
int64_t ne_label, // number of elements per label enum ggml_type type_label, // the type for the internal labels tensor
int64_t ndata, // total number of datapoints/labels int64_t ne_datapoint, // number of elements per datapoint
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data // get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
...@@ -56,13 +59,19 @@ extern "C" { ...@@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch); int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(
ggml_opt_dataset_t dataset,
void * data_batch,
size_t nb_data_batch,
void * labels_batch,
int64_t ibatch);
// ====== Model / Context ====== // ====== Model / Context ======
enum ggml_opt_build_type { enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD, GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD, GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT, GGML_OPT_BUILD_TYPE_OPT = 30,
}; };
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
...@@ -81,20 +90,22 @@ extern "C" { ...@@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data // userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant) // returns the default optimizer params (constant, hard-coded values)
// userdata is not used // userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
// parameters for initializing a new optimization context // parameters for initializing a new optimization context
struct ggml_opt_params { struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors // by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
// the forward graph is defined by inputs and outputs struct ggml_context * ctx_compute;
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts struct ggml_tensor * inputs;
struct ggml_tensor * inputs; struct ggml_tensor * outputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type; enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type; enum ggml_opt_build_type build_type;
...@@ -107,12 +118,9 @@ extern "C" { ...@@ -107,12 +118,9 @@ extern "C" {
// get parameters for an optimization context with defaults set where possible // get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function // parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params( GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched, ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute, enum ggml_opt_loss_type loss_type);
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
...@@ -121,6 +129,7 @@ extern "C" { ...@@ -121,6 +129,7 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data // get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
...@@ -128,11 +137,12 @@ extern "C" { ...@@ -128,11 +137,12 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ====== // ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(); GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result); GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
...@@ -144,11 +154,20 @@ extern "C" { ...@@ -144,11 +154,20 @@ extern "C" {
// ====== Computation ====== // ====== Computation ======
// do forward pass, increment result if not NULL // if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_prepare_alloc(
ggml_opt_context_t opt_ctx,
struct ggml_context * ctx_compute,
struct ggml_cgraph * gf,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs);
// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
// do forward pass, increment result if not NULL, do backward pass // do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################ // ############################################################################
// ## The high-level functions start here. They do not depend on any private ## // ## The high-level functions start here. They do not depend on any private ##
...@@ -200,9 +219,9 @@ extern "C" { ...@@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset // fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit( GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
......
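A hedged sketch of the per-batch flow implied by the comments in the header diff above, for the case where static graphs are not used. opt_ctx, ctx_compute, gf, inputs, outputs and result are assumed to already exist, and the batch-copy step is elided.
    // rebuild the forward graph for this step, then allocate and evaluate it
    ggml_opt_prepare_alloc(opt_ctx, ctx_compute, gf, inputs, outputs);
    ggml_opt_alloc(opt_ctx, /*backward=*/ true);          // exactly once before ggml_opt_eval
    // ... fill ggml_opt_inputs(opt_ctx) and ggml_opt_labels(opt_ctx) with the current batch ...
    ggml_opt_eval(opt_ctx, result);                        // forward pass, accumulate into result, backward pass if allocated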
...@@ -674,11 +674,15 @@ extern "C" { ...@@ -674,11 +674,15 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
...@@ -765,7 +769,7 @@ extern "C" { ...@@ -765,7 +769,7 @@ extern "C" {
// Tensor flags // Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor); GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor); GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
// //
...@@ -935,7 +939,7 @@ extern "C" { ...@@ -935,7 +939,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back( GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
// concat a and b along dim // concat a and b along dim
// used in stable-diffusion // used in stable-diffusion
...@@ -2055,15 +2059,14 @@ extern "C" { ...@@ -2055,15 +2059,14 @@ extern "C" {
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand( GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) struct ggml_context * ctx, // context for gradient computation
struct ggml_context * ctx_compute, // context for gradient computation struct ggml_cgraph * cgraph,
struct ggml_cgraph * cgraph, struct ggml_tensor ** grad_accs);
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
// graph allocation in a context // graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
......
...@@ -214,7 +214,7 @@ add_library(ggml ...@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base) target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs) target_link_libraries(ggml PRIVATE dl)
endif() endif()
function(ggml_add_backend_library backend) function(ggml_add_backend_library backend)
......
...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { ...@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX; return SIZE_MAX;
} }
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { ...@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
...@@ -674,6 +674,8 @@ struct ggml_backend_sched { ...@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer; char * context_buffer;
size_t context_buffer_size; size_t context_buffer_size;
bool op_offload;
int debug; int debug;
}; };
...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st ...@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op // check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) { for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off"); SET_CAUSE(tensor, "1.off");
...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ...@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node); const int node_backend_id = tensor_backend_id(node);
assert(node_backend_id != -1); // all nodes should be assigned by now assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
// check if we should start a new split based on the sources of the current node // check if we should start a new split based on the sources of the current node
bool need_new_split = false; bool need_new_split = false;
...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts, ggml_backend_buffer_type_t * bufts,
int n_backends, int n_backends,
size_t graph_size, size_t graph_size,
bool parallel) { bool parallel,
bool op_offload) {
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( ...@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
} }
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
......
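A minimal hedged example of the extended scheduler constructor shown above; the backends array and n_backends are assumed to exist. Passing false for the new op_offload argument gates the check that previously always considered offloading ops with host-resident weights to a higher-priority backend.
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, /*bufts=*/ NULL, n_backends,
        /*graph_size=*/ GGML_DEFAULT_GRAPH_SIZE,
        /*parallel=*/ false,
        /*op_offload=*/ false);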
...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/ ${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1) if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif() endif()
if (NOT I8MM_ENABLED MATCHES -1) if (NOT I8MM_ENABLED MATCHES -1)
...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ...@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
if (NOT SME_ENABLED MATCHES -1) if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c) list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2") ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif() endif()
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
......