OpenDAS / ollama / Commits / 0cefd46f

llama: update to commit de4c07f93 (#10655)

Commit 0cefd46f (unverified), authored May 12, 2025 by Jeffrey Morgan; committed by GitHub on May 12, 2025
Parent: ad035ad5
Showing 20 changed files with 447 additions and 480 deletions:
llama/patches/0004-clip-unicode.patch (+10, -10)
llama/patches/0005-solar-pro.patch (+10, -10)
llama/patches/0006-add-mllama-support.patch (+160, -143)
llama/patches/0007-add-unpad-operator.patch (+19, -19)
llama/patches/0008-fix-deepseek-deseret-regex.patch (+2, -2)
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch (+144, -190)
llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch (+1, -1)
llama/patches/0013-remove-amx.patch (+1, -1)
llama/patches/0014-fix-string-arr-kv-loading.patch (+6, -6)
llama/patches/0015-ollama-debug-tensor.patch (+2, -2)
llama/patches/0016-add-ollama-vocab-for-grammar-support.patch (+1, -1)
llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch (+0, -38)
ml/backend/ggml/ggml.go (+1, -0)
ml/backend/ggml/ggml/include/ggml-backend.h (+4, -4)
ml/backend/ggml/ggml/include/ggml-cpp.h (+1, -1)
ml/backend/ggml/ggml/include/ggml-opt.h (+47, -28)
ml/backend/ggml/ggml/include/ggml.h (+10, -7)
ml/backend/ggml/ggml/src/CMakeLists.txt (+1, -1)
ml/backend/ggml/ggml/src/ggml-backend.cpp (+9, -5)
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt (+18, -11)
llama/patches/0004-clip-unicode.patch
...
@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
...
@@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows
fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters
filesystems for paths that include wide characters
---
---
examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index ad3e7df1..b3218c78 100644
index 41ba45a7..cdd8ca44 100644
--- a/examples/llava/clip.cpp
--- a/tools/mtmd/clip.cpp
+++ b/examples/llava/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -30,6 +30,19 @@
@@ -31,6 +31,19 @@
#include <array>
#include <numeric>
#include <numeric>
#include <functional>
+#if defined(_WIN32)
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
...
@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
...
@@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644
+
+
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
enum ffn_op_type {
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
@@ -2190,7 +2203,29 @@ struct clip_model_loader {
{
{
std::vector<uint8_t> read_buf;
std::vector<uint8_t> read_buf;
...
@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
...
@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
if (!fin) {
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
}
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
@@ -2217,7 +2252,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
}
}
...
...
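Aside on the clip-unicode patch above: std::ifstream on Windows interprets narrow paths using the ANSI code page, so model paths containing characters outside it fail to open; the patch converts the UTF-8 path to UTF-16 before opening. A minimal sketch of that general technique, using illustrative helper names rather than the patch's actual code:

#include <fstream>
#include <string>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

// Convert a UTF-8 path to UTF-16 so the MSVC wide-path ifstream overload can open it.
static std::wstring utf8_to_wide(const std::string & s) {
    int n = MultiByteToWideChar(CP_UTF8, 0, s.data(), (int) s.size(), nullptr, 0);
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.data(), (int) s.size(), &w[0], n);
    return w;
}
#endif

static std::ifstream open_model_file(const std::string & fname) {
#if defined(_WIN32)
    return std::ifstream(utf8_to_wide(fname), std::ios::binary); // wide-path overload is MSVC-specific
#else
    return std::ifstream(fname, std::ios::binary);
#endif
}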
llama/patches/0005-solar-pro.patch
...
@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
...
@@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644
};
};
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index ea73a8a7..a012aeae 100644
index 4cce5166..7f6617fa 100644
--- a/src/llama-model-loader.cpp
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -439,6 +439,7 @@
namespace GGUFMeta {
@@ -439,6 +439,7 @@
namespace GGUFMeta {
...
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
...
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
llama_model_loader::llama_model_loader(
const std::string & fname,
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 822e2bb2..572378c9 100644
index 3a4e72a3..831b68c0 100644
--- a/src/llama-model.cpp
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
default: type = LLM_TYPE_UNKNOWN;
}
}
} break;
} break;
...
@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
...
@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...
@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
...
@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
}
};
};
...
@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
...
@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * cur;
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
} break;
...
@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
...
@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_CHAMELEON:
...
@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
...
@@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644
return LLAMA_ROPE_TYPE_NORM;
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
diff --git a/src/llama-model.h b/src/llama-model.h
index 95eca002..856e6042 100644
index 6bdec263..43746c7d 100644
--- a/src/llama-model.h
--- a/src/llama-model.h
+++ b/src/llama-model.h
+++ b/src/llama-model.h
@@ -64,6 +64,7 @@ enum llm_type {
@@ -65,6 +65,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_20B,
...
@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
...
@@ -407,7 +407,7 @@ index 95eca002..856e6042 100644
LLM_TYPE_27B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_32B,
@@ -311,6 +312,8 @@ struct llama_layer {
@@ -315,6 +316,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
...
...
llama/patches/0006-add-mllama-support.patch
...
@@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support
...
@@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
adds support for the llama 3.2 vision architecture
---
---
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 25 ++-
src/llama-context.cpp | 23 ++-
src/llama-context.h | 1 +
src/llama-context.h | 1 +
src/llama-cparams.h | 1 +
src/llama-cparams.h | 1 +
src/llama-graph.cpp | 25 +++
src/llama-graph.cpp | 25 +++
src/llama-graph.h | 12 ++
src/llama-graph.h | 12 ++
src/llama-hparams.cpp | 4 +
src/llama-hparams.cpp | 4 +
src/llama-hparams.h | 7 +
src/llama-hparams.h | 7 +
src/llama-kv-cache.cpp | 12 +-
src/llama-kv-cache.cpp | 14 +-
src/llama-model-loader.cpp | 2 +
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
src/llama-quant.cpp | 4 +-
19 files changed, 473 insertions(+), 21 deletions(-)
tools/mtmd/llava.cpp | 5 +-
tools/mtmd/mtmd-helper.cpp | 7 +-
19 files changed, 475 insertions(+), 22 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -457,7 +457,7 @@
struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -469,6 +469,7 @@
struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -492,7 +493,7 @@
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -476,7 +476,7 @@
struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -487,6 +487,7 @@
struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -610,7 +611,8 @@
int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
--- a/ggml/src/ggml-backend-reg.cpp
...
@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644
...
@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg());
register_backend(ggml_backend_rpc_reg());
#endif
#endif
diff --git a/include/llama.h b/include/llama.h
diff --git a/include/llama.h b/include/llama.h
index 06c56395..f1628e88 100644
index abedebdb..41beef21 100644
--- a/include/llama.h
--- a/include/llama.h
+++ b/include/llama.h
+++ b/include/llama.h
@@ -256,6 +256,7 @@ extern "C" {
@@ -258,6 +258,7 @@ extern "C" {
llama_token * token;
llama_token * token;
float * embd;
float * embd;
...
@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644
...
@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644
llama_pos * pos;
llama_pos * pos;
int32_t * n_seq_id;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
llama_seq_id ** seq_id;
@@ -358,6 +359,7 @@
extern "C" {
@@ -365,6 +366,7 @@
extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
+ bool cross_attn; // whether to use cross attention
+ bool cross_attn; // whether to use cross attention
};
// Abort callback
// model quantization parameters
// if it returns true, execution of llama_decode() will be aborted
@@ -464,6 +466,10 @@
extern "C" {
@@ -459,6 +461,10 @@
extern "C" {
struct llama_context_params params),
struct llama_context_params params),
"use llama_init_from_model instead");
"use llama_init_from_model instead");
...
@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644
...
@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644
index a88b2fe3..241b316e 100644
--- a/src/llama-batch.cpp
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
/*embd =*/ nullptr,
...
@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644
...
@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
/*embd =*/ nullptr,
...
@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644
...
@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr,
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
...
@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644
...
@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9c1fe93f..cd06ad91 100644
index dca22d8b..c22687e4 100644
--- a/src/llama-context.cpp
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
}
...
@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644
...
@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644
} catch (const std::exception & err) {
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
#ifndef NDEBUG
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
cparams.warmup = value;
}
}
...
@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644
...
@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644
void llama_context::set_adapter_lora(
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
llama_adapter_lora * adapter,
float scale) {
float scale) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
+ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
const llama_batch & batch = batch_allocr.batch;
...
@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644
...
@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
@@ -1238,7 +1241,7 @@
int llama_context::decode(llama_batch & inp_batch) {
@@ -1087,7 +1090,7 @@
int llama_context::decode(llama_batch & inp_batch) {
// make the outputs have the same order they had in the user-provided batch
const bool logits_all = n_outputs_all == n_tokens_all;
// note: this is mostly relevant for recurrent models atm
if (!sorted_output) {
- sbatch.from_batch(batch, n_embd,
- const uint32_t n_vocab = model.vocab.n_tokens();
+ sbatch.from_batch(batch, batch.n_embd,
+ const uint32_t n_vocab = model.hparams.n_vocab;
/* simple_split */ !kv_self->recurrent,
const uint32_t n_embd = model.hparams.n_embd;
/* logits_all */ logits_all);
@@ -1472,12 +1475,11 @@
int llama_context::decode(llama_batch & inp_batch) {
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1142,12 +1145,11 @@
int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & hparams = model.hparams;
...
@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644
...
@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644
const auto n_embd = hparams.n_embd;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
// TODO: use a per-batch flag for logits presence instead
@@ -1545,7 +1547,7 @@
int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1682,7 +1684,7 @@
size_t llama_context::state_write_data(llama_io_write_i & io) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2052,7 +2054,7 @@
size_t llama_context::state_write_data(llama_io_write_i & io) {
{
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
...
@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644
...
@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644
io.write(&logits_size, sizeof(logits_size));
io.write(&logits_size, sizeof(logits_size));
@@ -2235,6 +2237,7 @@
llama_context_params llama_context_default_params() {
@@ -2091,6 +2093,7 @@
llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
+ /*.cross_attn =*/ false,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
};
@@ -2362,6 +2365,10 @@
void llama_set_warmup(llama_context * ctx, bool warmup) {
return result;
@@ -2216,6 +2219,10 @@
void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
ctx->set_warmup(warmup);
}
}
...
@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644
...
@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644
ctx->synchronize();
ctx->synchronize();
}
}
diff --git a/src/llama-context.h b/src/llama-context.h
diff --git a/src/llama-context.h b/src/llama-context.h
index 5457f077..a50c4afa 100644
index c0ceacb1..c4ab242a 100644
--- a/src/llama-context.h
--- a/src/llama-context.h
+++ b/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context {
@@ -71,6 +71,7 @@ struct llama_context {
void set_embeddings (bool value);
void set_embeddings (bool value);
void set_causal_attn(bool value);
void set_causal_attn(bool value);
void set_warmup(bool value);
void set_warmup(bool value);
...
@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644
...
@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644
void set_adapter_lora(
void set_adapter_lora(
llama_adapter_lora * adapter,
llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 30e550f0..85ad91b9 100644
index 246fa577..7a6156ce 100644
--- a/src/llama-cparams.h
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@
struct llama_cparams {
@@ -31,6 +31,7 @@
struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
bool no_perf;
+ bool cross_attn;
bool warmup;
bool warmup;
bool op_offload;
+ bool cross_attn;
enum llama_pooling_type pooling_type;
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fabb9ca2..b67216a4 100644
index b0e3f635..f14869cf 100644
--- a/src/llama-graph.cpp
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
}
}
...
@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644
...
@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644
//
//
// llm_graph_context
// llm_graph_context
//
//
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
}
...
@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644
...
@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644
llm_graph_input_attn_cross * inp,
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d0c8d321..0fe18150 100644
index 832a8c09..5a322785 100644
--- a/src/llama-graph.h
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -87,6 +87,7 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
...
@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644
...
@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644
};
};
class llm_graph_input_pos : public llm_graph_input_i {
class llm_graph_input_pos : public llm_graph_input_i {
@@ -283,6 +284,16 @@ public:
@@ -284,6 +285,16 @@ public:
const llama_cross * cross = nullptr;
const llama_cross * cross = nullptr;
};
};
...
@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644
...
@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644
//
//
// llm_graph_result
// llm_graph_result
//
//
@@ -491,6 +502,7 @@ struct llm_graph_context {
@@ -495,6 +506,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
ggml_tensor * build_inp_s_mask() const;
...
@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644
...
@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644
};
};
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 7c9d46d8..69f8d35a 100644
index 3dcad65b..a7b0a7eb 100644
--- a/src/llama-kv-cache.cpp
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
return false;
throw std::runtime_error("failed to create ggml context for kv cache");
}
}
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
...
@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644
...
@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
k_l.push_back(k);
k_l.push_back(k);
@@ -446,7 +454,7 @@
void llama_kv_cache_unified::set_full() {
llama_sbatch llama_kv_cache_unified::sbatch_init(
const llama_batch & batch,
bool logits_all) {
- return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+ return llama_sbatch(batch, batch.n_embd, true, logits_all);
}
llama_ubatch llama_kv_cache_unified::ubatch_next(
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index a012aeae..2e11507d 100644
index 7f6617fa..2acfd4a8 100644
--- a/src/llama-model-loader.cpp
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -315,6 +315,8 @@
namespace GGUFMeta {
@@ -315,6 +315,8 @@
namespace GGUFMeta {
...
@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644
...
@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 572378c9..9d099f11 100644
index 831b68c0..e8298f56 100644
--- a/src/llama-model.cpp
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...
@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644
...
@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644
// everything past this point is not vocab-related
// everything past this point is not vocab-related
if (hparams.vocab_only) {
if (hparams.vocab_only) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...
@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644
...
@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...
@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644
...
@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644
// n_head_kv is optional, default to n_head
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...
@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644
...
@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
}
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
hparams.use_kq_norm = false;
}
}
} break;
} break;
...
@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644
...
@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
case LLM_ARCH_DECI:
{
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_embd_gqa = n_embd_v_gqa;
...
@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644
...
@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert = hparams.n_expert;
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
}
}
} break;
} break;
...
@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644
...
@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
case LLM_ARCH_DECI:
{
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
}
}
};
};
...
@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644
...
@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644
+ // self attention layer
+ // self attention layer
+
+
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+
+
+ // compute Q and K and RoPE them
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
...
@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644
...
@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644
struct llm_build_deci : public llm_graph_context {
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13063,6 +13363,10 @@
llm_graph_result_ptr llama_model::build_graph(
@@ -12496,7 +12796,7 @@
struct llm_build_solar : public llm_graph_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13128,6 +13428,10 @@
llm_graph_result_ptr llama_model::build_graph(
{
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
} break;
...
@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644
...
@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
case LLM_ARCH_DECI:
{
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
case LLM_ARCH_LLAMA4:
...
@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644
...
@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
diff --git a/src/llama-model.h b/src/llama-model.h
index 856e6042..6be91282 100644
index 43746c7d..9281e629 100644
--- a/src/llama-model.h
--- a/src/llama-model.h
+++ b/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -11,6 +11,7 @@
...
@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644
...
@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644
struct llama_cparams;
struct llama_cparams;
struct llama_ubatch;
struct llama_ubatch;
@@ -73,6 +74,7 @@ enum llm_type {
@@ -74,6 +75,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_70B,
...
@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644
...
@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644
LLM_TYPE_236B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_314B,
@@ -314,6 +316,16 @@ struct llama_layer {
@@ -318,6 +320,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
struct ggml_tensor * bskcn_tv = nullptr;
...
@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644
...
@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644
struct llama_layer_convnext convnext;
struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc54227..223e1f3f 100644
index 820d5128..56531980 100644
--- a/src/llama-quant.cpp
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -639,7 +639,9 @@
static void llama_model_quantize_impl(const std::string & fname_inp, const std::
@@ -639,7 +639,9 @@
static void llama_model_quantize_impl(const std::string & fname_inp, const std::
...
@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644
...
@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644
}
}
size_t total_size_org = 0;
size_t total_size_org = 0;
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
index ebef8b3c..b0eb79bb 100644
--- a/tools/mtmd/llava.cpp
+++ b/tools/mtmd/llava.cpp
@@ -462,7 +462,7 @@
struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -474,6 +474,7 @@
struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -497,7 +498,7 @@
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 7a328867..61ebdd43 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -58,7 +58,7 @@
struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -69,6 +69,7 @@
struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -131,6 +132,7 @@
struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
+ /*n_embd =*/ batch.n_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
@@ -166,7 +168,8 @@
int32_t mtmd_helper_decode_image_chunk(
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
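Aside on the mllama patch above: the recurring change across these hunks is that llama_batch gains an n_embd field, so embeddings whose per-token width differs from the text model's hidden size (the cross-attention image embeddings) can be decoded. A rough sketch of filling such an embeddings-only batch, mirroring the llava_embd_batch constructor shown earlier; the field order is taken from the hunks, and n_tokens, n_embd, pos_0, embd, and ctx are assumed to be in scope:

std::vector<llama_pos>      pos(n_tokens);
std::vector<int32_t>        n_seq_id(n_tokens, 1);
std::vector<llama_seq_id *> seq_ids(n_tokens + 1, nullptr);
std::vector<int8_t>         logits(n_tokens, 0);
llama_seq_id seq = 0;

for (int32_t i = 0; i < n_tokens; i++) {
    pos[i]     = pos_0 + i;  // consecutive positions after the text prompt so far
    seq_ids[i] = &seq;
}

llama_batch batch = {
    /*n_tokens =*/ n_tokens,
    /*tokens   =*/ nullptr,
    /*embd     =*/ embd,        // n_tokens * n_embd floats from the vision encoder
    /*n_embd   =*/ n_embd,      // the field added by this patch
    /*pos      =*/ pos.data(),
    /*n_seq_id =*/ n_seq_id.data(),
    /*seq_id   =*/ seq_ids.data(),
    /*logits   =*/ logits.data(),
};
llama_decode(ctx, batch);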
llama/patches/0007-add-unpad-operator.patch
...
@@ -18,7 +18,7 @@ adds the unpad operator to GGML
...
@@ -18,7 +18,7 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1b8603e7..53ef31b2 100644
index e91dedf1..8dc107ba 100644
--- a/ggml/include/ggml.h
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -489,6 +489,7 @@
extern "C" {
@@ -489,6 +489,7 @@
extern "C" {
...
@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644
...
@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644
GGML_OP_ARANGE,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_ARGSORT,
@@ -1777,6 +1778,15 @@ extern "C" {
@@ -1781,6 +1782,15 @@ extern "C" {
int p0,
int p0,
int p1);
int p1);
...
@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644
...
@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644
// timesteps: [N,]
// timesteps: [N,]
// return: [N, dim]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 64405449..34624cca 100644
index a30e67f2..835e6495 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
} break;
...
@@ -60,7 +60,7 @@ index 64405449..34624cca 100644
...
@@ -60,7 +60,7 @@ index 64405449..34624cca 100644
case GGML_OP_ARANGE:
case GGML_OP_ARANGE:
{
{
ggml_compute_forward_arange(params, tensor);
ggml_compute_forward_arange(params, tensor);
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_PAD_REFLECT_1D:
...
@@ -69,10 +69,10 @@ index 64405449..34624cca 100644
...
@@ -69,10 +69,10 @@ index 64405449..34624cca 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7413192b..becdae07 100644
index 955fec59..1868a10c 100644
--- a/ggml/src/ggml-cpu/ops.cpp
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
}
}
...
@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644
...
@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 04ce764e..491acccb 100644
index cb0d8528..6fe86674 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
ggml_cuda_op_pad(ctx, dst);
break;
break;
...
@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644
...
@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644
case GGML_OP_ARANGE:
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
ggml_cuda_op_arange(ctx, dst);
break;
break;
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
case GGML_OP_PAD:
...
@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
...
@@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 425524d0..112abef6 100644
index 1b56f858..7641247e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
...
@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
...
@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...
@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644
...
@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_PAD_REFLECT_1D:
...
@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644
...
@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
case GGML_OP_LEAKY_RELU:
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
const int nth = MIN(1024, ne0);
...
@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644
...
@@ -298,10 +298,10 @@ index 425524d0..112abef6 100644
} break;
} break;
case GGML_OP_ARANGE:
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9f4147e9..6ceb3cef 100644
index 9cfddf45..080a943b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
}
}
...
@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644
...
@@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644
device char * dst,
device char * dst,
constant ggml_metal_kargs_arange & args,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7654ae17..3c57aff8 100644
index 8a654624..6b034d35 100644
--- a/ggml/src/ggml.c
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -923,6 +923,7 @@
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -923,6 +923,7 @@
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
...
@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644
...
@@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
return result;
}
}
...
...
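Aside on the unpad patch above: ggml_pad grows a tensor along its dimensions, and the unpad operator added here is its inverse, copying only the original region back out. A hedged usage sketch; the signature is assumed to mirror ggml_pad and should be checked against the ggml.h hunk in this tree:

// x has shape [ne0, ne1, ne2, ne3]
struct ggml_tensor * padded   = ggml_pad  (ctx, x, 8, 0, 0, 0);      // ne0 -> ne0 + 8
struct ggml_tensor * restored = ggml_unpad(ctx, padded, 8, 0, 0, 0); // trim back to ne0
// after the graph runs, 'restored' holds the same values as 'x'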
llama/patches/0008-fix-deepseek-deseret-regex.patch
...
@@ -12,10 +12,10 @@ regex
...
@@ -12,10 +12,10 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a9ee9f03..1306864e 100644
index 806c1b3d..10f34d33 100644
--- a/src/llama-vocab.cpp
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
regex_exprs = {
"[\r\n]",
"[\r\n]",
...
...
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
...
@@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space
...
@@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
multiple batches of processing until everything is complete.
---
---
src/llama-context.cpp | 105 +++++++++++++----------------------------
src/llama-context.h | 1 +
src/llama-context.h | 4 +-
src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
src/llama-kv-cache.cpp | 39 +++------------
src/llama-kv-cache.h | 12 ++++-
src/llama-kv-cache.h | 9 +++-
3 files changed, 47 insertions(+), 73 deletions(-)
4 files changed, 51 insertions(+), 106 deletions(-)
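Restating the description above before the per-file hunks: a single defragmentation graph can only hold a bounded number of nodes, so the new version of the patch splits the pending moves into chunks and builds and runs one graph per chunk until the cache is fully defragmented. A condensed sketch of that loop, with names taken from the hunks below (treat it as pseudocode rather than the exact patch):

const uint32_t n_max_nodes = lctx.graph_max_nodes();
// roughly six graph nodes per layer for every move, plus per-layer overhead
const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer) / (6*model.hparams.n_layer);

for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
    auto end = std::min(i + max_moves, defrag_info.moves.size());
    std::vector<llama_kv_defrag_move> chunk(defrag_info.moves.begin() + i,
                                            defrag_info.moves.begin() + end);

    ggml_backend_sched_reset(sched);
    auto * gf  = lctx.graph_init();
    auto   res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
    ggml_backend_sched_alloc_graph(sched, gf);
    res->set_inputs(nullptr);
    lctx.graph_compute(gf, false);
}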
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
diff --git a/src/llama-context.h b/src/llama-context.h
index cd06ad91..77177c5e 100644
index c4ab242a..9970dfc6 100644
--- a/src/llama-context.cpp
--- a/src/llama-context.h
+++ b/src/llama-context.cpp
+++ b/src/llama-context.h
@@ -583,13 +583,12 @@
llm_graph_result_ptr llama_context::build_kv_self_shift(
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
llm_graph_result_ptr llama_context::build_kv_self_defrag(
#include "llama-graph.h"
ggml_context * ctx0,
#include "llama-adapter.h"
- ggml_cgraph * gf) const {
+#include "llama-kv-cache.h"
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
#include "ggml-cpp.h"
auto res = std::make_unique<llm_graph_result>();
#include "ggml-opt.h"
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a7b0a7eb..1a50c034 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -372,8 +372,6 @@
void llama_kv_cache_unified::commit() {
}
bool llama_kv_cache_unified::update(llama_context & lctx) {
- bool need_reserve = false;
-
auto * sched = lctx.get_sched();
if (has_shift) {
@@ -396,8 +394,6 @@
bool llama_kv_cache_unified::update(llama_context & lctx) {
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -411,27 +407,36 @@
bool llama_kv_cache_unified::update(llama_context & lctx) {
if (do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = lctx.graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return false;
+ }
+
+ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, defrag_info.moves.size());
+ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);
- if (defrag_prepare(lctx.graph_max_nodes())) {
ggml_backend_sched_reset(sched);
auto * gf = lctx.graph_init();
- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);
ggml_backend_sched_alloc_graph(sched, gf);
res->set_inputs(nullptr);
lctx.graph_compute(gf, false);
-
- need_reserve = true;
}
do_defrag = false;
}
const auto & hparams = model.hparams;
- return need_reserve;
+ // we never need to reserve a worst case graph
+ return false;
}
- const auto & ids = kv_self->defrag_info.ids;
void llama_kv_cache_unified::defrag_sched(float thold) {
@@ -715,11 +720,10 @@
llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
- const auto & ids = defrag_info.ids;
-
-
#if 0
// CPU defrag
//
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
...
@@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_src = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_dst = ggml_view_2d(ctx, v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
+ ggml_row_size(v_l[il]->type, move.src));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_src = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(kv_self->v_l[il]->type, i));
- ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src));
+ ggml_row_size(v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
view_v_dst = ggml_view_2d(ctx, v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
ggml_row_size(v_l[il]->type, size),
- ggml_row_size(kv_self->v_l[il]->type, id));
- ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst));
+ ggml_row_size(v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
@@ -730,8 +713,6 @@
llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@
void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -768,49 +747,28 @@
void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@
int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@
private:
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -865,17 +855,7 @@
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv);
...
@@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644
// determine which KV cells to move where
//
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
...
@@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;
...
@@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644
cont = false;
continue;
}
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;
if (!cont) {
...
@@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644
}
nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}
...
@@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644
return false;
}
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
- LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
- LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true;
}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644
index bf3b4b6a..928b9712 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private:
@@ -82,6 +82,13 @@ struct llama_kv_cache_guard {
private:
llama_kv_cache * kv;
};
+
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@
public:
// defrag
//
// llama_kv_cache_unified
@@ -207,7 +214,7 @@
private:
// defrag
struct {
- std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves;
} defrag_info;
// return true if cells have been moved
// return true if cells have been moved
@@ -249,7 +256,8 @@
private:
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<llama_kv_defrag_move> & moves) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
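Added for context, not part of the commit: the reworked patch above caps how many defrag moves fit into one compute graph and loops until every pending move has been applied. A minimal standalone sketch of that chunking idea, with hypothetical type and helper names:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the move descriptor the patch introduces.
struct kv_defrag_move { uint32_t src, dst, len; };

// Each move costs roughly 6 graph nodes per layer, plus ~2 per layer of overhead,
// so the number of moves per compute graph has to be capped.
static std::size_t max_moves_per_graph(uint32_t n_max_nodes, uint32_t n_layer) {
    return (n_max_nodes - 2 * n_layer) / (6 * n_layer);
}

// Apply all pending moves in chunks; each chunk builds and runs one graph.
template <typename BuildAndRunFn>
void run_defrag_in_batches(const std::vector<kv_defrag_move> & moves,
                           uint32_t n_max_nodes, uint32_t n_layer,
                           BuildAndRunFn build_and_run) {
    const std::size_t max_moves = max_moves_per_graph(n_max_nodes, n_layer);
    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
        const std::size_t end = std::min(i + max_moves, moves.size());
        std::vector<kv_defrag_move> chunk(moves.begin() + i, moves.begin() + end);
        build_and_run(chunk); // e.g. build_graph_defrag(...) followed by graph_compute(...)
    }
}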
llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
View file @ 0cefd46f
...
@@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 43d9fc4f..4c0d3824 100644
index ddea5ad3..45918bf6 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
...
llama/patches/0013-remove-amx.patch
View file @ 0cefd46f
...
@@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 4c0d3824..79c26312 100644
index 45918bf6..0beaed86 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
...
llama/patches/0014-fix-string-arr-kv-loading.patch
View file @ 0cefd46f
...
@@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1306864e..d6515ff6 100644
index 10f34d33..b098bb25 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
if (precompiled_charsmap_keyidx != -1) {
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
llama/patches/0015-ollama-debug-tensor.patch
View file @ 0cefd46f
...
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 34624cca..59bd3c62 100644
index 835e6495..3902894b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
...
@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
...
llama/patches/0016-add-ollama-vocab-for-grammar-support.patch
View file @ 0cefd46f
...
@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c0a5f934..75731053 100644
index 804b11e0..15a10ca8 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
...
llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch
deleted
100644 → 0
View file @ ad035ad5
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
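For context only (not part of the commit): the deleted patch above explains why graph revalidation has to fall back to a full reallocation when a tensor that previously had data no longer does. A simplified sketch of that decision, using hypothetical stand-in types rather than the real ggml-alloc structures:

#include <cstddef>

// Hypothetical, simplified stand-in for the allocator bookkeeping referred to above.
struct tensor_alloc_info {
    int         buffer_id; // < 0 means no buffer was assigned for this node last time
    std::size_t size_max;  // size reserved for this node by the previous graph
};

// Mirrors the structure of the guard from the (now upstreamed) patch: instead of
// asserting, report "does not fit" so the caller performs a full reallocation pass.
static bool node_fits_previous_allocation(const tensor_alloc_info & talloc,
                                          bool has_data, bool has_view_src,
                                          std::size_t alloc_size) {
    std::size_t node_size = 0;
    if (!has_data && !has_view_src) {
        if (talloc.buffer_id < 0) {
            // The previous graph allocated this tensor differently (or not at all).
            return false;
        }
        node_size = alloc_size;
    }
    return talloc.size_max >= node_size;
}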
ml/backend/ggml/ggml.go
View file @ 0cefd46f
...
@@ -406,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
),
schedBackends: schedBackends,
schedBufts: schedBufts,
...
ml/backend/ggml/ggml/include/ggml-backend.h
View file @ 0cefd46f
...
@@ -38,7 +38,7 @@ extern "C" {
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft);
...
@@ -59,7 +59,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
...
@@ -248,7 +248,7 @@ extern "C" {
// preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
...
@@ -289,7 +289,7 @@ extern "C" {
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
...
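As an illustrative aside (not from the commit), the updated ggml_backend_sched_new() declaration above takes an additional op_offload flag. A minimal sketch of constructing a scheduler with the new signature; the backend handles are assumed to have been created elsewhere, the CPU backend goes last (matching the assertion visible in ggml-backend.cpp further down), and the Go binding above passes false for op_offload:

#include <stddef.h>
#include "ggml-backend.h"

// Sketch only: two backends, CPU last, operation offload disabled.
static ggml_backend_sched_t make_sched(ggml_backend_t backend_gpu,
                                       ggml_backend_t backend_cpu) {
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
    return ggml_backend_sched_new(backends, /*bufts=*/NULL, /*n_backends=*/2,
                                  GGML_DEFAULT_GRAPH_SIZE,
                                  /*parallel=*/false, /*op_offload=*/false);
}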
ml/backend/ggml/ggml/include/ggml-cpp.h
View file @ 0cefd46f
...
@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
...
ml/backend/ggml/ggml/include/ggml-opt.h
View file @ 0cefd46f
...
@@ -37,13 +37,16 @@ extern "C" {
// ====== Dataset ======
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint
enum ggml_type type_data, // the type for the internal data tensor
int64_t ne_label, // number of elements per label
enum ggml_type type_label, // the type for the internal labels tensor
int64_t ndata, // total number of datapoints/labels
int64_t ne_datapoint, // number of elements per datapoint
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data
GGML_API int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset);
GGML_API struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
...
@@ -56,13 +59,19 @@ extern "C" {
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch);
GGML_API void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch);
// ====== Model / Context ======
enum ggml_opt_build_type {
GGML_OPT_BUILD_TYPE_FORWARD,
GGML_OPT_BUILD_TYPE_FORWARD = 10,
GGML_OPT_BUILD_TYPE_GRAD,
GGML_OPT_BUILD_TYPE_GRAD = 20,
GGML_OPT_BUILD_TYPE_OPT,
GGML_OPT_BUILD_TYPE_OPT = 30,
};
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
...
@@ -81,20 +90,22 @@ extern "C" {
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant)
// returns the default optimizer params (constant, hard-coded values)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// casts userdata to ggml_opt_optimizer_params and returns it
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
// parameters for initializing a new optimization context
struct ggml_opt_params {
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
// by default the forward graph needs to be reconstructed for each eval
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
// the forward graph is defined by inputs and outputs
struct ggml_context * ctx_compute;
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
struct ggml_tensor * inputs;
struct ggml_tensor * outputs;
enum ggml_opt_loss_type loss_type;
enum ggml_opt_build_type build_type;
...
@@ -107,12 +118,9 @@ extern "C" {
// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
GGML_API struct ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute,
enum ggml_opt_loss_type loss_type);
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
...
@@ -121,6 +129,7 @@ extern "C" {
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
GGML_API struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs(ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels(ggml_opt_context_t opt_ctx); // labels to compare outputs against
...
@@ -128,11 +137,12 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_opt_pred(ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init();
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
...
@@ -144,11 +154,20 @@ extern "C" {
// ====== Computation ======
// do forward pass, increment result if not NULL
// if not using static graphs, this function must be called prior to ggml_opt_alloc
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
GGML_API void ggml_opt_prepare_alloc(ggml_opt_context_t opt_ctx, struct ggml_context * ctx_compute, struct ggml_cgraph * gf, struct ggml_tensor * inputs, struct ggml_tensor * outputs);
// allocate the next graph for evaluation, either forward or forward + backward
// must be called exactly once prior to calling ggml_opt_eval
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
// do forward pass, increment result if not NULL, do backward pass
// do forward pass, increment result if not NULL, do backward pass if allocated
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
...
@@ -200,9 +219,9 @@ extern "C" {
// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
...
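For orientation (added, not part of the commit): the header comments above describe the new call order for one optimizer step: ggml_opt_prepare_alloc() when the graphs are not allocated statically, then ggml_opt_alloc(), then ggml_opt_eval(). A call-order sketch using only the declarations shown above; graph construction, batching, and error handling are elided, so treat it as a sketch rather than working code:

#include "ggml.h"
#include "ggml-opt.h"

// One optimizer step with the reworked ggml-opt API (sketch only).
static void train_step(ggml_opt_context_t opt_ctx, ggml_opt_result_t result,
                       struct ggml_context * ctx_compute, struct ggml_cgraph * gf,
                       struct ggml_tensor * inputs, struct ggml_tensor * outputs) {
    // Only needed when the graphs are not allocated statically.
    ggml_opt_prepare_alloc(opt_ctx, ctx_compute, gf, inputs, outputs);

    // Allocate the forward (+ backward) graph; must happen before ggml_opt_eval().
    ggml_opt_alloc(opt_ctx, /*backward=*/true);

    // ... copy the current batch into ggml_opt_inputs(opt_ctx) here ...

    // Forward pass, result accumulation if result != NULL, backward pass if allocated.
    ggml_opt_eval(opt_ctx, result);
}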
ml/backend/ggml/ggml/include/ggml.h
View file @ 0cefd46f
...
@@ -674,11 +674,15 @@ extern "C" {
GGML_API bool ggml_is_3d(const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims(const struct ggml_tensor * tensor); // returns 1 for scalars
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
...
@@ -765,7 +769,7 @@ extern "C" {
// Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
//
...
@@ -935,7 +939,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
// concat a and b along dim
// used in stable-diffusion
...
@@ -2055,15 +2059,14 @@ extern "C" {
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
struct ggml_context * ctx, // context for gradient computation
struct ggml_context * ctx_compute, // context for gradient computation
struct ggml_cgraph * cgraph,
struct ggml_tensor ** grad_accs);
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
GGML_API void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset(struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear(struct ggml_cgraph * cgraph);
...
ml/backend/ggml/ggml/src/CMakeLists.txt
View file @ 0cefd46f
...
@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs)
target_link_libraries(ggml PRIVATE dl)
endif()
function(ggml_add_backend_library backend)
...
ml/backend/ggml/ggml/src/ggml-backend.cpp
View file @ 0cefd46f
...
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
return SIZE_MAX;
}
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor);
...
@@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
...
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
char * context_buffer;
size_t context_buffer_size;
bool op_offload;
int debug;
};
...
@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
...
@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node);
assert(node_backend_id != -1); // all nodes should be assigned by now
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
...
@@ -1452,7 +1454,8 @@ ggml_backend_sched_new(
ggml_backend_buffer_type_t * bufts,
int n_backends,
size_t graph_size,
bool parallel) {
bool parallel, bool op_offload) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
...
@@ -1497,6 +1500,7 @@ ggml_backend_sched_new(
}
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
ggml_backend_sched_reset(sched);
...
ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
View file @ 0cefd46f
...
@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
...
@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif()
if (NOT I8MM_ENABLED MATCHES -1)
...
@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
list(APPEND GGML_KLEIDIAI_SOURCES
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif()
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
...