OpenDAS / ollama · Commits · 544b6739

Commit 544b6739 (Unverified)
Authored Nov 06, 2025 by Daniel Hiltgen; committed by GitHub, Nov 06, 2025
Parent: c4ba257c

    ggml update to b6840 (#12791)

Changes: 103
Showing 20 changed files with 136 additions and 116 deletions (+136 −116)
llama/patches/0002-pretokenizer.patch                                    +2  −2
llama/patches/0003-clip-unicode.patch                                    +3  −3
llama/patches/0004-solar-pro.patch                                       +21 −21
llama/patches/0005-fix-deepseek-deseret-regex.patch                      +2  −2
llama/patches/0006-maintain-ordering-for-rules-for-grammar.patch         +1  −1
llama/patches/0007-sort-devices-by-score.patch                           +7  −7
llama/patches/0008-add-phony-target-ggml-cpu-for-all-cpu-variants.patch  +3  −3
llama/patches/0009-remove-amx.patch                                      +2  −2
llama/patches/0010-fix-string-arr-kv-loading.patch                       +3  −3
llama/patches/0011-ollama-debug-tensor.patch                             +2  −2
llama/patches/0012-add-ollama-vocab-for-grammar-support.patch            +3  −3
llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch               +52 −32
llama/patches/0014-graph-memory-reporting-on-failure.patch               +7  −7
llama/patches/0015-ggml-Export-GPU-UUIDs.patch                           +7  −7
llama/patches/0016-add-C-API-for-mtmd_input_text.patch                   +2  −2
llama/patches/0017-no-power-throttling-win32-with-gnuc.patch             +2  −2
llama/patches/0018-BF16-macos-version-guard.patch                        +1  −1
llama/patches/0019-ggml-Add-batch-size-hint.patch                        +13 −13
llama/patches/0020-Disable-ggml-blas-on-macos-v13-and-older.patch        +2  −2
llama/patches/0021-fix-mtmd-audio.cpp-build-on-windows.patch             +1  −1
llama/patches/0002-pretokenizer.patch (view file @ 544b6739)

...
@@ -10,7 +10,7 @@ logs instead of throwing an error
   1 file changed, 3 insertions(+), 11 deletions(-)

   diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 7fffd171..0b6edaf4 100644
+index 639fecbd3..a7ce6f8e1 100644
  --- a/src/llama-vocab.cpp
  +++ b/src/llama-vocab.cpp
  @@ -1812,16 +1812,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
@@ -31,7 +31,7 @@ index 7fffd171..0b6edaf4 100644
               pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
           } else if (
               tokenizer_pre == "llama3" ||
-@@ -1992,7 +1983,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1993,7 +1984,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
               pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
               clean_spaces = false;
           } else {
...
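The rebase only shifts hunk offsets and blob hashes here; what the patch preserves is the fallback itself. A minimal sketch of that idea (illustrative C++, not the actual llama.cpp code):

// Unknown pretokenizer names degrade to the default type with a log line
// instead of aborting the model load.
#include <cstdio>
#include <string>

enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3 };

static pre_type resolve_pretokenizer(const std::string & tokenizer_pre) {
    if (tokenizer_pre == "llama3") {
        return PRE_TYPE_LLAMA3;
    }
    // instead of: throw std::runtime_error("unknown pre-tokenizer: " + tokenizer_pre)
    std::fprintf(stderr, "warning: unknown pre-tokenizer '%s', using default\n", tokenizer_pre.c_str());
    return PRE_TYPE_DEFAULT;
}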
llama/patches/0003-clip-unicode.patch (view file @ 544b6739)

...
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
   1 file changed, 39 insertions(+)

   diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 98e68af2..6699b75a 100644
+index f2abf8852..c984e6282 100644
  --- a/tools/mtmd/clip.cpp
  +++ b/tools/mtmd/clip.cpp
  @@ -28,6 +28,19 @@
...
@@ -33,7 +33,7 @@ index 98e68af2..6699b75a 100644
   struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

   enum ffn_op_type {
-@@ -2762,7 +2775,29 @@ struct clip_model_loader {
+@@ -2774,7 +2787,29 @@ struct clip_model_loader {
       {
           std::vector<uint8_t> read_buf;
...
@@ -63,7 +63,7 @@ index 98e68af2..6699b75a 100644
           if (!fin) {
               throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
           }
-@@ -2789,7 +2824,11 @@ struct clip_model_loader {
+@@ -2801,7 +2836,11 @@ struct clip_model_loader {
               ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
           }
       }
...
llama/patches/0004-solar-pro.patch (view file @ 544b6739)

...
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
   7 files changed, 248 insertions(+), 1 deletion(-)

   diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 869e4dcc..9f6b6ad2 100644
+index 8ca769c5f..ab262ec0c 100644
  --- a/src/llama-arch.cpp
  +++ b/src/llama-arch.cpp
-@@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
       { LLM_ARCH_GRANITE_MOE, "granitemoe" },
       { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
       { LLM_ARCH_CHAMELEON, "chameleon" },
...
@@ -26,7 +26,7 @@ index 869e4dcc..9f6b6ad2 100644
       { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
       { LLM_ARCH_PLM, "plm" },
       { LLM_ARCH_BAILINGMOE, "bailingmoe" },
-@@ -179,6 +180,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -183,6 +184,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
       { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
       { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
       { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
...
@@ -34,7 +34,7 @@ index 869e4dcc..9f6b6ad2 100644
       { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
       { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-@@ -1893,6 +1895,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1901,6 +1903,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
       { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
       },
       },
...
@@ -59,7 +59,7 @@ index 869e4dcc..9f6b6ad2 100644
       {
           LLM_ARCH_WAVTOKENIZER_DEC,
           {
-@@ -2429,6 +2449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -2469,6 +2489,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
       {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
       // this tensor is loaded for T5, but never used
       {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...
@@ -68,10 +68,10 @@ index 869e4dcc..9f6b6ad2 100644
       {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
       {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
   diff --git a/src/llama-arch.h b/src/llama-arch.h
-index c3ae7165..dc7a362a 100644
+index dea725c1a..ea2b4ffb9 100644
  --- a/src/llama-arch.h
  +++ b/src/llama-arch.h
-@@ -85,6 +85,7 @@ enum llm_arch {
+@@ -86,6 +86,7 @@ enum llm_arch {
       LLM_ARCH_GRANITE_MOE,
       LLM_ARCH_GRANITE_HYBRID,
       LLM_ARCH_CHAMELEON,
...
@@ -79,7 +79,7 @@ index c3ae7165..dc7a362a 100644
       LLM_ARCH_WAVTOKENIZER_DEC,
       LLM_ARCH_PLM,
       LLM_ARCH_BAILINGMOE,
-@@ -183,6 +184,7 @@ enum llm_kv {
+@@ -187,6 +188,7 @@ enum llm_kv {
       LLM_KV_ATTENTION_SCALE,
       LLM_KV_ATTENTION_OUTPUT_SCALE,
       LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
...
@@ -87,7 +87,7 @@ index c3ae7165..dc7a362a 100644
       LLM_KV_ATTENTION_KEY_LENGTH_MLA,
       LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-@@ -432,6 +434,7 @@ enum llm_tensor {
+@@ -436,6 +438,7 @@ enum llm_tensor {
       LLM_TENSOR_ENC_OUTPUT_NORM,
       LLM_TENSOR_CLS,
       LLM_TENSOR_CLS_OUT,
...
@@ -96,7 +96,7 @@ index c3ae7165..dc7a362a 100644
       LLM_TENSOR_CONVNEXT_DW,
       LLM_TENSOR_CONVNEXT_NORM,
   diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index db65d69e..b6bf6bbf 100644
+index db65d69ea..b6bf6bbf2 100644
  --- a/src/llama-hparams.cpp
  +++ b/src/llama-hparams.cpp
  @@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
...
@@ -115,7 +115,7 @@ index db65d69e..b6bf6bbf 100644
       if (il < n_layer) {
           return swa_layers[il];
   diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 4e7f73ec..80582728 100644
+index 6fcf91b7d..24569a258 100644
  --- a/src/llama-hparams.h
  +++ b/src/llama-hparams.h
  @@ -64,6 +64,8 @@ struct llama_hparams {
...
@@ -127,7 +127,7 @@ index 4e7f73ec..80582728 100644
       uint32_t n_layer_dense_lead = 0;
       uint32_t n_lora_q = 0;
       uint32_t n_lora_kv = 0;
-@@ -248,6 +250,9 @@ struct llama_hparams {
+@@ -250,6 +252,9 @@ struct llama_hparams {
       uint32_t n_pos_per_embd() const;
...
@@ -138,7 +138,7 @@ index 4e7f73ec..80582728 100644
       bool has_kv(uint32_t il) const;
   diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index aa3a65f8..ee303bd5 100644
+index aa3a65f87..ee303bd58 100644
  --- a/src/llama-model-loader.cpp
  +++ b/src/llama-model-loader.cpp
  @@ -466,7 +466,7 @@ namespace GGUFMeta {
...
@@ -151,10 +151,10 @@ index aa3a65f8..ee303bd5 100644
   llama_model_loader::llama_model_loader(
       const std::string & fname,
   diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 36d495d6..74e1d162 100644
+index 2a83d6627..54621ea39 100644
  --- a/src/llama-model.cpp
  +++ b/src/llama-model.cpp
-@@ -1865,6 +1865,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1890,6 +1890,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
       default: type = LLM_TYPE_UNKNOWN;
       }
       } break;
...
@@ -176,7 +176,7 @@ index 36d495d6..74e1d162 100644
       case LLM_ARCH_WAVTOKENIZER_DEC:
       {
           ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -5170,6 +5185,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -5224,6 +5239,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
       layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...
@@ -211,7 +211,7 @@ index 36d495d6..74e1d162 100644
       layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
       layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
       layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -16392,6 +16435,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
+@@ -16515,6 +16558,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
       }
   };
...
@@ -377,7 +377,7 @@ index 36d495d6..74e1d162 100644
       // ref: https://github.com/facebookresearch/chameleon
       // based on the original build_llama() function, changes:
       //   * qk-norm
-@@ -19827,6 +20029,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+@@ -20096,6 +20298,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
       {
           llm = std::make_unique<llm_build_chameleon>(*this, params);
       } break;
...
@@ -388,7 +388,7 @@ index 36d495d6..74e1d162 100644
       case LLM_ARCH_WAVTOKENIZER_DEC:
       {
           llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-@@ -20057,6 +20263,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -20331,6 +20537,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
       case LLM_ARCH_GRANITE_MOE:
       case LLM_ARCH_GRANITE_HYBRID:
       case LLM_ARCH_CHAMELEON:
...
@@ -397,7 +397,7 @@ index 36d495d6..74e1d162 100644
       case LLM_ARCH_NEO_BERT:
       case LLM_ARCH_SMOLLM3:
   diff --git a/src/llama-model.h b/src/llama-model.h
-index 7f48662f..ec3fbd33 100644
+index 248f85410..4a7924aaa 100644
  --- a/src/llama-model.h
  +++ b/src/llama-model.h
  @@ -76,6 +76,7 @@ enum llm_type {
...
@@ -408,7 +408,7 @@ index 7f48662f..ec3fbd33 100644
       LLM_TYPE_27B,
       LLM_TYPE_30B,
       LLM_TYPE_32B,
-@@ -387,6 +388,8 @@ struct llama_layer {
+@@ -390,6 +391,8 @@ struct llama_layer {
       struct ggml_tensor * ffn_act_beta = nullptr;
       struct ggml_tensor * ffn_act_eps = nullptr;
...
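The shape of this patch is visible in the hunks above: a new architecture is one enum value plus matching entries in several parallel lookup maps (arch names, KV names, tensor names), which is why a rebase mostly renumbers hunks. A reduced sketch with invented names, not the patch's own identifiers:

#include <map>

enum llm_arch_sketch { ARCH_CHAMELEON, ARCH_SOLAR_PRO };

static const std::map<llm_arch_sketch, const char *> ARCH_NAMES_SKETCH = {
    { ARCH_CHAMELEON, "chameleon" },
    { ARCH_SOLAR_PRO, "solar-pro" },   // hypothetical entry added by the patch
};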
llama/patches/0005-fix-deepseek-deseret-regex.patch (view file @ 544b6739)

...
@@ -12,7 +12,7 @@ regex
   2 files changed, 22 insertions(+), 1 deletion(-)

   diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 0b6edaf4..3de95c67 100644
+index a7ce6f8e1..8064dc197 100644
  --- a/src/llama-vocab.cpp
  +++ b/src/llama-vocab.cpp
  @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
@@ -25,7 +25,7 @@ index 0b6edaf4..3de95c67 100644
       "\\s+$",
       "[一-龥ࠀ-一가-]+",
   diff --git a/src/unicode.cpp b/src/unicode.cpp
-index 65f36651..ce336a22 100644
+index 65f366517..ce336a228 100644
  --- a/src/unicode.cpp
  --- +++ b/src/unicode.cpp
  @@ -2,6 +2,11 @@
...
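For context on the fix (an illustrative helper, not code from the patch): the Deseret script that the DeepSeek pretokenizer regex has to match lies in the supplementary plane, so it cannot be expressed with single 16-bit code units.

#include <cstdint>

// Unicode block "Deseret" is U+10400..U+1044F.
static bool is_deseret(uint32_t cpt) {
    return cpt >= 0x10400 && cpt <= 0x1044F;
}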
llama/patches/0006-maintain-ordering-for-rules-for-grammar.patch (view file @ 544b6739)

...
@@ -8,7 +8,7 @@ Subject: [PATCH] maintain ordering for rules for grammar
   1 file changed, 1 insertion(+), 1 deletion(-)

   diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index db1f0b23..f4de7e34 100644
+index dd9b51a9e..d88f43209 100644
  --- a/common/json-schema-to-grammar.cpp
  +++ b/common/json-schema-to-grammar.cpp
  @@ -308,7 +308,7 @@ private:
...
llama/patches/0007-sort-devices-by-score.patch (view file @ 544b6739)

...
@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
   1 file changed, 13 insertions(+), 8 deletions(-)

   diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 136afec7..f794d9cf 100644
+index e96b5c403..a55d9b280 100644
  --- a/ggml/src/ggml-backend-reg.cpp
  +++ b/ggml/src/ggml-backend-reg.cpp
-@@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
+@@ -179,7 +179,7 @@ struct ggml_backend_reg_entry {
   struct ggml_backend_registry {
       std::vector<ggml_backend_reg_entry> backends;
...
@@ -23,7 +23,7 @@ index 136afec7..f794d9cf 100644
       ggml_backend_registry() {
   #ifdef GGML_USE_CUDA
-@@ -223,7 +223,7 @@ struct ggml_backend_registry {
+@@ -230,7 +230,7 @@ struct ggml_backend_registry {
           }
       }
...
@@ -32,7 +32,7 @@ index 136afec7..f794d9cf 100644
       if (!reg) {
           return;
       }
-@@ -234,15 +234,20 @@ struct ggml_backend_registry {
+@@ -241,15 +241,20 @@ struct ggml_backend_registry {
   #endif
       backends.push_back({ reg, std::move(handle) });
       for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
...
@@ -56,7 +56,7 @@ index 136afec7..f794d9cf 100644
       }
       ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
-@@ -286,7 +291,7 @@ struct ggml_backend_registry {
+@@ -293,7 +298,7 @@ struct ggml_backend_registry {
       GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
...
@@ -65,7 +65,7 @@ index 136afec7..f794d9cf 100644
       return reg;
       }
-@@ -309,7 +314,7 @@ struct ggml_backend_registry {
+@@ -316,7 +321,7 @@ struct ggml_backend_registry {
       // remove devices
       devices.erase(
           std::remove_if(devices.begin(), devices.end(),
...
@@ -74,7 +74,7 @@ index 136afec7..f794d9cf 100644
           devices.end());
       // remove backend
-@@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
+@@ -374,7 +379,7 @@ size_t ggml_backend_dev_count() {
   ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
       GGML_ASSERT(index < ggml_backend_dev_count());
...
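A sketch of the idea named in the patch subject, with hypothetical types (the real change lives in ggml_backend_registry): keep registered devices ordered by a backend-assigned score so that enumeration, and therefore backend loading, prefers the fastest acceleration.

#include <algorithm>
#include <vector>

struct device_entry { int score; const char * name; };   // hypothetical

static void register_device(std::vector<device_entry> & devices, device_entry dev) {
    devices.push_back(dev);
    // stable sort keeps registration order among devices with equal scores
    std::stable_sort(devices.begin(), devices.end(),
                     [](const device_entry & a, const device_entry & b) { return a.score > b.score; });
}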
llama/patches/0008-add-phony-target-ggml-cpu-for-all-cpu-variants.patch (view file @ 544b6739)

...
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
   1 file changed, 2 insertions(+)

   diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 892c2331..09fdf5fc 100644
+index ba281b8e6..ead235878 100644
  --- a/ggml/src/CMakeLists.txt
  +++ b/ggml/src/CMakeLists.txt
-@@ -310,6 +310,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+@@ -314,6 +314,7 @@ function(ggml_add_cpu_backend_variant tag_name)
       endif()
       ggml_add_cpu_backend_variant_impl(${tag_name})
...
@@ -19,7 +19,7 @@ index 892c2331..09fdf5fc 100644
   endfunction()

   ggml_add_backend(CPU)
-@@ -320,6 +321,7 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -324,6 +325,7 @@ if (GGML_CPU_ALL_VARIANTS)
       elseif (GGML_CPU_ARM_ARCH)
           message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
       endif()
...
llama/patches/0009-remove-amx.patch (view file @ 544b6739)

...
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
   1 file changed, 4 deletions(-)

   diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 09fdf5fc..0609c650 100644
+index ead235878..f9a6587f1 100644
  --- a/ggml/src/CMakeLists.txt
  +++ b/ggml/src/CMakeLists.txt
-@@ -330,10 +330,6 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -334,10 +334,6 @@ if (GGML_CPU_ALL_VARIANTS)
       ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
       ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
       ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
...
llama/patches/0010-fix-string-arr-kv-loading.patch (view file @ 544b6739)

...
@@ -13,7 +13,7 @@ such as vocab fields
   3 files changed, 7 insertions(+), 5 deletions(-)

   diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
-index 79ee2020..3efb22f0 100644
+index 79ee20206..3efb22f01 100644
  --- a/ggml/include/gguf.h
  +++ b/ggml/include/gguf.h
  @@ -114,6 +114,7 @@ extern "C" {
...
@@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
       // get ith C string from array with given key_id
       GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
   diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
-index 8cc4ef1c..d950dbdf 100644
+index 8cc4ef1cf..d950dbdf5 100644
  --- a/ggml/src/gguf.cpp
  +++ b/ggml/src/gguf.cpp
  @@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
...
@@ -53,7 +53,7 @@ index 8cc4ef1c..d950dbdf 100644
       }
   diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 3de95c67..217ede47 100644
+index 8064dc197..31f49801c 100644
  --- a/src/llama-vocab.cpp
  +++ b/src/llama-vocab.cpp
  @@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
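A usage sketch for the array accessors this patch touches (assuming a ggml build exposing gguf.h; error handling elided):

#include <cstdio>
#include "gguf.h"

static void dump_string_array(const struct gguf_context * ctx, const char * key) {
    const int64_t key_id = gguf_find_key(ctx, key);
    if (key_id < 0 || gguf_get_arr_type(ctx, key_id) != GGUF_TYPE_STRING) {
        return;
    }
    const size_t n = gguf_get_arr_n(ctx, key_id);
    for (size_t i = 0; i < n; i++) {
        // get ith C string from array with given key_id
        std::printf("%zu: %s\n", i, gguf_get_arr_str(ctx, key_id, i));
    }
}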
llama/patches/0011-ollama-debug-tensor.patch (view file @ 544b6739)

...
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
   1 file changed, 6 insertions(+)

   diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index ba2a36d9..99509b0c 100644
+index 9ec485cfa..4b2f8b7bd 100644
  --- a/ggml/src/ggml-cpu/ggml-cpu.c
  +++ b/ggml/src/ggml-cpu/ggml-cpu.c
  @@ -15,6 +15,8 @@
...
@@ -20,7 +20,7 @@ index ba2a36d9..99509b0c 100644
   #if defined(_MSC_VER) || defined(__MINGW32__)
   #include <malloc.h> // using malloc.h with MSC/MINGW
   #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2887,6 +2889,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2891,6 +2893,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
       ggml_compute_forward(&params, node);
...
llama/patches/0012-add-ollama-vocab-for-grammar-support.patch (view file @ 544b6739)

...
@@ -10,7 +10,7 @@ Subject: [PATCH] add ollama vocab for grammar support
   3 files changed, 58 insertions(+), 9 deletions(-)

   diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
-index bed706bb..b51cee09 100644
+index bed706bb2..b51cee090 100644
  --- a/src/llama-grammar.cpp
  +++ b/src/llama-grammar.cpp
  @@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
...
@@ -137,7 +137,7 @@ index bed706bb..b51cee09 100644
  +    }
  +}
   diff --git a/src/llama-grammar.h b/src/llama-grammar.h
-index f8c291de..2a3a62db 100644
+index f8c291de9..2a3a62db3 100644
  --- a/src/llama-grammar.h
  +++ b/src/llama-grammar.h
  @@ -6,8 +6,19 @@
...
@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
       const char * grammar_root,
       bool lazy,
   diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index 55d2e355..da34526b 100644
+index 55d2e355f..da34526b1 100644
  --- a/src/llama-sampling.cpp
  +++ b/src/llama-sampling.cpp
  @@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
...
llama/patches/0013-add-argsort-and-cuda-copy-for-i32.patch (view file @ 544b6739)

...
@@ -4,15 +4,15 @@ Date: Thu, 1 May 2025 13:45:12 -0700
   Subject: [PATCH] add argsort and cuda copy for i32

   ---
-  ggml/src/ggml-cpu/ops.cpp            |  43 +++++++++++
-  ggml/src/ggml-cuda/argsort.cu        | 102 ++++++++++++++++++++++++++-
-  ggml/src/ggml-cuda/cpy-utils.cuh     |   6 ++
-  ggml/src/ggml-cuda/cpy.cu            |  43 +++++++++++
-  ggml/src/ggml-metal/ggml-metal.metal |  64 +++++++++++++++++
-  5 files changed, 256 insertions(+), 2 deletions(-)
+  ggml/src/ggml-cpu/ops.cpp            |  43 ++++++++++
+  ggml/src/ggml-cuda/argsort.cu        | 122 ++++++++++++++++++++++++---
+  ggml/src/ggml-cuda/cpy-utils.cuh     |   6 ++
+  ggml/src/ggml-cuda/cpy.cu            |  40 +++++++++
+  ggml/src/ggml-metal/ggml-metal.metal |  64 ++++++++++++++
+  5 files changed, 263 insertions(+), 12 deletions(-)

   diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 1c43865f..31478dd8 100644
+index b52f0f847..902fdad69 100644
  --- a/ggml/src/ggml-cpu/ops.cpp
  +++ b/ggml/src/ggml-cpu/ops.cpp
  @@ -7889,6 +7889,45 @@ static void ggml_compute_forward_argsort_f32(
...
@@ -73,10 +73,10 @@ index 1c43865f..31478dd8 100644
       {
           GGML_ABORT("fatal error");
   diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
-index 607ded85..53b02634 100644
+index 6e7b90d42..08dd30525 100644
  --- a/ggml/src/ggml-cuda/argsort.cu
  +++ b/ggml/src/ggml-cuda/argsort.cu
-@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
+@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
       }
   }
...
@@ -185,19 +185,42 @@ index 607ded85..53b02634 100644
       GGML_ASSERT( dst->type == GGML_TYPE_I32);
       GGML_ASSERT(ggml_is_contiguous(src0));
-@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+@@ -183,18 +277,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
       enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
--    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+-#ifdef GGML_CUDA_USE_CUB
+-    const int ncols_pad = next_power_of_2(ncols);
+-    const size_t shared_mem = ncols_pad * sizeof(int);
+-    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+-
+-    if (shared_mem > max_shared_mem || ncols > 1024) {
+-        ggml_cuda_pool & pool = ctx.pool();
+-        argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
 +    if (src0->type == GGML_TYPE_I32) {
 +        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
 +    } else {
-+        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+      } else {
+-        argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+-    }
++#ifdef GGML_CUDA_USE_CUB
++        const int ncols_pad = next_power_of_2(ncols);
++        const size_t shared_mem = ncols_pad * sizeof(int);
++        const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
++
++        if (shared_mem > max_shared_mem || ncols > 1024) {
++            ggml_cuda_pool & pool = ctx.pool();
++            argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
++        } else {
++            argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
++        }
+ #else
--    argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
-+    argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ #endif
-+    }
++    }
   }
   diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
-index e621cb98..597c0c8b 100644
+index e621cb981..597c0c8b3 100644
  --- a/ggml/src/ggml-cuda/cpy-utils.cuh
  +++ b/ggml/src/ggml-cuda/cpy-utils.cuh
  @@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
...
@@ -211,19 +234,18 @@ index e621cb98..597c0c8b 100644
  +    *dst = *src;
  +}
   diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
-index 746f4396..911220e9 100644
+index 12d5bf776..a0e34030e 100644
  --- a/ggml/src/ggml-cuda/cpy.cu
  +++ b/ggml/src/ggml-cuda/cpy.cu
-@@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
+@@ -251,6 +251,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
-          (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+          (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
   }
  +template <cpy_kernel_t cpy_1>
  +static __global__ void cpy_i32_i32(
  +    const char *cx, char *cdst, const int ne,
  +    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-+    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
-+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
++    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
  +
  +    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
  +
...
@@ -243,39 +265,37 @@ index 746f4396..911220e9 100644
  +    const int64_t i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
  +    const int64_t dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
  +
-+    char * cdst_ptr = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst;
-+    cpy_1(cx + x_offset, cdst_ptr + dst_offset);
++    cpy_1(cx + x_offset, cdst + dst_offset);
  +}
  +
  +static void ggml_cpy_i32_i32_cuda(
  +    const char * cx, char * cdst, const int ne,
  +    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-+    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
-+    cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
++    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
  +
  +    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
  +    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, stream, cdst_indirect, graph_cpynode_index);
++        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, stream);
  +}
  +
-  void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
+  void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
       const int64_t ne = ggml_nelements(src0);
       GGML_ASSERT(ne == ggml_nelements(src1));
-@@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+@@ -332,6 +369,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
-          ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+          ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
       } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-          ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+          ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
  +    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
-+        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
++        // TODO consider converting to template
++        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
       } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
-          ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+          ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
       } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
   diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 74a9aa99..375a0c7f 100644
+index 2c2f01415..50b8071de 100644
  --- a/ggml/src/ggml-metal/ggml-metal.metal
  +++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -4346,8 +4346,72 @@ kernel void kernel_argsort_f32_i32(
+@@ -4467,8 +4467,72 @@ kernel void kernel_argsort_f32_i32(
       }
   }
...
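What the added I32 argsort computes, as a host-side reference (the patch implements the same contract in the CPU, CUDA and Metal backends): dst[i] is the index of the i-th smallest or largest element of the row.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

static std::vector<int32_t> argsort_i32_ref(const std::vector<int32_t> & row, bool ascending) {
    std::vector<int32_t> idx(row.size());
    std::iota(idx.begin(), idx.end(), 0);      // 0, 1, 2, ...
    std::stable_sort(idx.begin(), idx.end(), [&](int32_t a, int32_t b) {
        return ascending ? row[a] < row[b] : row[a] > row[b];
    });
    return idx;
}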
llama/patches/0014-graph-memory-reporting-on-failure.patch (view file @ 544b6739)

...
@@ -11,7 +11,7 @@ Subject: [PATCH] graph memory reporting on failure
   4 files changed, 40 insertions(+), 3 deletions(-)

   diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
-index 2cb150fd..7ab3f019 100644
+index 2cb150fd2..7ab3f0192 100644
  --- a/ggml/include/ggml-alloc.h
  +++ b/ggml/include/ggml-alloc.h
  @@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
...
@@ -23,7 +23,7 @@ index 2cb150fd..7ab3f019 100644
       // Utils
       // Create a buffer and allocate all the tensors in a ggml_context
   diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index f1b74078..c54ff98b 100644
+index f1b740785..c54ff98bf 100644
  --- a/ggml/include/ggml-backend.h
  +++ b/ggml/include/ggml-backend.h
  @@ -318,6 +318,7 @@ extern "C" {
...
@@ -35,7 +35,7 @@ index f1b74078..c54ff98b 100644
       GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
       GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
   diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
-index 929bc448..eee9d3b1 100644
+index c830c0965..363853873 100644
  --- a/ggml/src/ggml-alloc.c
  +++ b/ggml/src/ggml-alloc.c
  @@ -486,6 +486,7 @@ struct node_alloc {
...
@@ -64,7 +64,7 @@ index 929bc448..eee9d3b1 100644
       free(galloc->buffers);
       free(galloc->buf_tallocs);
       free(galloc->node_allocs);
-@@ -869,6 +874,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+@@ -891,6 +896,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
       }
       }
...
@@ -73,7 +73,7 @@ index 929bc448..eee9d3b1 100644
       // reallocate buffers if needed
       for (int i = 0; i < galloc->n_buffers; i++) {
       // if the buffer type is used multiple times, we reuse the same buffer
-@@ -898,14 +905,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+@@ -920,14 +927,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
       ggml_vbuffer_free(galloc->buffers[i]);
       galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
...
@@ -96,7 +96,7 @@ index 929bc448..eee9d3b1 100644
       }
       bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-@@ -1060,6 +1072,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+@@ -1082,6 +1094,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
       return ggml_vbuffer_size(galloc->buffers[buffer_id]);
       }
...
@@ -120,7 +120,7 @@ index 929bc448..eee9d3b1 100644
       static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
   diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 8ba86f82..cb2b9956 100644
+index 8ba86f824..cb2b99562 100644
  --- a/ggml/src/ggml-backend.cpp
  +++ b/ggml/src/ggml-backend.cpp
  @@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
...
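A hypothetical sketch of the reporting idea (names invented; the real patch extends ggml_gallocr_reserve_n and the ggml-backend scheduler): keep the size each buffer would have needed even when allocation fails, so the caller can still report a usable needed-versus-available figure.

#include <cstddef>
#include <vector>

struct reserve_result {
    bool ok;
    std::vector<size_t> buffer_sizes;   // filled in even on failure
};

static reserve_result reserve_buffers(const std::vector<size_t> & required, size_t available) {
    reserve_result res{true, required};
    for (size_t sz : required) {
        if (sz > available) {
            res.ok = false;             // keep going: sizes are still reported
        }
    }
    return res;
}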
llama/patches/0015-ggml-Export-GPU-UUIDs.patch (view file @ 544b6739)

...
@@ -12,7 +12,7 @@ with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
   3 files changed, 63 insertions(+), 6 deletions(-)

   diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index c54ff98b..229bf387 100644
+index c54ff98bf..229bf387b 100644
  --- a/ggml/include/ggml-backend.h
  +++ b/ggml/include/ggml-backend.h
  @@ -158,6 +158,7 @@ extern "C" {
...
@@ -24,7 +24,7 @@ index c54ff98b..229bf387 100644
       size_t memory_total;
       // device type
   diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index c0b1e4c1..5b852f69 100644
+index aefc6935e..cc201afff 100644
  --- a/ggml/src/ggml-cuda/ggml-cuda.cu
  +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
  @@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
...
@@ -110,7 +110,7 @@ index c0b1e4c1..5b852f69 100644
       std::string device_name(prop.name);
       if (device_name == "NVIDIA GeForce MX450") {
           turing_devices_without_mma.push_back({ id, device_name });
-@@ -3276,6 +3323,7 @@ struct ggml_backend_cuda_device_context {
+@@ -3268,6 +3315,7 @@ struct ggml_backend_cuda_device_context {
       std::string name;
       std::string description;
       std::string pci_bus_id;
...
@@ -118,7 +118,7 @@ index c0b1e4c1..5b852f69 100644
       };
       static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -3288,6 +3336,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
+@@ -3280,6 +3328,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
       return ctx->description.c_str();
       }
...
@@ -130,7 +130,7 @@ index c0b1e4c1..5b852f69 100644
       static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
           ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
           ggml_cuda_set_device(ctx->device);
-@@ -3304,6 +3357,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3296,6 +3349,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
       props->name = ggml_backend_cuda_device_get_name(dev);
       props->description = ggml_backend_cuda_device_get_description(dev);
...
@@ -138,7 +138,7 @@ index c0b1e4c1..5b852f69 100644
       props->type = ggml_backend_cuda_device_get_type(dev);
       props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
       ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
-@@ -3873,6 +3927,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+@@ -3869,6 +3923,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
       cudaDeviceProp prop;
       CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
       dev_ctx->description = prop.name;
...
@@ -147,7 +147,7 @@ index c0b1e4c1..5b852f69 100644
       char pci_bus_id[16] = {};
       snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
   diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
-index bf096227..f2ff9f32 100644
+index bf0962274..f2ff9f322 100644
  --- a/ggml/src/ggml-metal/ggml-metal.cpp
  +++ b/ggml/src/ggml-metal/ggml-metal.cpp
  @@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
...
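A sketch of surfacing a CUDA device UUID in the format nvidia-smi prints (assumes a CUDA toolkit where cudaDeviceProp carries a 16-byte uuid; the patch itself routes such an ID through ggml's device properties):

#include <cstdio>
#include <string>
#include <cuda_runtime.h>

static std::string device_uuid(int device) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);   // error handling elided
    const unsigned char * b = (const unsigned char *) prop.uuid.bytes;
    char buf[48];
    std::snprintf(buf, sizeof(buf),
                  "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
                  b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
                  b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]);
    return buf;
}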
llama/patches/0016-add-C-API-for-mtmd_input_text.patch (view file @ 544b6739)

...
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
   2 files changed, 13 insertions(+)

   diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
-index 4d487581..35a0d25e 100644
+index 4d487581a..35a0d25ed 100644
  --- a/tools/mtmd/mtmd.cpp
  +++ b/tools/mtmd/mtmd.cpp
  @@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
...
@@ -31,7 +31,7 @@ index 4d487581..35a0d25e 100644
       return "<__media__>";
       }
   diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
-index f4ea07d3..cf287224 100644
+index f4ea07d3a..cf287224b 100644
  --- a/tools/mtmd/mtmd.h
  +++ b/tools/mtmd/mtmd.h
  @@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
...
llama/patches/0017-no-power-throttling-win32-with-gnuc.patch (view file @ 544b6739)

...
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
   1 file changed, 1 insertion(+), 1 deletion(-)

   diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 99509b0c..b13a491d 100644
+index 4b2f8b7bd..046646282 100644
  --- a/ggml/src/ggml-cpu/ggml-cpu.c
  +++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -2437,7 +2437,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
+@@ -2441,7 +2441,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
       // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
       // all our threads onto the first 4 cores which results in terrible performance with
       // n_threads > 4
...
llama/patches/0018-BF16-macos-version-guard.patch
View file @
544b6739
...
@@ -9,7 +9,7 @@ Only enable BF16 on supported MacOS versions (v14+)
...
@@ -9,7 +9,7 @@ Only enable BF16 on supported MacOS versions (v14+)
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 052efb7a..b47dc787 100644
index 052efb7ac..b47dc7879 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
...
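The hunk lands in ggml_metal_init, where the patch gates BF16 support on the OS version at runtime. A rough sketch of the shape of such a guard, assuming clang's __builtin_available and a device capability flag named here for illustration; the patch's exact condition may differ:

#include <stdbool.h>

// Sketch: only allow BF16 kernels on macOS 14 (Sonoma) or newer,
// and only when the device itself reports bfloat support.
static bool metal_bf16_allowed(bool device_has_bf16) {
    if (__builtin_available(macOS 14.0, *)) {
        return device_has_bf16;
    }
    return false; // older macOS: keep BF16 disabled
}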
llama/patches/0019-ggml-Add-batch-size-hint.patch
View file @ 544b6739
...
@@ -178,19 +178,19 @@ index 3191faaa4..32f14c811 100644
static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5b852f690..c555cd30f 100644
index cc201afff..02d413467 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2684,7 +2684,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
@@ -2693,7 +2693,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
- bool use_cuda_graph) {
+ int batch_size, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
@@ -2718,24 +2718,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
@@ -2726,24 +2726,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
#endif
}
...
@@ -240,8 +240,8 @@ index 5b852f690..c555cd30f 100644
+ }
}
if (node->op == GGML_OP_CPY) {
if (!use_cuda_graph) {
@@ -3132,7 +3142,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3128,7 +3138,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
...
@@ -250,12 +250,12 @@ index 5b852f690..c555cd30f 100644
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
@@ -3170,7 +3180,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3166,7 +3176,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
- use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
+ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph);
+ use_cuda_graph = check_node_graph_compatibility(cgraph, batch_size, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
...
@@ -278,10 +278,10 @@ index f2ff9f322..05ff6a5a6 100644
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ed83236f4..bd3ece516 100644
index 216dc167c..3a6bbe564 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12015,7 +12015,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
@@ -12357,7 +12357,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
...
@@ -290,7 +290,7 @@ index ed83236f4..bd3ece516 100644
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -12211,6 +12211,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
@@ -12561,6 +12561,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
return GGML_STATUS_SUCCESS;
UNUSED(backend);
...
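Read together, these hunks thread a batch_size hint from graph_compute down into each backend, so the CUDA backend can decline graph capture for batched (prompt) evaluation, where per-step re-capture would cost more than replay saves. An illustrative sketch of that gating follows; the signature matches the diff, but the body is an assumption, not the real implementation:

// Illustrative gate: CUDA graph capture only pays off for repeated
// single-token decode steps, so batched evaluation opts out early.
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
                                           int batch_size, bool use_cuda_graph) {
    if (batch_size > 1) {
        return false; // prompt processing: shapes change every step
    }
    // ... the real function also walks cgraph->nodes and rejects ops
    // that cannot be captured into a CUDA graph ...
    (void) cgraph;
    return use_cuda_graph;
}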
llama/patches/0020-Disable-ggml-blas-on-macos-v13-and-older.patch
View file @ 544b6739
...
@@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+)
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 5b888cdd..2a9ff7f6 100644
index 88d088952..6a38a51a2 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
@@ -507,6 +507,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};
ggml_backend_reg_t ggml_backend_blas_reg(void) {
...
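The five inserted lines land in ggml_backend_blas_reg(), so the registration is presumably suppressed at runtime on macOS 13 and older. A hedged sketch of that shape; the version-check helper named here is an assumption, and the patch's actual mechanism may differ:

// Sketch: report no BLAS backend on macOS 13 and older.
ggml_backend_reg_t ggml_backend_blas_reg(void) {
    if (!ggml_backend_blas_os_supported()) { // assumed helper wrapping a
        return nullptr;                      // "macOS >= 14" version check
    }
    static ggml_backend_reg reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_blas_reg_i,
        /* .context     = */ nullptr,
    };
    return &reg;
}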
llama/patches/0021-fix-mtmd-audio.cpp-build-on-windows.patch
View file @ 544b6739
...
@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 4d053895..84bdc277 100644
index 4d053895c..84bdc2777 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,6 +1,6 @@
...
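The hunk swaps a single line at the very top of mtmd-audio.cpp, which is characteristic of an include-order fix. Purely as an illustration of the usual Windows hazard such one-liners address, and not necessarily this patch's exact change: MSVC only declares M_PI when _USE_MATH_DEFINES is defined before the first math header is included.

// Illustration of the class of fix: the define must come before the
// first <cmath>/<math.h> include, or M_PI is undeclared under MSVC.
#define _USE_MATH_DEFINES
#include <cmath>

static const float SAMPLE_PHASE_STEP = 2.0f * (float) M_PI / 400.0f;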