Unverified Commit 544b6739 authored by Daniel Hiltgen, committed by GitHub

ggml update to b6840 (#12791)

parent c4ba257c
@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 639fecbd3..a7ce6f8e1 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1812,16 +1812,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -31,7 +31,7 @@ index 7fffd171..0b6edaf4 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1993,7 +1984,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false;
} else {
...
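The patch above makes llama_vocab::impl::load() log a warning and fall back to the default pre-tokenizer instead of throwing when a model names an unrecognized tokenizer_pre value. A minimal, self-contained sketch of that behavior, with known_pre_type() as an invented stand-in for llama.cpp's long else-if chain:

    #include <cstdio>
    #include <string>

    // Sketch only: known_pre_type() is hypothetical; the real code compares
    // tokenizer_pre against a long chain of known pre-tokenizer names.
    static bool known_pre_type(const std::string & pre) {
        return pre == "default" || pre == "llama3" || pre == "gpt-2"; // abbreviated list
    }

    static std::string resolve_pre_type(const std::string & tokenizer_pre) {
        if (!known_pre_type(tokenizer_pre)) {
            fprintf(stderr, "load: unknown pre-tokenizer type '%s', falling back to 'default'\n",
                    tokenizer_pre.c_str());
            return "default"; // warn and continue instead of throwing
        }
        return tokenizer_pre;
    }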
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f2abf8852..c984e6282 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@
@@ -33,7 +33,7 @@ index 98e68af2..6699b75a 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
enum ffn_op_type {
@@ -2774,7 +2787,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index 98e68af2..6699b75a 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -2801,7 +2836,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
...
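This clip.cpp patch deals with opening model files whose paths contain wide (non-ASCII) characters on Windows, where the narrow-string stream constructors go through the ANSI code page. A hedged sketch of the usual fix, converting UTF-8 to UTF-16 before opening (the helper name is illustrative, not the patch's):

    #ifdef _WIN32
    #include <windows.h>
    #endif
    #include <filesystem>
    #include <fstream>
    #include <string>

    // On Windows, convert the UTF-8 path to UTF-16 so the wide-path overload
    // opens it correctly; elsewhere the narrow path works as-is.
    static std::ifstream open_utf8(const std::string & fname) {
    #ifdef _WIN32
        int n = MultiByteToWideChar(CP_UTF8, 0, fname.data(), (int) fname.size(), nullptr, 0);
        std::wstring w(n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, fname.data(), (int) fname.size(), &w[0], n);
        return std::ifstream(std::filesystem::path(w), std::ios::binary);
    #else
        return std::ifstream(fname, std::ios::binary);
    #endif
    }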
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+), 1 deletion(-)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8ca769c5f..ab262ec0c 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,7 +26,7 @@ index 869e4dcc..9f6b6ad2 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -183,6 +184,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@@ -34,7 +34,7 @@ index 869e4dcc..9f6b6ad2 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1901,6 +1903,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +59,7 @@ index 869e4dcc..9f6b6ad2 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -2469,6 +2489,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +68,10 @@ index 869e4dcc..9f6b6ad2 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index dea725c1a..ea2b4ffb9 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -86,6 +86,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
@@ -79,7 +79,7 @@ index c3ae7165..dc7a362a 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -187,6 +188,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -87,7 +87,7 @@ index c3ae7165..dc7a362a 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -436,6 +438,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -96,7 +96,7 @@ index c3ae7165..dc7a362a 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index db65d69ea..b6bf6bbf2 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
@@ -115,7 +115,7 @@ index db65d69e..b6bf6bbf 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6fcf91b7d..24569a258 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -64,6 +64,8 @@ struct llama_hparams {
@@ -127,7 +127,7 @@ index 4e7f73ec..80582728 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -250,6 +252,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
@@ -138,7 +138,7 @@ index 4e7f73ec..80582728 100644
bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index aa3a65f87..ee303bd58 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -466,7 +466,7 @@ namespace GGUFMeta {
@@ -151,10 +151,10 @@ index aa3a65f8..ee303bd5 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2a83d6627..54621ea39 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1890,6 +1890,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -176,7 +176,7 @@ index 36d495d6..74e1d162 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5224,6 +5239,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -211,7 +211,7 @@ index 36d495d6..74e1d162 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16515,6 +16558,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}
};
@@ -377,7 +377,7 @@ index 36d495d6..74e1d162 100644
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
@@ -20096,6 +20298,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
@@ -388,7 +388,7 @@ index 36d495d6..74e1d162 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -20331,6 +20537,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
@@ -397,7 +397,7 @@ index 36d495d6..74e1d162 100644
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index 248f85410..4a7924aaa 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
@@ -408,7 +408,7 @@ index 7f48662f..ec3fbd33 100644
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
@@ -390,6 +391,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
...
@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a7ce6f8e1..8064dc197 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -25,7 +25,7 @@ index 0b6edaf4..3de95c67 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 65f366517..ce336a228 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
...
@@ -8,7 +8,7 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dd9b51a9e..d88f43209 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -308,7 +308,7 @@ private:
...
@@ -11,10 +11,10 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index e96b5c403..a55d9b280 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -179,7 +179,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
@@ -23,7 +23,7 @@ index 136afec7..f794d9cf 100644
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -230,7 +230,7 @@ struct ggml_backend_registry {
}
}
@@ -32,7 +32,7 @@ index 136afec7..f794d9cf 100644
if (!reg) {
return;
}
@@ -241,15 +241,20 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@@ -56,7 +56,7 @@ index 136afec7..f794d9cf 100644
}
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -293,7 +298,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
@@ -65,7 +65,7 @@ index 136afec7..f794d9cf 100644
return reg;
}
@@ -316,7 +321,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
@@ -74,7 +74,7 @@ index 136afec7..f794d9cf 100644
devices.end());
// remove backend
@@ -374,7 +379,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());
...
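Per its description, this backend-registry patch makes sure that when several variants of the same backend are available, the one with the fastest acceleration is enumerated first. A hedged sketch of that ordering idea (the struct and field names are invented for illustration, not ggml's):

    #include <algorithm>
    #include <string>
    #include <vector>

    struct dev_entry {
        std::string name;
        int score; // assumption: higher = faster variant of the same device
    };

    // Stable sort keeps registration order among equally-scored devices.
    static void order_by_score(std::vector<dev_entry> & devices) {
        std::stable_sort(devices.begin(), devices.end(),
            [](const dev_entry & a, const dev_entry & b) { return a.score > b.score; });
    }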
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ba281b8e6..ead235878 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -314,6 +314,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endif()
ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,7 +19,7 @@ index 892c2331..09fdf5fc 100644
endfunction()
ggml_add_backend(CPU)
@@ -324,6 +325,7 @@ if (GGML_CPU_ALL_VARIANTS)
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()
...
@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ead235878..f9a6587f1 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -334,10 +334,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
...
@@ -13,7 +13,7 @@ such as vocab fields
3 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee20206..3efb22f01 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -114,6 +114,7 @@ extern "C" {
@@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
// get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 8cc4ef1cf..d950dbdf5 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
@@ -53,7 +53,7 @@ index 8cc4ef1c..d950dbdf 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8064dc197..31f49801c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
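The gguf hunks above tighten how array metadata (such as vocab fields) is read. Using only the public gguf.h calls visible in this patch, a type-checked accessor might look like the following hedged sketch (the wrapper itself is illustrative, not the patch's exact addition):

    #include <cassert>
    #include "gguf.h"

    // Assert the stored element type before handing out the raw array pointer.
    static const void * gguf_get_arr_data_checked(const struct gguf_context * ctx,
                                                  int64_t key_id, enum gguf_type expect) {
        assert(gguf_get_arr_type(ctx, key_id) == expect);
        return gguf_get_arr_data(ctx, key_id);
    }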
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 9ec485cfa..4b2f8b7bd 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index ba2a36d9..99509b0c 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2891,6 +2893,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
...
@@ -10,7 +10,7 @@ Subject: [PATCH] add ollama vocab for grammar support
3 files changed, 58 insertions(+), 9 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bed706bb2..b51cee090 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -137,7 +137,7 @@ index bed706bb..b51cee09 100644
+ }
+}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index f8c291de9..2a3a62db3 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -6,8 +6,19 @@
@@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 55d2e355f..da34526b1 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
...
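This patch lets grammar sampling consult an ollama-supplied vocabulary rather than a llama_vocab. The shape of such an adapter, reduced to a hedged sketch (the real struct added in llama-grammar.h differs in detail):

    #include <cstdint>
    #include <map>
    #include <string>

    // Minimal token table a grammar engine needs: id -> text piece, plus
    // which ids count as end-of-generation.
    struct ollama_vocab_sketch {
        std::map<uint32_t, std::string> token_to_piece;
        std::map<uint32_t, bool>        eog_ids;

        const std::string & piece(uint32_t id) const { return token_to_piece.at(id); }
        bool is_eog(uint32_t id) const {
            auto it = eog_ids.find(id);
            return it != eog_ids.end() && it->second;
        }
    };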
@@ -4,15 +4,15 @@ Date: Thu, 1 May 2025 13:45:12 -0700
Subject: [PATCH] add argsort and cuda copy for i32
---
ggml/src/ggml-cpu/ops.cpp | 43 ++++++++++
ggml/src/ggml-cuda/argsort.cu | 122 ++++++++++++++++++++++++---
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
ggml/src/ggml-cuda/cpy.cu | 40 +++++++++
ggml/src/ggml-metal/ggml-metal.metal | 64 ++++++++++++++
5 files changed, 263 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index b52f0f847..902fdad69 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7889,6 +7889,45 @@ static void ggml_compute_forward_argsort_f32(
@@ -73,10 +73,10 @@ index 1c43865f..31478dd8 100644
{
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 6e7b90d42..08dd30525 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
}
}
@@ -185,19 +185,42 @@ index 607ded85..53b02634 100644
GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
@@ -183,18 +277,22 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
- argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); -#ifdef GGML_CUDA_USE_CUB
- const int ncols_pad = next_power_of_2(ncols);
- const size_t shared_mem = ncols_pad * sizeof(int);
- const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
-
- if (shared_mem > max_shared_mem || ncols > 1024) {
- ggml_cuda_pool & pool = ctx.pool();
- argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ if (src0->type == GGML_TYPE_I32) { + if (src0->type == GGML_TYPE_I32) {
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream); + argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+ } else { } else {
+ argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); - argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
- }
+#ifdef GGML_CUDA_USE_CUB
+ const int ncols_pad = next_power_of_2(ncols);
+ const size_t shared_mem = ncols_pad * sizeof(int);
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+ if (shared_mem > max_shared_mem || ncols > 1024) {
+ ggml_cuda_pool & pool = ctx.pool();
+ argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ } else {
+ argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ }
#else
- argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
+ argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
#endif
+ } + }
}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index e621cb981..597c0c8b3 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
@@ -211,19 +234,18 @@ index e621cb98..597c0c8b 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 12d5bf776..a0e34030e 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -251,6 +251,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
+template <cpy_kernel_t cpy_1> +template <cpy_kernel_t cpy_1>
+static __global__ void cpy_i32_i32( +static __global__ void cpy_i32_i32(
+ const char *cx, char *cdst, const int ne, + const char *cx, char *cdst, const int ne,
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+ const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+ +
+ const int64_t i = blockDim.x * blockIdx.x + threadIdx.x; + const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+ +
...@@ -243,39 +265,37 @@ index 746f4396..911220e9 100644 ...@@ -243,39 +265,37 @@ index 746f4396..911220e9 100644
+ const int64_t i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int64_t i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+ const int64_t dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + const int64_t dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+ +
+ char * cdst_ptr = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst; + cpy_1(cx + x_offset, cdst + dst_offset);
+ cpy_1(cx + x_offset, cdst_ptr + dst_offset);
+} +}
+ +
+
+static void ggml_cpy_i32_i32_cuda( +static void ggml_cpy_i32_i32_cuda(
+ const char * cx, char * cdst, const int ne, + const char * cx, char * cdst, const int ne,
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+ const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
+ +
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> + cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, stream, cdst_indirect, graph_cpynode_index); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, stream);
+} +}
+ +
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -332,6 +369,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+ // TODO consider converting to template
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 2c2f01415..50b8071de 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4467,8 +4467,72 @@ kernel void kernel_argsort_f32_i32(
}
}
...
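The CPU side of this patch adds an I32 case alongside ggml_compute_forward_argsort_f32. An index sort over int32 data, written as a standalone hedged sketch rather than the exact ops.cpp code:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>

    // Fill dst with 0..n-1, then order the indices by the values they point at.
    static void argsort_i32(const int32_t * src, int32_t * dst, int64_t n, bool ascending) {
        std::iota(dst, dst + n, 0);
        std::sort(dst, dst + n, [&](int32_t a, int32_t b) {
            return ascending ? src[a] < src[b] : src[a] > src[b];
        });
    }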
@@ -11,7 +11,7 @@ Subject: [PATCH] graph memory reporting on failure
4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd2..7ab3f0192 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
@@ -23,7 +23,7 @@ index 2cb150fd..7ab3f019 100644
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index f1b740785..c54ff98bf 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -318,6 +318,7 @@ extern "C" {
@@ -35,7 +35,7 @@ index f1b74078..c54ff98b 100644
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index c830c0965..363853873 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -486,6 +486,7 @@ struct node_alloc {
@@ -64,7 +64,7 @@ index 929bc448..eee9d3b1 100644
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -891,6 +896,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
@@ -73,7 +73,7 @@ index 929bc448..eee9d3b1 100644
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -920,14 +927,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -96,7 +96,7 @@ index 929bc448..eee9d3b1 100644
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -1082,6 +1094,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
@@ -120,7 +120,7 @@ index 929bc448..eee9d3b1 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8ba86f824..cb2b99562 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
...
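The allocator patch records how much memory each buffer would have needed when graph reservation fails, so callers can report a useful number instead of a bare failure. The reporting idea in a hedged sketch (names invented; the actual API added to ggml-alloc.h is not fully visible above):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct buffer_status {
        size_t attempted_size; // bytes the buffer tried to allocate
        bool   allocated;      // false if the allocation failed
    };

    static void report_reserve_failure(const std::vector<buffer_status> & bufs) {
        for (size_t i = 0; i < bufs.size(); i++) {
            if (!bufs[i].allocated) {
                fprintf(stderr, "compute buffer %zu: needed %zu bytes\n",
                        i, bufs[i].attempted_size);
            }
        }
    }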
@@ -12,7 +12,7 @@ with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index c54ff98bf..229bf387b 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,7 @@ extern "C" {
@@ -24,7 +24,7 @@ index c54ff98b..229bf387 100644
size_t memory_total;
// device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aefc6935e..cc201afff 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -110,7 +110,7 @@ index c0b1e4c1..5b852f69 100644
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -3268,6 +3315,7 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
@@ -118,7 +118,7 @@ index c0b1e4c1..5b852f69 100644
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3280,6 +3328,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
@@ -130,7 +130,7 @@ index c0b1e4c1..5b852f69 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -3296,6 +3349,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -138,7 +138,7 @@ index c0b1e4c1..5b852f69 100644
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3869,6 +3923,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
@@ -147,7 +147,7 @@ index c0b1e4c1..5b852f69 100644
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index bf0962274..f2ff9f322 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
...
@@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 4d487581a..35a0d25ed 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
@@ -31,7 +31,7 @@ index 4d487581..35a0d25e 100644
return "<__media__>";
}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3a..cf287224b 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
...
@@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 4b2f8b7bd..046646282 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2441,7 +2441,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4
...
@@ -9,7 +9,7 @@ Only enable BF16 on supported MacOS versions (v14+)
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 052efb7ac..b47dc7879 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
...
@@ -178,19 +178,19 @@ index 3191faaa4..32f14c811 100644
static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cc201afff..02d413467 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2693,7 +2693,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
- bool use_cuda_graph) {
+ int batch_size, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
@@ -2726,24 +2726,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
#endif
}
@@ -240,8 +240,8 @@ index 5b852f690..c555cd30f 100644
+ }
}
if (!use_cuda_graph) {
@@ -3128,7 +3138,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
@@ -250,12 +250,12 @@ index 5b852f690..c555cd30f 100644
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
@@ -3166,7 +3176,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
+ use_cuda_graph = check_node_graph_compatibility(cgraph, batch_size, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
@@ -278,10 +278,10 @@ index f2ff9f322..05ff6a5a6 100644
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 216dc167c..3a6bbe564 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12357,7 +12357,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
@@ -290,7 +290,7 @@ index ed83236f4..bd3ece516 100644
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -12561,6 +12561,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
return GGML_STATUS_SUCCESS;
UNUSED(backend);
...
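The hunks above thread a batch_size argument into check_node_graph_compatibility so CUDA graph capture can be skipped when it would not pay off. The gating idea as a hedged sketch (the threshold below is an assumption, not the patch's actual value):

    // Hedged sketch: capture a CUDA graph only for small (decode-sized) batches,
    // where replaying the captured graph saves launch overhead every step.
    static bool use_cuda_graph_for_batch(bool nodes_compatible, int batch_size) {
        const int graph_batch_limit = 1; // assumed threshold, for illustration
        return nodes_compatible && batch_size <= graph_batch_limit;
    }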
@@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+)
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 88d088952..6a38a51a2 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -507,6 +507,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};
ggml_backend_reg_t ggml_backend_blas_reg(void) {
...
@@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 4d053895c..84bdc2777 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -1,6 +1,6 @@
...