OpenDAS / ollama / Commits

Commit 8dd12c87 (unverified), authored May 01, 2025 by Jeffrey Morgan, committed by GitHub May 01, 2025

llama: update to commit e1e8e099 (#10513)

parent e6d2d041

Changes: 68
Showing 20 changed files with 187 additions and 202 deletions (+187, -202)
llama/llama.cpp/src/llama-sampling.cpp (+2, -1)
llama/llama.cpp/src/llama-vocab.cpp (+2, -1)
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch (+12, -12)
llama/patches/0002-pretokenizer.patch (+2, -2)
llama/patches/0003-embeddings.patch (+4, -4)
llama/patches/0004-clip-unicode.patch (+5, -5)
llama/patches/0005-solar-pro.patch (+21, -21)
llama/patches/0006-add-mllama-support.patch (+65, -87)
llama/patches/0007-add-unpad-operator.patch (+28, -28)
llama/patches/0008-fix-deepseek-deseret-regex.patch (+1, -1)
llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch (+2, -2)
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch (+10, -10)
llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch (+6, -6)
llama/patches/0013-remove-amx.patch (+7, -7)
llama/patches/0014-fix-string-arr-kv-loading.patch (+1, -1)
llama/patches/0015-ollama-debug-tensor.patch (+2, -2)
llama/patches/0016-add-model-quantizations.patch (+8, -8)
llama/patches/0017-add-ollama-vocab-for-grammar-support.patch (+3, -3)
ml/backend/ggml/ggml/include/ggml-cpu.h (+5, -0)
ml/backend/ggml/ggml/include/ggml-rpc.h (+1, -1)
llama/llama.cpp/src/llama-sampling.cpp
...
...
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// }
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }

     k = std::min(k, (int) cur_p->size);
...
...
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
}
+        cur_p->sorted = true;
}
     cur_p->size = k;
}
...
...
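The two hunks above make llama_sampler_top_k_impl treat a non-positive k as an early return instead of clamping it to the whole candidate list, and mark the candidate array as sorted once the selection finishes. A minimal sketch of what that means for callers of the public sampler API (illustrative only; llama_sampler_init_top_k is the existing llama.cpp entry point):

    // sketch: behaviour of the updated top-k sampler
    struct llama_sampler * top40 = llama_sampler_init_top_k(40); // keep the 40 most likely candidates
    struct llama_sampler * skip  = llama_sampler_init_top_k(0);  // k <= 0 now returns early, leaving the
                                                                 // candidate list untouched and unsorted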
llama/llama.cpp/src/llama-vocab.cpp
...
...
@@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3"   ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3") {
+                tokenizer_pre == "falcon3"  ||
+                tokenizer_pre == "pixtral") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
...
...
llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
...
...
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index a7febef7..31750b6f 100644
+index 9fb2134f..04ce764e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
...
...
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 266d8af4..12886cd3 100644
+index d92392ed..425524d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
free(ctx);
...
...
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index a0667b7d..bd83adc5 100644
+index 140a775f..e33c4ba0 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
delete ctx;
...
...
@@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 1de34c96..4600f61e 100644
+index 66b6f2cc..e3e6deae 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device);
delete ctx;
...
...
@@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
...
...
@@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
}
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
...
...
@@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 39f3cd34..c569a8a5 100644
+index c0bdb9e1..03d03064 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
...
...
@@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
...
...
llama/patches/0002-pretokenizer.patch
...
...
@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 48060517..a35b498c 100644
+index 50ded286..a9ee9f03 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
...
@@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
-@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
} else {
...
...
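For reference, the behaviour this patch keeps in place is a warn-and-fall-back path in llama_vocab::impl::load: an unrecognized tokenizer.ggml.pre value is logged and mapped to the default pre-tokenizer rather than aborting model load. A rough sketch of that shape (not the literal patch body; the exact log wording is an assumption):

    } else {
        // assumed wording; the point is to warn and continue instead of throwing
        LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
        pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }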
llama/patches/0003-embeddings.patch
...
...
@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 983385f8..32f59819 100644
+index 5a2eef9b..9c1fe93f 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0;
// count outputs
...
...
@@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
-@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
...
...
@@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
if (t_embd && res->get_embd_pooled()) {
-@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
...
...
llama/patches/0004-clip-unicode.patch
...
...
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 75970615..d57b4bd6 100644
+index ad3e7df1..b3218c78 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
-@@ -29,6 +29,19 @@
- #include <limits>
+@@ -30,6 +30,19 @@
+ #include <array>
 #include <numeric>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
...
...
@@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
-@@ -1430,7 +1443,29 @@ struct clip_model_loader {
+@@ -1971,7 +1984,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
...
...
@@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
-@@ -1457,7 +1492,11 @@ struct clip_model_loader {
+@@ -1998,7 +2033,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
...
...
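The patch body guards the Windows-only includes shown above and widens UTF-8 paths before opening model files, so clip.cpp can load models from paths containing wide characters. A self-contained sketch of that conversion (hypothetical helper name, standard Win32 API):

    #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>
    #include <string>

    // hypothetical helper: widen a UTF-8 path so it can be passed to _wfopen / std::ifstream
    static std::wstring utf8_to_utf16(const std::string & s) {
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), nullptr, 0);
        std::wstring w(n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
        return w;
    }
    #endif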
llama/patches/0005-solar-pro.patch
...
...
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 62e1480b..f754bc8f 100644
+index f2bc8ca7..5ab3f572 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
-@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
...
...
@@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
-@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
...
...
@@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
...
...
@@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
-@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...
...
@@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 98ca00a1..439aaeab 100644
+index 41a023da..525c1b7d 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
-@@ -72,6 +72,7 @@ enum llm_arch {
+@@ -73,6 +73,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
...
...
@@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
-@@ -144,6 +145,7 @@ enum llm_kv {
+@@ -146,6 +147,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
...
...
@@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-@@ -344,6 +346,7 @@ enum llm_tensor {
+@@ -346,6 +348,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
...
...
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 80fcd65d..6e278945 100644
+index 7ee6a5b7..48dce407 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -55,6 +55,8 @@ struct llama_hparams {
...
...
@@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -153,6 +155,9 @@ struct llama_hparams {
+@@ -154,6 +156,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
...
...
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 6b7bfecf..aba42819 100644
+index 822e2bb2..572378c9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
...
...
@@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...
...
@@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
+@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};
...
...
@@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
-@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
+@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
...
...
@@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
-@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
...
...
@@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
-index fd82d106..5865d5e9 100644
+index 95eca002..856e6042 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
-@@ -62,6 +62,7 @@ enum llm_type {
+@@ -64,6 +64,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
-@@ -307,6 +308,8 @@ struct llama_layer {
+@@ -311,6 +312,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
...
...
llama/patches/0006-add-mllama-support.patch
This diff is collapsed.
llama/patches/0007-add-unpad-operator.patch
...
...
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index 8fcc16df..d19fc167 100644
+index 1b8603e7..53ef31b2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
-@@ -488,6 +488,7 @@ extern "C" {
+@@ -489,6 +489,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
...
...
@@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
-@@ -1757,6 +1758,15 @@ extern "C" {
+@@ -1777,6 +1778,15 @@ extern "C" {
int p0,
int p1);
...
...
@@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 50400328..432942bf 100644
+index 64405449..34624cca 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
+@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
...
...
@@ -60,7 +60,7 @@ index 50400328..432942bf 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
-@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
...
...
@@ -69,10 +69,10 @@ index 50400328..432942bf 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
-index 6050147b..66b8da68 100644
+index 7413192b..becdae07 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
-@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
+@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
...
...
@@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
-index 410a3720..3eca1cf8 100644
+index dc081b9e..a7125555 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
-@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
+@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
...
...
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 31750b6f..0fef9522 100644
+index 04ce764e..491acccb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
...
...
@@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
-@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
+@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
...
...
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 12886cd3..b2e95a66 100644
+index 425524d0..112abef6 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
...
...
@@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
+@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...
...
@@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
-@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
...
...
@@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
-@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
+@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
...
...
@@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 8d6e99e6..71f0f97f 100644
+index 9f4147e9..6ceb3cef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
...
...
@@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
device char * dst,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index 950772c7..2276b631 100644
+index 7654ae17..3c57aff8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
-@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
...
...
@@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
-@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
--static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
-@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
...
...
@@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
-@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
--static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
-@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
+@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
...
...
llama/patches/0008-fix-deepseek-deseret-regex.patch
...
...
@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index a35b498c..032019c9 100644
+index a9ee9f03..1306864e 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
...
llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index 90679822..56043678 100644
+index 5b3059c2..656b3eca 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
-@@ -346,7 +346,7 @@ private:
+@@ -349,7 +349,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
...
...
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
...
...
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 0343ba8a..4b3e6a83 100644
+index cd06ad91..77177c5e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
-@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
+@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
...
...
@@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
#if 0
// CPU defrag
//
-@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
...
...
@@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
-@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
...
...
@@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
#endif
return res;
-@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
...
...
@@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
-@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
+@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
...
...
@@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
}
{
-@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
+@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
...
...
@@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
}
enum llama_pooling_type llama_context::pooling_type() const {
-@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
...
...
@@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
-index baa03276..a59ff8fd 100644
+index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
...
...
@@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
#include "ggml-cpp.h"
-@@ -180,7 +181,8 @@ private:
+@@ -179,7 +180,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
...
...
llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index f00700da..91d6a7d5 100644
+index 43d9fc4f..4c0d3824 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
-@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
...
...
@@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644
endfunction()
ggml_add_backend(CPU)
-@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
-        ggml_add_cpu_backend_variant(sandybridge AVX)
-        ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
-        ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
+        ggml_add_cpu_backend_variant(x64)
+        ggml_add_cpu_backend_variant(sse42       SSE42)
+        ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
llama/patches/0013-remove-amx.patch
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
-Date: Tue, 8 Apr 2025 20:33:01 -0700
+Date: Thu, 1 May 2025 15:05:08 -0700
Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
...
...
@@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 91d6a7d5..d6b393a2 100644
+index 4c0d3824..79c26312 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
-@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
-         ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
-         ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-         ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
+         ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+         ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+         ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
--        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+-        ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
...
...
llama/patches/0014-fix-string-arr-kv-loading.patch
...
...
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 032019c9..ba37df35 100644
+index 1306864e..d6515ff6 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
...
llama/patches/0015-ollama-debug-tensor.patch
...
...
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 432942bf..6d4abe4c 100644
+index 34624cca..59bd3c62 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
...
...
@@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(¶ms, node);
...
...
llama/patches/0016-add-model-quantizations.patch
...
...
@@ -13,10 +13,10 @@ models not supported in llama.cpp
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 0568565f..dd01df60 100644
+index eb7b5325..df42d1a5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
-@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
...
...
@@ -24,7 +24,7 @@ index 0568565f..dd01df60 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
-@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
...
...
@@ -48,10 +48,10 @@ index 0568565f..dd01df60 100644
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 6a989034..b6227eeb 100644
+index bc8a4f0b..bda9d071 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
-@@ -75,6 +75,7 @@ enum llm_arch {
+@@ -76,6 +76,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
...
...
@@ -60,10 +60,10 @@ index 6a989034..b6227eeb 100644
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index d051696c..c8374159 100644
+index 9d099f11..ef70486d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
-@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
...
...
@@ -71,7 +71,7 @@ index d051696c..c8374159 100644
default: throw std::runtime_error("unsupported model architecture");
}
-@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
...
...
llama/patches/0021-add-ollama-vocab-for-grammar-support.patch → llama/patches/0017-add-ollama-vocab-for-grammar-support.patch
...
...
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
-index d1497985..b1a9dca3 100644
+index c0a5f934..75731053 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
-@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
+@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
...
...
@@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
-@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
+@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
...
...
ml/backend/ggml/ggml/include/ggml-cpu.h
...
...
@@ -133,6 +133,11 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
#ifdef __cplusplus
}
#endif
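The declarations added here export the CPU backend's float conversion helpers from ggml-cpu.h. A minimal usage sketch, assuming a program linked against the ggml CPU backend:

    #include "ggml-cpu.h"

    int main(void) {
        float src[4] = {0.5f, 1.0f, -2.0f, 3.25f};
        ggml_fp16_t half[4];
        float back[4];

        // round-trip fp32 -> fp16 -> fp32 through the newly exported helpers
        ggml_cpu_fp32_to_fp16(src, half, 4);
        ggml_cpu_fp16_to_fp32(half, back, 4);
        return 0;
    }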
ml/backend/ggml/ggml/include/ggml-rpc.h
...
...
@@ -7,7 +7,7 @@ extern "C" {
#endif
-#define RPC_PROTO_MAJOR_VERSION    1
+#define RPC_PROTO_MAJOR_VERSION    2
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16
...
...
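Raising RPC_PROTO_MAJOR_VERSION from 1 to 2 means an rpc-server and its clients have to be rebuilt together. A hedged compile-time guard a client could add (illustrative only, not part of this commit):

    #include "ggml-rpc.h"

    #if RPC_PROTO_MAJOR_VERSION < 2
    #error "this build expects ggml RPC protocol major version 2 or newer"
    #endif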