OpenDAS / ollama · Commits · 20c5fd39

Unverified commit 20c5fd39, authored May 08, 2025 by Devon Rifkin, committed by GitHub on May 08, 2025

Merge branch 'main' into drifkin/array-head-count-simple

Parents: d2ee599d, 6e9a7a25
Changes: 156

Showing 20 changed files with 141 additions and 166 deletions (+141 -166)
llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch  +2 -2
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch  +10 -10
llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch  +6 -6
llama/patches/0013-remove-amx.patch  +7 -7
llama/patches/0014-fix-string-arr-kv-loading.patch  +1 -1
llama/patches/0015-ollama-debug-tensor.patch  +2 -2
llama/patches/0016-add-model-quantizations.patch  +0 -96
llama/patches/0016-add-ollama-vocab-for-grammar-support.patch  +3 -3
llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch  +38 -0
llama/sampling_ext.cpp  +0 -3
llama/sampling_ext.h  +0 -3
llm/llm_windows.go  +2 -1
llm/memory_test.go  +1 -1
llm/server.go  +25 -13
llm/server_test.go  +1 -1
ml/backend/ggml/ggml.go  +6 -8
ml/backend/ggml/ggml/include/ggml-cpu.h  +5 -0
ml/backend/ggml/ggml/include/ggml-rpc.h  +1 -1
ml/backend/ggml/ggml/include/ggml.h  +23 -3
ml/backend/ggml/ggml/src/CMakeLists.txt  +8 -5
llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 90679822..56043678 100644
index 5b3059c2..656b3eca 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -346,7 +346,7 @@ private:
@@ -349,7 +349,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
...
...
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
...
...
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0343ba8a..4b3e6a83 100644
index cd06ad91..77177c5e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
...
...
@@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
#if 0
// CPU defrag
//
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
...
...
@@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
...
...
@@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
#endif
return res;
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
...
...
@@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
...
...
@@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
}
{
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
...
...
@@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
...
...
@@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index baa03276..a59ff8fd 100644
index a50c4afa..30f84bfd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
...
...
@@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
#include "ggml-cpp.h"
@@ -180,7 +181,8 @@ private:
@@ -179,7 +180,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
...
...
llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f00700da..91d6a7d5 100644
index 43d9fc4f..4c0d3824 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
...
...
@@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644
endfunction()
ggml_add_backend(CPU)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
llama/patches/0013-remove-amx.patch
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:33:01 -0700
Date: Thu, 1 May 2025 15:05:08 -0700
Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
...
...
@@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 91d6a7d5..d6b393a2 100644
index 4c0d3824..79c26312 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
...
...
llama/patches/0014-fix-string-arr-kv-loading.patch
...
...
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 032019c9..ba37df35 100644
index 1306864e..d6515ff6 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
...
...
llama/patches/0015-ollama-debug-tensor.patch
...
...
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 432942bf..6d4abe4c 100644
index 34624cca..59bd3c62 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
...
...
@@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
...
...
llama/patches/0016-add-model-quantizations.patch
deleted 100644 → 0
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:39:32 -0700
Subject: [PATCH] add model quantizations
a temporary patch to add model quantization for
models not supported in llama.cpp
---
src/llama-arch.cpp | 17 +++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 2 ++
src/llama-quant.cpp | 4 ++++
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0568565f..dd01df60 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
+ { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
+ {
+ LLM_ARCH_MISTRAL3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ }
+ },
{
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 6a989034..b6227eeb 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -75,6 +75,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
+ LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d051696c..c8374159 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_MISTRAL3: break;
default: throw std::runtime_error("unsupported model architecture");
}
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_MISTRAL3:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 223e1f3f..8ae6dde8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+ // don't quantize vision stuff
+ quantize &= name.find("v.") == std::string::npos;
+ quantize &= name.find("mm.") == std::string::npos;
+
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
llama/patches/0021-add-ollama-vocab-for-grammar-support.patch → llama/patches/0016-add-ollama-vocab-for-grammar-support.patch
...
...
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index d1497985..b1a9dca3 100644
index c0a5f934..75731053 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
...
...
@@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
...
...
llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch
0 → 100644
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
llama/sampling_ext.cpp
...
...
@@ -19,9 +19,6 @@ struct common_sampler *common_sampler_cinit(const struct llama_model *model, str
    sparams.penalty_repeat = params->penalty_repeat;
    sparams.penalty_freq = params->penalty_freq;
    sparams.penalty_present = params->penalty_present;
    sparams.mirostat = params->mirostat;
    sparams.mirostat_tau = params->mirostat_tau;
    sparams.mirostat_eta = params->mirostat_eta;
    sparams.seed = params->seed;
    sparams.grammar = params->grammar;
    sparams.xtc_probability = 0.0;
...
...
llama/sampling_ext.h
...
...
@@ -20,9 +20,6 @@ extern "C"
        float penalty_repeat;
        float penalty_freq;
        float penalty_present;
        int32_t mirostat;
        float mirostat_tau;
        float mirostat_eta;
        uint32_t seed;
        char *grammar;
    };
...
...
llm/llm_windows.go
...
...
@@ -7,6 +7,7 @@ import (
const (
	CREATE_DEFAULT_ERROR_MODE   = 0x04000000
	ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
	CREATE_NO_WINDOW            = 0x08000000
)

var LlamaServerSysProcAttr = &syscall.SysProcAttr{
...
...
@@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
//
// Setting Above Normal priority class ensures when running as a "background service"
// with "programs" given best priority, we aren't starved of cpu cycles
	CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
	CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
}
llm/memory_test.go
...
...
@@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
	defer f.Close()

	inputLayerCount := 5

	tensors := []ggml.Tensor{
	tensors := []*ggml.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
...
...
llm/server.go
...
...
@@ -44,6 +44,7 @@ type LlamaServer interface {
	EstimatedVRAM() uint64 // Total VRAM across all GPUs
	EstimatedTotal() uint64
	EstimatedVRAMByGPU(gpuID string) uint64
	Pid() int
}
// llmServer is an instance of the llama.cpp server
...
...
@@ -216,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
		params = append(params, "--no-mmap")
	}

	if opts.UseMLock {
		params = append(params, "--mlock")
	}

	// TODO - NUMA support currently doesn't work properly

	params = append(params, "--parallel", strconv.Itoa(numParallel))
...
...
@@ -289,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
		params = append(params, "--mmproj", projectors[0])
	}

	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
	// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
	// without any LD_LIBRARY_PATH flags
	for {
...
...
@@ -324,21 +321,23 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
		pathEnv = "LD_LIBRARY_PATH"
	}

	var libraryPaths []string
	// Note: we always put our dependency paths first
	// since these are the exact version we compiled/linked against
	libraryPaths := []string{discover.LibOllamaPath}
	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
	}

	ggmlPaths := []string{discover.LibOllamaPath}
	if len(compatible) > 0 {
		c := compatible[0]
		if libpath, ok := libs[c]; ok {
			slog.Debug("adding gpu library", "path", libpath)
			libraryPaths = append(libraryPaths, libpath)
			libraryPaths = append([]string{libpath}, libraryPaths...)
			ggmlPaths = append(ggmlPaths, libpath)
		}
	}

	// Note: we always put the dependency path first
	// since this was the exact version we compiled/linked against
	if gpus[0].DependencyPath != nil {
		slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
		// assume gpus from the same library have the same dependency path
...
...
@@ -369,6 +368,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
	s.cmd.Stderr = s.status
	s.cmd.SysProcAttr = LlamaServerSysProcAttr

	s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))

	envWorkarounds := [][2]string{}
	for _, gpu := range gpus {
		envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
...
...
@@ -406,7 +407,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
	if envconfig.Debug() {
		filteredEnv := []string{}
		for _, ev := range s.cmd.Env {
			if strings.HasPrefix(ev, "CUDA_") ||
			if strings.HasPrefix(ev, "OLLAMA_") ||
				strings.HasPrefix(ev, "CUDA_") ||
				strings.HasPrefix(ev, "ROCR_") ||
				strings.HasPrefix(ev, "ROCM_") ||
				strings.HasPrefix(ev, "HIP_") ||
...
...
@@ -515,6 +517,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
		if errors.Is(err, context.DeadlineExceeded) {
			return ServerStatusNotResponding, errors.New("server not responding")
		}
		if strings.Contains(err.Error(), "connection refused") {
			return ServerStatusNotResponding, errors.New("connection refused")
		}
		return ServerStatusError, fmt.Errorf("health resp: %w", err)
	}
	defer resp.Body.Close()
...
...
@@ -635,6 +640,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
	}
}

func (s *llmServer) Pid() int {
	if s.cmd != nil && s.cmd.Process != nil {
		return s.cmd.Process.Pid
	}
	return -1
}

var grammarJSON = `
root   ::= object
value  ::= object | array | string | number | ("true" | "false" | "null") ws
...
...
@@ -998,17 +1010,17 @@ func (s *llmServer) Close() error {
	s.llamaModelLock.Unlock()

	if s.cmd != nil {
		slog.Debug("stopping llama server")
		slog.Debug("stopping llama server", "pid", s.Pid())
		if err := s.cmd.Process.Kill(); err != nil {
			return err
		}
		// if ProcessState is already populated, Wait already completed, no need to wait again
		if s.cmd.ProcessState == nil {
			slog.Debug("waiting for llama server to exit")
			slog.Debug("waiting for llama server to exit", "pid", s.Pid())
			<-s.done
		}

		slog.Debug("llama server stopped")
		slog.Debug("llama server stopped", "pid", s.Pid())
	}

	return nil
...
...
llm/server_test.go
...
...
@@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
// of a mess, and but it's good enough, until we can refactoring the
// Completion method to be more testable.
	ctx, cancel := context.WithCancel(context.Background())
	ctx, cancel := context.WithCancel(t.Context())
	s := &llmServer{
		sem: semaphore.NewWeighted(1), // required to prevent nil panic
	}
...
...
ml/backend/ggml/ggml.go
...
...
@@ -312,6 +312,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range meta.Tensors().Items() {
		t := t
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
			for i := range tts {
...
...
@@ -341,6 +342,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
			var s uint64
			for s < t.Size() {
				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
				if err := ctx.Err(); err != nil {
					return err
				}

				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					slog.Warn("file read error", "file", r.Name(), "error", err)
...
...
@@ -363,14 +369,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
		})
	}

	// start a goroutine to cancel the errgroup if the parent context is done
	go func() {
		<-ctx.Done()
		g.Go(func() error {
			return ctx.Err()
		})
	}()

	if err := g.Wait(); err != nil {
		return nil, err
	}
...
...
ml/backend/ggml/ggml/include/ggml-cpu.h
...
...
@@ -133,6 +133,11 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
#ifdef __cplusplus
}
#endif
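The four conversion helpers declared above share one calling convention: source pointer, destination pointer, and a count. A minimal usage sketch, not part of this commit, where the function name convert_example is illustrative and the final argument is assumed to be the number of elements:

#include "ggml-cpu.h"

// Round-trip a small fp32 buffer through bf16 using the helpers declared above.
void convert_example(void) {
    float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    ggml_bf16_t tmp[4];
    float dst[4];

    ggml_cpu_fp32_to_bf16(src, tmp, 4); // fp32 -> bf16
    ggml_cpu_bf16_to_fp32(tmp, dst, 4); // bf16 -> fp32
}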
ml/backend/ggml/ggml/include/ggml-rpc.h
...
...
@@ -7,7 +7,7 @@
extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION    1
#define RPC_PROTO_MAJOR_VERSION    2
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16
...
...
ml/backend/ggml/ggml/include/ggml.h
...
...
@@ -393,8 +393,8 @@ extern "C" {
// precision
    enum ggml_prec {
        GGML_PREC_DEFAULT,
        GGML_PREC_F32,
        GGML_PREC_DEFAULT =  0, // stored as ggml_tensor.op_params, 0 by default
        GGML_PREC_F32     = 10,
};
// model file types
...
...
@@ -481,6 +481,7 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_2D_DW,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
...
...
@@ -678,6 +679,9 @@ extern "C" {
    GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
    GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);

    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
...
...
@@ -1661,7 +1665,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

    // depthwise
    // depthwise (via im2col and mul_mat)
    GGML_API struct ggml_tensor * ggml_conv_2d_dw(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,  // convolution kernel
...
...
@@ -1673,6 +1677,22 @@ extern "C" {
            int                   d0, // dilation dimension 0
            int                   d1); // dilation dimension 1

    // Depthwise 2D convolution
    // may be faster than ggml_conv_2d_dw, but not available in all backends
    // a:   KW KH 1 C    convolution kernel
    // b:   W  H  C N    input data
    // res: W_out H_out C N
    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            int                   stride0,
            int                   stride1,
            int                   pad0,
            int                   pad1,
            int                   dilation0,
            int                   dilation1);

    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
...
...
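For reference, a minimal sketch, not part of this commit, of building the new direct depthwise convolution with the shapes documented above (kernel a of shape KW KH 1 C, input b of shape W H C N); the helper name depthwise_direct is illustrative:

#include "ggml.h"

// Builds (but does not compute) a ggml_conv_2d_dw_direct node
// with stride 1, no padding, and dilation 1 in both dimensions.
struct ggml_tensor * depthwise_direct(struct ggml_context * ctx,
                                      struct ggml_tensor * kernel,  // KW KH 1 C
                                      struct ggml_tensor * input) { // W  H  C N
    return ggml_conv_2d_dw_direct(ctx, kernel, input,
                                  /*stride0=*/1, /*stride1=*/1,
                                  /*pad0=*/0, /*pad1=*/0,
                                  /*dilation0=*/1, /*dilation1=*/1);
}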
ml/backend/ggml/ggml/src/CMakeLists.txt
...
...
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
    set(GGML_CPU_TAG_NAME ${tag_name})
    # other: OPENMP LLAMAFILE CPU_HBM
    foreach (feat NATIVE
                  SSE42
                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                  AMX_TILE AMX_INT8 AMX_BF16)
...
...
@@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
    endif()
    add_custom_target(ggml-cpu)
    ggml_add_cpu_backend_variant(sandybridge AVX)
    ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
    ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
    ggml_add_cpu_backend_variant(x64)
    ggml_add_cpu_backend_variant(sse42 SSE42)
    ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
    ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
    ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
endif()
...
...