OpenDAS / ollama

Commit f46df4e5 (unverified)
Authored May 13, 2025 by Jeffrey Morgan; committed by GitHub on May 13, 2025

llama: fix defrag patch to defragment when no slots are available (#10695)

Parent: c6bcdc42
Showing 2 changed files with 50 additions and 7 deletions (+50, -7)
llama/llama.cpp/src/llama-context.cpp  (+12, -6)
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch  (+38, -1)
llama/llama.cpp/src/llama-context.cpp @ f46df4e5
@@ -949,11 +949,14 @@ int llama_context::decode(llama_batch & inp_batch) {
         }
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
         }
 
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

@@ -1966,11 +1969,14 @@ void llama_context::opt_epoch_iter(
             n_outputs = ubatch.n_tokens;
 
             // TODO: not sure if this is needed
             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    GGML_ABORT("TODO: handle this error");
+                }
             }
 
             auto * gf = graph_init();
             auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch @ f46df4e5
@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
+ src/llama-context.cpp  |  18 ++++---
  src/llama-context.h    |   1 +
  src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
  src/llama-kv-cache.h   |  12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
 
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+             // TODO: not sure if this is needed
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                GGML_ABORT("TODO: handle this error");
++                kv_self->defrag_sched(-1.0f);
++                kv_self->update(*this);
++                if (!kv_self->find_slot(ubatch)) {
++                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                    GGML_ABORT("TODO: handle this error");
++                }
+             }
+ 
+             auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
 index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h
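The patch description above notes that a single bounded defragmentation pass can still leave too little contiguous space, so processing should continue in multiple batches until the cache is fully compacted. The llama-kv-cache.cpp changes that implement this are not visible on this page (the diff is truncated), so the sketch below only illustrates the general repeat-until-done loop; plan_moves, defrag_until_done, and max_moves_per_pass are hypothetical names, not the patch's actual code.

// Illustrative only: repeat bounded defrag passes until no moves remain.
// plan_moves() and max_moves_per_pass are hypothetical stand-ins for the
// real bookkeeping in llama-kv-cache.cpp, which is not shown on this page.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Move { size_t src, dst; };

// Hypothetical: compute up to `limit` cell moves that reduce fragmentation
// by sliding occupied cells toward the front of the buffer.
std::vector<Move> plan_moves(const std::vector<int> & cells, size_t limit) {
    std::vector<Move> moves;
    size_t dst = 0;
    for (size_t src = 0; src < cells.size() && moves.size() < limit; ++src) {
        if (cells[src] == 0) continue;   // skip free cells
        if (src != dst) moves.push_back({src, dst});
        ++dst;
    }
    return moves;
}

void defrag_until_done(std::vector<int> & cells, size_t max_moves_per_pass) {
    for (;;) {
        auto moves = plan_moves(cells, max_moves_per_pass);
        if (moves.empty()) break;        // cache is fully compacted
        for (const auto & m : moves) {   // one bounded "batch" of work
            cells[m.dst] = cells[m.src];
            cells[m.src] = 0;
        }
        std::printf("applied %zu moves this pass\n", moves.size());
    }
}

int main() {
    std::vector<int> cells = {1, 0, 2, 0, 0, 3, 0, 4};
    defrag_until_done(cells, /*max_moves_per_pass=*/2);
}

Bounding the work per pass keeps each defrag step cheap, while the outer loop guarantees the cache eventually ends up fully compacted rather than stopping after a single partial pass.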