Unverified Commit f46df4e5 authored by Jeffrey Morgan, committed by GitHub

llama: fix defrag patch to defragment when no slots are available (#10695)

parent c6bcdc42
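
The change itself is small: when `find_slot` fails because the KV cache has no usable slot for the current ubatch, the code now schedules an unconditional defragmentation (`defrag_sched(-1.0f)`; a negative threshold forces the defrag), applies it with `update(*this)`, and retries `find_slot` once before giving up. Below is a minimal sketch of that retry pattern; `mock_kv_cache` and its fields are invented stand-ins for llama.cpp's real KV-cache internals, which only model the control flow shown in the diff:

```cpp
#include <cstdio>

// Invented stand-in for llama.cpp's KV cache; models only the control flow
// of this commit, not the real data structures.
struct mock_kv_cache {
    int  free_contiguous = 0;     // largest free run of cells
    int  reclaimable     = 8;     // cells a defrag could consolidate (made up)
    bool defrag_pending  = false;

    bool find_slot(int n_tokens) { return free_contiguous >= n_tokens; }

    // Mirrors defrag_sched(-1.0f): a negative threshold forces a defrag
    // to run on the next update().
    void defrag_sched(float thold) { if (thold < 0.0f) defrag_pending = true; }

    void update() {
        if (defrag_pending) {
            free_contiguous += reclaimable;
            reclaimable      = 0;
            defrag_pending   = false;
        }
    }
};

int main() {
    mock_kv_cache kv;
    const int n_tokens = 4;

    // The retry pattern this commit adds to llama_context::decode():
    // if no slot is available, force a defrag, apply it, retry once.
    if (!kv.find_slot(n_tokens)) {
        kv.defrag_sched(-1.0f); // schedule an unconditional defrag
        kv.update();            // apply it
        if (!kv.find_slot(n_tokens)) {
            std::printf("failed to find KV cache slot for ubatch of size %d\n", n_tokens);
            return 1;
        }
    }
    std::printf("slot found (possibly after defrag)\n");
    return 0;
}
```

The real `update()` also builds and runs a compute graph to move tensor data between cells; the mock only captures the retry logic.
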
@@ -949,11 +949,14 @@ int llama_context::decode(llama_batch & inp_batch) {
         }
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
         }
 
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
@@ -1966,11 +1969,14 @@ void llama_context::opt_epoch_iter(
             n_outputs = ubatch.n_tokens;
 
             // TODO: not sure if this is needed
             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    GGML_ABORT("TODO: handle this error");
+                }
             }
 
             auto * gf = graph_init();
             auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
......
@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
+ src/llama-context.cpp  |  18 ++++---
  src/llama-context.h    |   1 +
  src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
  src/llama-kv-cache.h   |  12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
 
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+             // TODO: not sure if this is needed
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                GGML_ABORT("TODO: handle this error");
++                kv_self->defrag_sched(-1.0f);
++                kv_self->update(*this);
++                if (!kv_self->find_slot(ubatch)) {
++                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                    GGML_ABORT("TODO: handle this error");
++                }
+             }
+ 
+             auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
 index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h
......
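
The truncated `src/llama-kv-cache.cpp` portion of the patch is where the behavior described in the message above ("multiple batches of processing until everything is complete") lives. As a toy illustration of that idea only — the real code moves K/V tensor data between cache cells, not integers, and `defrag_pass`/`defrag_until_done` are invented names:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Toy cache: 0 = free cell, nonzero = used cell. Purely illustrative.
using cache = std::vector<int>;

// One bounded defrag pass: shift used cells toward the front, performing at
// most max_moves copies (mirroring the idea of a per-batch work limit).
static std::size_t defrag_pass(cache & c, std::size_t max_moves) {
    std::size_t moves = 0, dst = 0;
    for (std::size_t src = 0; src < c.size() && moves < max_moves; ++src) {
        if (c[src] == 0) continue;
        if (src != dst) { c[dst] = c[src]; c[src] = 0; ++moves; }
        ++dst;
    }
    return moves;
}

// "Multiple batches of processing until everything is complete": repeat
// bounded passes until a pass makes no more moves.
static void defrag_until_done(cache & c, std::size_t max_moves_per_pass) {
    while (defrag_pass(c, max_moves_per_pass) > 0) {}
}

int main() {
    cache c = {1, 0, 2, 0, 0, 3, 4, 0, 5};
    defrag_until_done(c, /*max_moves_per_pass=*/2); // needs multiple passes
    for (int v : c) std::printf("%d ", v);          // prints: 1 2 3 4 5 0 0 0 0
    std::printf("\n");
    return 0;
}
```

Capping the work done per pass keeps any single defrag step cheap, while the outer loop guarantees the cache ends fully compacted instead of stopping after one fixed-size batch.
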