OpenDAS / ollama · Commits · 0cefd46f

llama: update to commit de4c07f93 (#10655)

Unverified commit 0cefd46f, authored May 12, 2025 by Jeffrey Morgan; committed by GitHub on May 12, 2025.
Parent: ad035ad5

Changes: 113 · Showing 20 changed files with 2426 additions and 1109 deletions (+2426 −1109)
Makefile.sync                            +1    −1
llama/build-info.cpp                     +1    −1
llama/llama.cpp/.rsync-filter            +5    −5
llama/llama.cpp/common/common.cpp        +18   −1
llama/llama.cpp/common/common.h          +13   −5
llama/llama.cpp/common/sampling.cpp      +57   −50
llama/llama.cpp/include/llama.h          +51   −10
llama/llama.cpp/src/llama-adapter.cpp    +6    −0
llama/llama.cpp/src/llama-batch.cpp      +5    −1
llama/llama.cpp/src/llama-batch.h        +2    −1
llama/llama.cpp/src/llama-chat.cpp       +17   −7
llama/llama.cpp/src/llama-chat.h         +1    −0
llama/llama.cpp/src/llama-context.cpp    +401  −476
llama/llama.cpp/src/llama-context.h      +44   −33
llama/llama.cpp/src/llama-cparams.h      +2    −1
llama/llama.cpp/src/llama-graph.cpp      +20   −38
llama/llama.cpp/src/llama-graph.h        +12   −8
llama/llama.cpp/src/llama-kv-cache.cpp   +1485 −389
llama/llama.cpp/src/llama-kv-cache.h     +274  −81
llama/llama.cpp/src/llama-memory.h       +11   −1
Makefile.sync

 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5
+FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618

 .PHONY: help
 help:
...
llama/build-info.cpp

 int LLAMA_BUILD_NUMBER = 0;
-char const * LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
+char const * LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
 char const * LLAMA_COMPILER = "";
 char const * LLAMA_BUILD_TARGET = "";
llama/llama.cpp/.rsync-filter

@@ -10,11 +10,11 @@ include common/stb_image.*
 include include/
 include include/llama.*
 include include/llama-*.*
-include examples/
-include examples/llava/
-include examples/llava/clip.*
-include examples/llava/clip-impl.*
-include examples/llava/llava.*
+include tools/
+include tools/mtmd/
+include tools/mtmd/clip.*
+include tools/mtmd/clip-impl.*
+include tools/mtmd/llava.*
 include src/
 include src/llama.*
 include src/llama-*.*
...
llama/llama.cpp/common/common.cpp

@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ? params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
...
@@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn  = params.flash_attn;
     cparams.no_perf     = params.no_perf;
+    cparams.op_offload  = !params.no_op_offload;

     if (params.reranking) {
         cparams.embeddings = true;
...
@@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result  = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data  (result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
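
The new common_opt_dataset_init() helper turns a flat token stream into next-token-prediction pairs: each datapoint is a window of llama_n_ctx(ctx) tokens and its label is the same window shifted forward by one token. A minimal usage sketch, assuming an already-created context and a tokenized corpus; build_dataset and corpus_text are illustrative names, not part of this change:

    #include "common.h"

    #include <string>
    #include <vector>

    // hypothetical helper, shown only to illustrate how the new API is meant to be called
    static ggml_opt_dataset_t build_dataset(llama_context * ctx, const std::string & corpus_text) {
        // common_tokenize() is the existing tokenization helper from common.h
        std::vector<llama_token> tokens = common_tokenize(ctx, corpus_text, /*add_special*/ true);

        // a stride equal to the context size gives non-overlapping windows;
        // a smaller stride gives overlapping ones
        return common_opt_dataset_init(ctx, tokens, /*stride*/ llama_n_ctx(ctx));
    }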
llama/llama.cpp/common/common.h

@@ -66,7 +66,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
...
@@ -96,6 +95,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };

 // dimensionality reduction methods, used by cvector-generator
...
@@ -161,6 +161,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
...
@@ -323,7 +324,6 @@ struct common_params {
     bool ctx_shift        = true;  // context shift on inifinite text generation
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all       = false; // return logits for all tokens in the batch
     bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory
     bool verbose_prompt   = false; // print prompt tokens before generation
...
@@ -332,6 +332,7 @@ struct common_params {
     bool no_kv_offload    = false; // disable KV offloading
     bool warmup           = true;  // warmup run
     bool check_tensors    = false; // validate tensor data
+    bool no_op_offload    = false; // globally disable offload host tensor operations to device

     bool single_turn      = false; // single turn chat conversation
...
@@ -340,7 +341,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
     bool mmproj_use_gpu = true;  // use GPU for multimodal model
     bool no_mmproj      = false; // explicitly disable multimodal model
...
@@ -409,13 +410,14 @@ struct common_params {
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
+    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization

     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill
...
@@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
llama/llama.cpp/common/sampling.cpp

 #include "sampling.h"

 #include "common.h"
+#include "log.h"

 #include <cmath>
 #include <unordered_map>
...
@@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.logit_bias.data()));

     if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-        } else {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
-                        {
-                            std::vector<const char *> c_breakers;
-                            c_breakers.reserve(params.dry_sequence_breakers.size());
-                            for (const auto & str : params.dry_sequence_breakers) {
-                                c_breakers.push_back(str.c_str());
-                            }
-
-                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                        }
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
-                        break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
-            }
-        }
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char *> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto & str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
+                    break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
+            }
+        }

         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
...
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
...
@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
...
@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "dry",         COMMON_SAMPLER_TYPE_DRY },
         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
...
@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
         { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
         { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
...
@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         auto sampler = sampler_canonical_name_map.find(name);
         if (sampler != sampler_canonical_name_map.end()) {
             samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
-                }
+            continue;
+        }
+        if (allow_alt_names) {
+            sampler = sampler_alt_name_map.find(name);
+            if (sampler != sampler_alt_name_map.end()) {
+                samplers.push_back(sampler->second);
+                continue;
             }
         }
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
     }

     return samplers;
...
@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
...
@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         const auto sampler = sampler_name_map.find(c);
         if (sampler != sampler_name_map.end()) {
             samplers.push_back(sampler->second);
+        } else {
+            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
         }
     }
...
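
Because top-n-sigma is now an ordinary member of the sampler chain rather than a special case, an equivalent chain can also be assembled directly through the public llama.h sampler API. A minimal sketch combining top-n-sigma with a few other constraints; the parameter values are arbitrary examples, not defaults from this change:

    #include "llama.h"

    // build a short sampler chain ending in a probabilistic pick
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_top_k      (40));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p      (0.95f, /*min_keep*/ 1));
    llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.0f));
    llama_sampler_chain_add(chain, llama_sampler_init_temp_ext   (0.8f, /*dynatemp_range*/ 0.0f, /*dynatemp_exponent*/ 1.0f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist       (LLAMA_DEFAULT_SEED));

    // later: llama_token id = llama_sampler_sample(chain, ctx, -1);
    //        ...
    //        llama_sampler_free(chain);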
llama/llama.cpp/include/llama.h

@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
+#include "ggml-opt.h"

 #include <stddef.h>
 #include <stdint.h>
...
@@ -112,6 +113,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
     };

     enum llama_rope_type {
...
@@ -352,20 +354,19 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

-        // Abort callback
-        // if it returns true, execution of llama_decode() will be aborted
-        // currently works only with CPU execution
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
+        bool op_offload;  // whether to offload host tensor operations to device
         bool cross_attn;  // whether to use cross attention
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };

     // model quantization parameters
...
@@ -447,6 +448,10 @@ extern "C" {
                              size_t   n_paths,
              struct llama_model_params   params);

+    LLAMA_API void llama_model_save_to_file(
+            const struct llama_model * model,
+                            const char * path_model);
+
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");
...
@@ -930,14 +935,19 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);

-    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
-    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    // Process a batch of tokens.
+    // In contrast to llama_decode() - this call does not use KV cache.
+    // For encode-decoder contexts, processes the batch using the encoder.
+    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //   0 - success
     // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);

+    // Process a batch of tokens.
+    // Requires KV cache.
+    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
...
@@ -1434,6 +1444,37 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

+    //
+    // training
+    //
+
+    // function that returns whether or not a given tensor contains trainable parameters
+    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+
+    // always returns true
+    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+
+    struct llama_opt_params {
+        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+    };
+
+    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+    LLAMA_API void llama_opt_epoch(
+            struct llama_context    * lctx,
+            ggml_opt_dataset_t        dataset,
+            ggml_opt_result_t         result_train,
+            ggml_opt_result_t         result_eval,
+            int64_t                   idata_split,
+            ggml_opt_epoch_callback   callback_train,
+            ggml_opt_epoch_callback   callback_eval);
+
 #ifdef __cplusplus
 }
 #endif
...
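
The new training entry points are intended to be driven together with the common_opt_dataset_init() helper added in common.cpp above. A rough usage sketch, assuming `ctx`, `model`, and `dataset` already exist; the result/callback helpers come from ggml-opt.h, and the 90/10 train/eval split is only an illustrative choice, not something this header mandates:

    // a minimal sketch of one training epoch over a prepared dataset
    llama_opt_params lopt_params {
        /*n_ctx_train     =*/ 0,                          // 0: use the context size of `ctx`
        /*param_filter    =*/ llama_opt_param_filter_all, // train every F32 tensor
        /*param_filter_ud =*/ nullptr,
        /*get_opt_pars    =*/ ggml_opt_get_default_optimizer_params,
        /*get_opt_pars_ud =*/ nullptr,
    };
    llama_opt_init(ctx, model, lopt_params);

    // hold out the last 10% of the data for evaluation
    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * 9 / 10;

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval  = ggml_opt_result_init();

    llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
                    ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);

    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);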
llama/llama.cpp/src/llama-adapter.cpp

@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
...
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
             LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

             auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
             buft = ggml_backend_dev_buffer_type(cpu_dev);

             break;
...
llama/llama.cpp/src/llama-batch.cpp

@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
...
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
...
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
...
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;
...
@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             seq.push_back(new_seq);
             last_seq = &seq.back();
         }
+
         // keep shared prompts first at the end, then sort by length descending.
         std::sort(seq.begin(), seq.end(),
             [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
...
llama/llama.cpp/src/llama-batch.h

@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
...
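
At call sites the two-step from_batch() initialization is replaced by direct construction, as seen in llama_context::encode() further down. A one-line sketch of the before/after at a hypothetical call site:

    // before: default-construct, then fill
    //     llama_sbatch sbatch;
    //     sbatch.from_batch(batch, n_embd, /*simple_split*/ true, /*logits_all*/ true);

    // after: construct directly from the batch
    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /*simple_split*/ true, /*logits_all*/ true);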
llama/llama.cpp/src/llama-chat.cpp

@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3",            LLM_CHAT_TEMPLATE_MISTRAL_V3            },
     { "mistral-v3-tekken",     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN     },
     { "mistral-v7",            LLM_CHAT_TEMPLATE_MISTRAL_V7            },
+    { "mistral-v7-tekken",     LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN     },
     { "phi3",                  LLM_CHAT_TEMPLATE_PHI_3                 },
     { "phi4",                  LLM_CHAT_TEMPLATE_PHI_4                 },
     { "falcon3",               LLM_CHAT_TEMPLATE_FALCON_3              },
...
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
             } else {
-                ss << " " << content << "</s>";
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
...
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
...
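
The practical difference between v7 and v7-tekken is only the space after the [SYSTEM_PROMPT]/[INST] markers. A hedged sketch of rendering a prompt through the public template API; the message contents and the shown output are illustrative, not taken from this diff:

    #include "llama.h"

    #include <vector>

    // render two messages with the newly registered "mistral-v7-tekken" template name
    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };

    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template("mistral-v7-tekken", msgs, 2, /*add_ass*/ true,
                                          buf.data(), (int32_t) buf.size());

    // expected output (approximately):
    // v7:        "[SYSTEM_PROMPT] You are a helpful assistant.[/SYSTEM_PROMPT][INST] Hello![/INST]"
    // v7-tekken: "[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello![/INST]"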
llama/llama.cpp/src/llama-chat.h

@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
...
llama/llama.cpp/src/llama-context.cpp
View file @
0cefd46f
...
...
@@ -6,11 +6,9 @@
#include "llama-model.h"
#include "llama-kv-cache.h"
#include <cassert>
#include <cstring>
#include <stdexcept>
#include <cinttypes>
#include <cmath>
//
// llama_context
...
...
@@ -95,6 +93,7 @@ llama_context::llama_context(
}
cparams
.
n_ubatch
=
std
::
min
(
cparams
.
n_batch
,
params
.
n_ubatch
==
0
?
params
.
n_batch
:
params
.
n_ubatch
);
cparams
.
op_offload
=
params
.
op_offload
;
const
uint32_t
n_ctx_per_seq
=
cparams
.
n_ctx
/
cparams
.
n_seq_max
;
...
...
@@ -118,8 +117,6 @@ llama_context::llama_context(
__func__
,
n_ctx_per_seq
,
hparams
.
n_ctx_train
);
}
logits_all
=
params
.
logits_all
;
if
(
!
hparams
.
vocab_only
)
{
// GPU backends
for
(
auto
*
dev
:
model
.
devices
)
{
...
...
@@ -177,44 +174,13 @@ llama_context::llama_context(
}
// init the memory module
// TODO: for now, always create a unified KV cache
if
(
!
hparams
.
vocab_only
)
{
kv_self
.
reset
(
static_cast
<
llama_kv_cache_unified
*>
(
model
.
create_memory
()));
LLAMA_LOG_DEBUG
(
"%s: n_ctx = %u
\n
"
,
__func__
,
cparams
.
n_ctx
);
cparams
.
n_ctx
=
GGML_PAD
(
cparams
.
n_ctx
,
kv_self
->
get_padding
(
cparams
));
LLAMA_LOG_DEBUG
(
"%s: n_ctx = %u (padded)
\n
"
,
__func__
,
cparams
.
n_ctx
);
uint32_t
kv_size
=
cparams
.
n_ctx
;
ggml_type
type_k
=
params
.
type_k
;
ggml_type
type_v
=
params
.
type_v
;
if
(
llama_model_is_recurrent
(
&
model
))
{
// Mamba needs at least as many KV cells as there are sequences kept at any time
kv_size
=
std
::
max
((
uint32_t
)
1
,
params
.
n_seq_max
);
// it's probably best to keep as much precision as possible for the states
type_k
=
GGML_TYPE_F32
;
// required by ggml_ssm_conv for Mamba's conv_states
type_v
=
GGML_TYPE_F32
;
// required by ggml_ssm_scan for Mamba's ssm_states
}
GGML_ASSERT
(
hparams
.
n_embd_head_k
%
ggml_blck_size
(
type_k
)
==
0
);
GGML_ASSERT
(
hparams
.
n_embd_head_v
%
ggml_blck_size
(
type_v
)
==
0
);
if
(
!
kv_self
->
init
(
model
,
cparams
,
type_k
,
type_v
,
kv_size
,
cparams
.
offload_kqv
))
{
throw
std
::
runtime_error
(
"failed to initialize self-attention cache"
);
}
{
const
size_t
memory_size_k
=
kv_self
->
size_k_bytes
();
const
size_t
memory_size_v
=
kv_self
->
size_v_bytes
();
llama_memory_params
params_mem
=
{
/*.type_k =*/
params
.
type_k
,
/*.type_v =*/
params
.
type_v
,
};
LLAMA_LOG_INFO
(
"%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB
\n
"
,
__func__
,
(
float
)(
memory_size_k
+
memory_size_v
)
/
(
1024.0
f
*
1024.0
f
),
ggml_type_name
(
type_k
),
(
float
)
memory_size_k
/
(
1024.0
f
*
1024.0
f
),
ggml_type_name
(
type_v
),
(
float
)
memory_size_v
/
(
1024.0
f
*
1024.0
f
));
}
memory
.
reset
(
model
.
create_memory
(
params_mem
,
cparams
));
}
// init backends
...
...
@@ -278,7 +244,7 @@ llama_context::llama_context(
}
}
sched
.
reset
(
ggml_backend_sched_new
(
backend_ptrs
.
data
(),
backend_buft
.
data
(),
backend_ptrs
.
size
(),
max_nodes
,
pipeline_parallel
));
sched
.
reset
(
ggml_backend_sched_new
(
backend_ptrs
.
data
(),
backend_buft
.
data
(),
backend_ptrs
.
size
(),
max_nodes
,
pipeline_parallel
,
cparams
.
op_offload
));
if
(
pipeline_parallel
)
{
LLAMA_LOG_INFO
(
"%s: pipeline parallelism enabled (n_copies=%d)
\n
"
,
__func__
,
ggml_backend_sched_get_n_copies
(
sched
.
get
()));
...
...
@@ -286,7 +252,7 @@ llama_context::llama_context(
}
// reserve worst-case graph
if
(
!
hparams
.
vocab_only
)
{
if
(
!
hparams
.
vocab_only
&&
memory
)
{
const
uint32_t
n_seqs
=
1
;
// TODO: worst-case number of sequences
const
uint32_t
n_tokens
=
std
::
min
(
cparams
.
n_ctx
,
cparams
.
n_ubatch
);
...
...
@@ -305,7 +271,9 @@ llama_context::llama_context(
int
n_nodes_tg
=
-
1
;
// simulate full KV cache
kv_self
->
n
=
kv_self
->
size
;
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
set_full
();
cross
.
v_embd
.
clear
();
...
...
@@ -391,7 +359,9 @@ llama_context::llama_context(
}
}
llama_context
::~
llama_context
()
=
default
;
llama_context
::~
llama_context
()
{
ggml_opt_free
(
opt_ctx
);
}
void
llama_context
::
synchronize
()
{
ggml_backend_sched_synchronize
(
sched
.
get
());
...
...
@@ -427,6 +397,18 @@ const llama_model & llama_context::get_model() const {
return
model
;
}
const
llama_cparams
&
llama_context
::
get_cparams
()
const
{
return
cparams
;
}
ggml_backend_sched_t
llama_context
::
get_sched
()
const
{
return
sched
.
get
();
}
ggml_context
*
llama_context
::
get_ctx_compute
()
const
{
return
ctx_compute
.
get
();
}
uint32_t
llama_context
::
n_ctx
()
const
{
return
cparams
.
n_ctx
;
}
...
...
@@ -456,318 +438,44 @@ uint32_t llama_context::n_threads_batch() const {
}
llama_kv_cache
*
llama_context
::
get_kv_self
()
{
return
kv_self
.
get
();
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
return
kv_self
;
}
const
llama_kv_cache
*
llama_context
::
get_kv_self
()
const
{
return
kv_self
.
get
();
}
ggml_tensor
*
llama_context
::
build_rope_shift
(
ggml_context
*
ctx0
,
ggml_tensor
*
cur
,
ggml_tensor
*
shift
,
ggml_tensor
*
factors
,
float
freq_base
,
float
freq_scale
)
const
{
const
auto
&
n_ctx_orig
=
cparams
.
n_ctx_orig_yarn
;
const
auto
&
yarn_ext_factor
=
cparams
.
yarn_ext_factor
;
const
auto
&
yarn_beta_fast
=
cparams
.
yarn_beta_fast
;
const
auto
&
yarn_beta_slow
=
cparams
.
yarn_beta_slow
;
const
auto
&
hparams
=
model
.
hparams
;
const
auto
&
n_rot
=
hparams
.
n_rot
;
const
auto
&
rope_type
=
hparams
.
rope_type
;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const
float
yarn_attn_factor
=
model
.
arch
==
LLM_ARCH_DEEPSEEK2
?
1.0
f
/
(
1.0
f
+
0.1
f
*
logf
(
1.0
f
/
freq_scale
))
:
cparams
.
yarn_attn_factor
;
ggml_tensor
*
tmp
;
if
(
ggml_is_quantized
(
cur
->
type
))
{
// dequantize to f32 -> RoPE -> quantize back
tmp
=
ggml_cast
(
ctx0
,
cur
,
GGML_TYPE_F32
);
tmp
=
ggml_rope_ext
(
ctx0
,
tmp
,
shift
,
factors
,
n_rot
,
rope_type
,
n_ctx_orig
,
freq_base
,
freq_scale
,
yarn_ext_factor
,
yarn_attn_factor
,
yarn_beta_fast
,
yarn_beta_slow
);
tmp
=
ggml_cpy
(
ctx0
,
tmp
,
cur
);
}
else
{
// we rotate only the first n_rot dimensions
tmp
=
ggml_rope_ext_inplace
(
ctx0
,
cur
,
shift
,
factors
,
n_rot
,
rope_type
,
n_ctx_orig
,
freq_base
,
freq_scale
,
yarn_ext_factor
,
yarn_attn_factor
,
yarn_beta_fast
,
yarn_beta_slow
);
}
return
tmp
;
}
class
llm_graph_input_k_shift
:
public
llm_graph_input_i
{
public:
llm_graph_input_k_shift
(
const
llama_kv_cache_unified
*
kv_self
)
:
kv_self
(
kv_self
)
{}
virtual
~
llm_graph_input_k_shift
()
=
default
;
void
set_input
(
const
llama_ubatch
*
ubatch
)
override
;
ggml_tensor
*
k_shift
;
// I32 [kv_size]
const
llama_kv_cache_unified
*
kv_self
;
};
void
llm_graph_input_k_shift
::
set_input
(
const
llama_ubatch
*
ubatch
)
{
GGML_UNUSED
(
ubatch
);
if
(
k_shift
)
{
assert
(
ggml_backend_buffer_is_host
(
k_shift
->
buffer
));
int32_t
*
data
=
(
int32_t
*
)
k_shift
->
data
;
for
(
uint32_t
i
=
0
;
i
<
kv_self
->
size
;
++
i
)
{
data
[
i
]
=
kv_self
->
cells
[
i
].
delta
;
}
}
}
llm_graph_result_ptr
llama_context
::
build_kv_self_shift
(
ggml_context
*
ctx0
,
ggml_cgraph
*
gf
)
const
{
auto
res
=
std
::
make_unique
<
llm_graph_result
>
();
const
auto
&
hparams
=
model
.
hparams
;
const
auto
&
n_layer
=
hparams
.
n_layer
;
const
auto
&
n_embd_head_k
=
hparams
.
n_embd_head_k
;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
//GGML_ASSERT(kv_self->size == n_ctx);
auto
inp
=
std
::
make_unique
<
llm_graph_input_k_shift
>
(
kv_self
.
get
());
inp
->
k_shift
=
ggml_new_tensor_1d
(
ctx0
,
GGML_TYPE_I32
,
cparams
.
n_ctx
);
ggml_set_input
(
inp
->
k_shift
);
for
(
uint32_t
il
=
0
;
il
<
n_layer
;
++
il
)
{
const
int64_t
n_head_kv
=
hparams
.
n_head_kv
(
il
);
const
int64_t
n_embd_k_gqa
=
hparams
.
n_embd_k_gqa
(
il
);
const
bool
is_swa
=
hparams
.
is_swa
(
il
);
// note: the swa rope params could become part of the cparams in the future
// if we decide to make them configurable, like the non-sliding ones
const
float
freq_base_l
=
is_swa
?
hparams
.
rope_freq_base_train_swa
:
cparams
.
rope_freq_base
;
const
float
freq_scale_l
=
is_swa
?
hparams
.
rope_freq_scale_train_swa
:
cparams
.
rope_freq_scale
;
ggml_tensor
*
rope_factors
=
kv_self
->
cbs
.
get_rope_factors
(
n_ctx_per_seq
(),
il
);
ggml_tensor
*
k
=
ggml_view_3d
(
ctx0
,
kv_self
->
k_l
[
il
],
n_embd_head_k
,
n_head_kv
,
kv_self
->
size
,
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_head_k
),
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
0
);
ggml_tensor
*
cur
=
build_rope_shift
(
ctx0
,
k
,
inp
->
k_shift
,
rope_factors
,
freq_base_l
,
freq_scale_l
);
ggml_build_forward_expand
(
gf
,
cur
);
}
res
->
add_input
(
std
::
move
(
inp
));
return
res
;
}
llm_graph_result_ptr
llama_context
::
build_kv_self_defrag
(
ggml_context
*
ctx0
,
ggml_cgraph
*
gf
,
const
std
::
vector
<
struct
llama_kv_defrag_move
>
&
moves
)
const
{
auto
res
=
std
::
make_unique
<
llm_graph_result
>
();
const
auto
&
hparams
=
model
.
hparams
;
#if 0
// CPU defrag
//
// TODO: optimizations are possible:
// - multiple threads
// - avoid copying to the host memory when already there
//
// likely not worth the effort, as we have ggml_graph based defrag
//
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t kv_size = size;
std::vector<uint8_t> buf_k;
std::vector<uint8_t> buf_v;
for (uint32_t il = 0; il < n_layer; ++il) {
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
const size_t v_size_el = ggml_type_size(v_l[il]->type);
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
buf_k.resize(k_size);
buf_v.resize(v_size);
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
// batch move [i, i+nm) to [id, id+nm)
// note: cells can move only to a lower index
for (uint32_t i = 0; i < n_kv; ++i) {
const uint32_t id = ids[i];
if (i == id || id == n_kv) {
continue;
}
uint32_t nm = 1;
while (i + nm < n_kv && ids[i + nm] == id + nm) {
nm++;
}
// move keys
{
const int64_t os = i*k_size_row;
const int64_t od = id*k_size_row;
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
}
// move values (note: they are transposed)
{
const int64_t os = i;
const int64_t od = id;
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
}
}
i += nm - 1;
}
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
for
(
const
auto
&
move
:
moves
)
{
for
(
uint32_t
il
=
0
;
il
<
hparams
.
n_layer
;
++
il
)
{
// NOLINT
const
int64_t
n_embd_k_gqa
=
hparams
.
n_embd_k_gqa
(
il
);
const
int64_t
n_embd_v_gqa
=
hparams
.
n_embd_v_gqa
(
il
);
ggml_tensor
*
view_k_src
=
ggml_view_2d
(
ctx0
,
kv_self
->
k_l
[
il
],
n_embd_k_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
*
move
.
src
));
ggml_tensor
*
view_k_dst
=
ggml_view_2d
(
ctx0
,
kv_self
->
k_l
[
il
],
n_embd_k_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
*
move
.
dst
));
ggml_tensor
*
view_v_src
;
ggml_tensor
*
view_v_dst
;
if
(
cparams
.
flash_attn
)
{
// NOTE: the V cache is not transposed when using flash attention
view_v_src
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
n_embd_v_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
*
move
.
src
));
view_v_dst
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
n_embd_v_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
*
move
.
dst
));
}
else
{
view_v_src
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
move
.
len
,
n_embd_v_gqa
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
kv_self
->
size
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
move
.
src
));
view_v_dst
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
move
.
len
,
n_embd_v_gqa
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
kv_self
->
size
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
move
.
dst
));
}
ggml_build_forward_expand
(
gf
,
ggml_cpy
(
ctx0
,
view_k_src
,
view_k_dst
));
ggml_build_forward_expand
(
gf
,
ggml_cpy
(
ctx0
,
view_v_src
,
view_v_dst
));
}
}
#endif
return
res
;
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
return
kv_self
;
}
void
llama_context
::
kv_self_update
()
{
auto
&
kv
=
kv_self
;
bool
need_reserve
=
false
;
if
(
kv
->
has_shift
)
{
if
(
!
kv
->
get_can_shift
())
{
GGML_ABORT
(
"The current context does not support K-shift"
);
}
LLAMA_LOG_DEBUG
(
"%s: applying K-shift
\n
"
,
__func__
);
// apply K-shift if needed
if
(
model
.
hparams
.
rope_type
!=
LLAMA_ROPE_TYPE_NONE
)
{
ggml_backend_sched_reset
(
sched
.
get
());
auto
*
gf
=
graph_init
();
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
auto
res
=
build_kv_self_shift
(
ctx_compute
.
get
(),
gf
);
need_reserve
=
kv_self
->
update
(
*
this
);
ggml_backend_sched_alloc_graph
(
sched
.
get
(),
gf
);
// reserve a worst case graph if needed
if
(
need_reserve
)
{
LLAMA_LOG_DEBUG
(
"%s: reserving a worst case graph
\n
"
,
__func__
);
res
->
set_inputs
(
nullptr
);
// build worst-case graph
uint32_t
n_seqs
=
1
;
// TODO: worst-case number of sequences
uint32_t
n_tokens
=
std
::
min
(
cparams
.
n_ctx
,
cparams
.
n_ubatch
);
graph_compute
(
gf
,
false
);
}
{
kv
->
has_shift
=
false
;
for
(
uint32_t
i
=
0
;
i
<
kv
->
size
;
++
i
)
{
kv
->
cells
[
i
].
delta
=
0
;
}
}
}
// simulate full KV cache
kv_self
->
set_full
();
// defragment the KV cache if needed
if
(
kv
->
do_defrag
)
{
LLAMA_LOG_DEBUG
(
"%s: defragmenting KV cache
\n
"
,
__func__
);
const
uint32_t
n_max_nodes
=
graph_max_nodes
();
const
uint32_t
max_moves
=
(
n_max_nodes
-
2
*
model
.
hparams
.
n_layer
)
/
(
6
*
model
.
hparams
.
n_layer
);
if
(
!
kv
->
defrag_prepare
(
n_max_nodes
))
{
LLAMA_LOG_ERROR
(
"%s: failed to prepare defragmentation
\n
"
,
__func__
);
return
;
}
llama_token
token
=
model
.
vocab
.
token_bos
();
// not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_ubatch
ubatch
=
{
true
,
n_tokens
,
n_tokens
/
n_seqs
,
n_seqs
,
&
token
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
};
for
(
std
::
size_t
i
=
0
;
i
<
kv_self
->
defrag_info
.
moves
.
size
();
i
+=
max_moves
)
{
std
::
vector
<
struct
llama_kv_defrag_move
>
chunk
;
auto
end
=
std
::
min
(
i
+
max_moves
,
kv_self
->
defrag_info
.
moves
.
size
());
chunk
.
assign
(
kv_self
->
defrag_info
.
moves
.
begin
()
+
i
,
kv_self
->
defrag_info
.
moves
.
begin
()
+
end
);
auto
*
gf
=
graph_init
();
graph_build
(
ctx_compute
.
get
(),
gf
,
ubatch
,
LLM_GRAPH_TYPE_DEFAULT
);
ggml_backend_sched_reset
(
sched
.
get
());
auto
*
gf
=
graph_init
();
auto
res
=
build_kv_self_defrag
(
ctx_compute
.
get
(),
gf
,
chunk
);
ggml_backend_sched_alloc_graph
(
sched
.
get
(),
gf
);
res
->
set_inputs
(
nullptr
);
graph_compute
(
gf
,
false
);
// initialize scheduler with the worst-case graph
ggml_backend_sched_reset
(
sched
.
get
());
if
(
!
ggml_backend_sched_reserve
(
sched
.
get
(),
gf
))
{
LLAMA_LOG_ERROR
(
"%s: failed to allocate compute buffers
\n
"
,
__func__
);
}
kv
->
do_defrag
=
false
;
}
}
...
...
@@ -776,9 +484,6 @@ enum llama_pooling_type llama_context::pooling_type() const {
}
float
*
llama_context
::
get_logits
()
{
// reorder logits for backward compatibility
output_reorder
();
return
logits
;
}
...
...
@@ -821,9 +526,6 @@ float * llama_context::get_logits_ith(int32_t i) {
}
float
*
llama_context
::
get_embeddings
()
{
// reorder embeddings for backward compatibility
output_reorder
();
return
embd
;
}
...
...
@@ -979,8 +681,8 @@ int llama_context::encode(llama_batch & inp_batch) {
}
// temporary allocate memory for the input batch if needed
//
TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
kv_self
->
pos_max
()
+
1
);
//
note: during encode, we always pass the full sequence starting from pos = 0
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
0
);
const
llama_batch
&
batch
=
batch_allocr
.
batch
;
const
int32_t
n_tokens
=
batch
.
n_tokens
;
...
...
@@ -1005,11 +707,13 @@ int llama_context::encode(llama_batch & inp_batch) {
t_compute_start_us
=
ggml_time_us
();
}
embd_seq
.
clear
();
n_queued_tokens
+=
n_tokens
;
const
int64_t
n_embd
=
hparams
.
n_embd
;
sbatch
.
from_
batch
(
batch
,
batch
.
n_embd
,
/* simple_split */
true
,
/* logits_all */
true
);
llama_sbatch
sbatch
=
llama_s
batch
(
batch
,
batch
.
n_embd
,
/* simple_split */
true
,
/* logits_all */
true
);
const
llama_ubatch
ubatch
=
sbatch
.
split_simple
(
n_tokens
);
...
...
@@ -1066,12 +770,12 @@ int llama_context::encode(llama_batch & inp_batch) {
ggml_backend_t
backend_embd
=
ggml_backend_sched_get_tensor_backend
(
sched
.
get
(),
t_embd
);
GGML_ASSERT
(
backend_embd
!=
nullptr
);
GGML_ASSERT
(
embd
!=
nullptr
);
switch
(
cparams
.
pooling_type
)
{
case
LLAMA_POOLING_TYPE_NONE
:
{
// extract token embeddings
GGML_ASSERT
(
embd
!=
nullptr
);
GGML_ASSERT
(
n_tokens
*
n_embd
<=
(
int64_t
)
embd_size
);
ggml_backend_tensor_get_async
(
backend_embd
,
t_embd
,
embd
,
0
,
n_tokens
*
n_embd
*
sizeof
(
float
));
}
break
;
...
...
@@ -1096,11 +800,18 @@ int llama_context::encode(llama_batch & inp_batch) {
}
break
;
case
LLAMA_POOLING_TYPE_RANK
:
{
// TODO: this likely should be the same logic as in llama_decoder_internal, but better to
// wait for an encoder model that requires this pooling type in order to test it
// https://github.com/ggerganov/llama.cpp/pull/9510
GGML_ABORT
(
"RANK pooling not implemented yet"
);
}
// extract the rerank score - a single float per sequence
auto
&
embd_seq_out
=
embd_seq
;
for
(
uint32_t
s
=
0
;
s
<
ubatch
.
n_seqs
;
++
s
)
{
const
llama_seq_id
seq_id
=
ubatch
.
seq_id
[
s
][
0
];
if
(
embd_seq_out
.
find
(
seq_id
)
!=
embd_seq_out
.
end
())
{
continue
;
}
embd_seq_out
[
seq_id
].
resize
(
1
);
ggml_backend_tensor_get_async
(
backend_embd
,
t_embd
,
embd_seq_out
[
seq_id
].
data
(),
(
seq_id
)
*
sizeof
(
float
),
sizeof
(
float
));
}
}
break
;
case
LLAMA_POOLING_TYPE_UNSPECIFIED
:
{
GGML_ABORT
(
"unknown pooling type"
);
...
...
@@ -1138,14 +849,21 @@ int llama_context::encode(llama_batch & inp_batch) {
}
int
llama_context
::
decode
(
llama_batch
&
inp_batch
)
{
if
(
!
memory
)
{
LLAMA_LOG_WARN
(
"%s: cannot decode batches with this context (use llama_encode() instead)
\n
"
,
__func__
);
return
encode
(
inp_batch
);
}
if
(
inp_batch
.
n_tokens
==
0
)
{
LLAMA_LOG_ERROR
(
"%s: n_tokens == 0
\n
"
,
__func__
);
return
-
1
;
}
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
// temporary allocate memory for the input batch if needed
// TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
kv_self
->
pos_max
()
+
1
);
// TODO: this is incorrect for multiple sequences because
get_
pos_max() is the maximum across all sequences
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
kv_self
->
get_
pos_max
()
+
1
);
const
llama_batch
&
batch
=
batch_allocr
.
batch
;
...
...
@@ -1156,7 +874,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const
int64_t
n_tokens_all
=
batch
.
n_tokens
;
const
int64_t
n_embd
=
hparams
.
n_embd
;
llama_kv_cache_guard
kv_guard
(
kv_self
.
get
()
);
llama_kv_cache_guard
kv_guard
(
kv_self
);
GGML_ASSERT
((
!
batch
.
token
&&
batch
.
embd
)
||
(
batch
.
token
&&
!
batch
.
embd
));
// NOLINT
...
...
@@ -1190,18 +908,14 @@ int llama_context::decode(llama_batch & inp_batch) {
for
(
uint32_t
i
=
0
;
i
<
n_tokens_all
;
++
i
)
{
n_outputs_all
+=
batch
.
logits
[
i
]
!=
0
;
}
}
else
if
(
logits_all
||
embd_pooled
)
{
}
else
if
(
embd_pooled
)
{
n_outputs_all
=
n_tokens_all
;
}
else
{
// keep last output only
n_outputs_all
=
1
;
}
const
bool
logits_all
=
n_outputs_all
==
n_tokens_all
;
sbatch
.
from_batch
(
batch
,
batch
.
n_embd
,
/* simple_split */
!
kv_self
->
recurrent
,
/* logits_all */
logits_all
);
llama_sbatch
sbatch
=
kv_self
->
sbatch_init
(
batch
,
/* logits_all */
n_outputs_all
==
n_tokens_all
);
// reserve output buffer
if
(
output_reserve
(
n_outputs_all
)
<
n_outputs_all
)
{
...
...
@@ -1215,22 +929,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t
n_outputs_prev
=
0
;
while
(
sbatch
.
n_tokens
>
0
)
{
llama_ubatch
ubatch
=
llama_ubatch
();
const
auto
&
n_ubatch
=
cparams
.
n_ubatch
;
if
(
kv_self
->
recurrent
)
{
if
(
embd_pooled
)
{
// Pooled embeddings cannot be split across ubatches (yet)
ubatch
=
sbatch
.
split_seq
(
cparams
.
n_ubatch
);
}
else
{
// recurrent model architectures are easier to implement
// with equal-length sequences
ubatch
=
sbatch
.
split_equal
(
cparams
.
n_ubatch
);
}
}
else
{
ubatch
=
sbatch
.
split_simple
(
n_ubatch
);
}
llama_ubatch
ubatch
=
kv_self
->
ubatch_next
(
sbatch
,
cparams
.
n_ubatch
,
embd_pooled
);
// count the outputs in this u_batch
{
...
...
@@ -1250,27 +949,12 @@ int llama_context::decode(llama_batch & inp_batch) {
}
// find KV slot
{
if
(
!
kv_self
->
find_slot
(
ubatch
))
{
kv_self
->
defrag
();
kv_self_update
();
if
(
!
kv_self
->
find_slot
(
ubatch
))
{
LLAMA_LOG_WARN
(
"%s: failed to find KV cache slot for ubatch of size %d
\n
"
,
__func__
,
ubatch
.
n_tokens
);
return
1
;
}
}
if
(
!
kv_self
->
find_slot
(
ubatch
))
{
LLAMA_LOG_WARN
(
"%s: failed to find KV cache slot for ubatch of size %d
\n
"
,
__func__
,
ubatch
.
n_tokens
);
if
(
!
kv_self
->
recurrent
)
{
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
const
uint32_t
pad
=
kv_self
->
get_padding
(
cparams
);
kv_self
->
n
=
std
::
min
(
kv_self
->
size
,
std
::
max
(
pad
,
GGML_PAD
(
kv_self
->
cell_max
(),
pad
)));
}
return
1
;
}
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head);
ggml_backend_sched_reset
(
sched
.
get
());
ggml_backend_sched_set_eval_callback
(
sched
.
get
(),
cparams
.
cb_eval
,
cparams
.
cb_eval_user_data
);
...
...
@@ -1384,43 +1068,68 @@ int llama_context::decode(llama_batch & inp_batch) {
// finalize the batch processing
kv_guard
.
commit
();
// set to total number of outputs in the batch, for use in llama_get_logits_ith
n_outputs
=
n_outputs_all
;
// set output mappings
{
bool
sorted_output
=
true
;
GGML_ASSERT
(
sbatch
.
out_ids
.
size
()
==
(
size_t
)
n_outputs_all
);
auto
&
out_ids
=
sbatch
.
out_ids
;
GGML_ASSERT
(
out_ids
.
size
()
==
(
size_t
)
n_outputs_all
);
for
(
int64_t
i
=
0
;
i
<
n_outputs_all
;
++
i
)
{
int64_t
out_id
=
sbatch
.
out_ids
[
i
];
int64_t
out_id
=
out_ids
[
i
];
output_ids
[
out_id
]
=
i
;
if
(
out_id
!=
i
)
{
sorted_output
=
false
;
}
}
if
(
sorted_output
)
{
sbatch
.
out_ids
.
clear
();
// make the outputs have the same order they had in the user-provided batch
// note: this is mostly relevant for recurrent models atm
if
(
!
sorted_output
)
{
const
uint32_t
n_vocab
=
model
.
hparams
.
n_vocab
;
const
uint32_t
n_embd
=
model
.
hparams
.
n_embd
;
GGML_ASSERT
((
size_t
)
n_outputs
==
out_ids
.
size
());
// TODO: is there something more efficient which also minimizes swaps?
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
for
(
int32_t
i
=
0
;
i
<
n_outputs
-
1
;
++
i
)
{
int32_t
j_min
=
i
;
for
(
int32_t
j
=
i
+
1
;
j
<
n_outputs
;
++
j
)
{
if
(
out_ids
[
j
]
<
out_ids
[
j_min
])
{
j_min
=
j
;
}
}
if
(
j_min
==
i
)
{
continue
;
}
std
::
swap
(
out_ids
[
i
],
out_ids
[
j_min
]);
if
(
logits_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_vocab
;
k
++
)
{
std
::
swap
(
logits
[
i
*
n_vocab
+
k
],
logits
[
j_min
*
n_vocab
+
k
]);
}
}
if
(
embd_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_embd
;
k
++
)
{
std
::
swap
(
embd
[
i
*
n_embd
+
k
],
embd
[
j_min
*
n_embd
+
k
]);
}
}
}
std
::
fill
(
output_ids
.
begin
(),
output_ids
.
end
(),
-
1
);
for
(
int32_t
i
=
0
;
i
<
n_outputs
;
++
i
)
{
output_ids
[
out_ids
[
i
]]
=
i
;
}
}
}
// set to total number of outputs in the batch, for use in llama_get_logits_ith
n_outputs
=
n_outputs_all
;
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();
// decide if we need to defrag the kv cache
if
(
cparams
.
causal_attn
&&
cparams
.
defrag_thold
>
0.0
f
)
{
// - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
const
float
fragmentation
=
kv_self
->
n
>=
2048
?
std
::
max
(
0.0
f
,
1.0
f
-
float
(
kv_self
->
used
+
kv_self
->
get_padding
(
cparams
))
/
float
(
kv_self
->
n
))
:
0.0
f
;
// queue defragmentation for next llama_kv_cache_update
if
(
fragmentation
>
cparams
.
defrag_thold
)
{
LLAMA_LOG_DEBUG
(
"%s: fragmentation: %.2f - requesting defrag
\n
"
,
__func__
,
fragmentation
);
kv_self
->
defrag
();
}
if
(
cparams
.
defrag_thold
>
0.0
f
)
{
kv_self
->
defrag_sched
(
cparams
.
defrag_thold
);
}
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
...
...
@@ -1505,44 +1214,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
return
n_outputs_max
;
}
void
llama_context
::
output_reorder
()
{
auto
&
out_ids
=
sbatch
.
out_ids
;
if
(
!
out_ids
.
empty
())
{
const
uint32_t
n_vocab
=
model
.
hparams
.
n_vocab
;
const
uint32_t
n_embd
=
model
.
hparams
.
n_embd
;
GGML_ASSERT
((
size_t
)
n_outputs
==
out_ids
.
size
());
// TODO: is there something more efficient which also minimizes swaps?
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
for
(
int32_t
i
=
0
;
i
<
n_outputs
-
1
;
++
i
)
{
int32_t
j_min
=
i
;
for
(
int32_t
j
=
i
+
1
;
j
<
n_outputs
;
++
j
)
{
if
(
out_ids
[
j
]
<
out_ids
[
j_min
])
{
j_min
=
j
;
}
}
if
(
j_min
==
i
)
{
continue
;
}
std
::
swap
(
out_ids
[
i
],
out_ids
[
j_min
]);
if
(
logits_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_vocab
;
k
++
)
{
std
::
swap
(
logits
[
i
*
n_vocab
+
k
],
logits
[
j_min
*
n_vocab
+
k
]);
}
}
if
(
embd_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_embd
;
k
++
)
{
std
::
swap
(
embd
[
i
*
n_embd
+
k
],
embd
[
j_min
*
n_embd
+
k
]);
}
}
}
std
::
fill
(
output_ids
.
begin
(),
output_ids
.
end
(),
-
1
);
for
(
int32_t
i
=
0
;
i
<
n_outputs
;
++
i
)
{
output_ids
[
out_ids
[
i
]]
=
i
;
}
out_ids
.
clear
();
}
}
//
// graph
//
...
...
@@ -1579,7 +1250,7 @@ llm_graph_result_ptr llama_context::graph_build(
/*.backend_cpu =*/
backend_cpu
,
/*.cvec =*/
&
cvec
,
/*.loras =*/
&
loras
,
/*.memory =*/
kv_self
.
get
(),
/*.memory =*/
memory
.
get
(),
/*.cross =*/
&
cross
,
/*.n_outputs =*/
n_outputs
,
/*.cb =*/
graph_get_cb
(),
...
...
@@ -1983,8 +1654,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG
(
"%s: - writing output ids
\n
"
,
__func__
);
output_reorder
();
const
auto
n_outputs
=
this
->
n_outputs
;
const
auto
&
output_ids
=
this
->
output_ids
;
...
...
@@ -2038,6 +1707,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
LLAMA_LOG_DEBUG
(
"%s: - writing KV self
\n
"
,
__func__
);
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
state_write
(
io
);
return
io
.
n_bytes
();
...
...
@@ -2121,8 +1792,13 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
}
LLAMA_LOG_DEBUG
(
"%s: - reading KV self
\n
"
,
__func__
);
kv_self
->
state_read
(
io
);
if
(
memory
)
{
LLAMA_LOG_DEBUG
(
"%s: - reading KV self
\n
"
,
__func__
);
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
state_read
(
io
);
}
return
io
.
n_bytes
();
}
...
...
@@ -2130,7 +1806,11 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
size_t
llama_context
::
state_seq_write_data
(
llama_io_write_i
&
io
,
llama_seq_id
seq_id
)
{
GGML_UNUSED
(
seq_id
);
kv_self
->
state_write
(
io
,
seq_id
);
if
(
memory
)
{
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
state_write
(
io
,
seq_id
);
}
return
io
.
n_bytes
();
}
...
...
@@ -2138,7 +1818,11 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s
size_t
llama_context
::
state_seq_read_data
(
llama_io_read_i
&
io
,
llama_seq_id
seq_id
)
{
GGML_UNUSED
(
seq_id
);
kv_self
->
state_read
(
io
,
seq_id
);
if
(
memory
)
{
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
state_read
(
io
,
seq_id
);
}
return
io
.
n_bytes
();
}
...
...
@@ -2166,6 +1850,215 @@ void llama_context::perf_reset() {
    t_p_eval_us = n_p_eval = 0;
}

//
// training
//

static void llama_set_param(struct ggml_tensor * tensor, llama_opt_param_filter param_filter, void * userdata) {
    if (!tensor || tensor->type != GGML_TYPE_F32) {
        return;
    }
    if (!param_filter(tensor, userdata)) {
        return;
    }
    if (strcmp(tensor->name, "token_embd.weight") == 0) {
        return; // FIXME
    }
    if (strcmp(tensor->name, "rope_freqs.weight") == 0) {
        return; // FIXME
    }
    ggml_set_param(tensor);
}

void llama_context::opt_init(struct llama_model * model, struct llama_opt_params lopt_params) {
    GGML_ASSERT(!opt_ctx);
    model->hparams.n_ctx_train = lopt_params.n_ctx_train > 0 ? lopt_params.n_ctx_train : n_ctx();
    const uint32_t n_batch  = std::min(this->n_batch(),  model->hparams.n_ctx_train);
    const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch);
    GGML_ASSERT(model->hparams.n_ctx_train % n_batch  == 0);
    GGML_ASSERT(n_batch                    % n_ubatch == 0);

    ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY);
    opt_params.opt_period      = n_batch / n_ubatch;
    opt_params.get_opt_pars    = lopt_params.get_opt_pars;
    opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;

    opt_ctx = ggml_opt_init(opt_params);

    llama_opt_param_filter param_filter = lopt_params.param_filter;
    void * param_filter_ud              = lopt_params.param_filter_ud;

    //llama_set_param(model->tok_embd,        param_filter, param_filter_ud); // FIXME
    llama_set_param(model->type_embd,       param_filter, param_filter_ud);
    llama_set_param(model->pos_embd,        param_filter, param_filter_ud);
    llama_set_param(model->tok_norm,        param_filter, param_filter_ud);
    llama_set_param(model->tok_norm_b,      param_filter, param_filter_ud);
    llama_set_param(model->output_norm,     param_filter, param_filter_ud);
    llama_set_param(model->output_norm_b,   param_filter, param_filter_ud);
    llama_set_param(model->output,          param_filter, param_filter_ud);
    llama_set_param(model->output_b,        param_filter, param_filter_ud);
    llama_set_param(model->output_norm_enc, param_filter, param_filter_ud);
    llama_set_param(model->cls,             param_filter, param_filter_ud);
    llama_set_param(model->cls_b,           param_filter, param_filter_ud);
    llama_set_param(model->cls_out,         param_filter, param_filter_ud);
    llama_set_param(model->cls_out_b,       param_filter, param_filter_ud);

    for (struct llama_layer & layer : model->layers) {
        for (size_t i = 0; i < sizeof(layer) / sizeof(struct ggml_tensor *); ++i) {
            llama_set_param(reinterpret_cast<struct ggml_tensor **>(&layer)[i], param_filter, param_filter_ud);
        }
    }
}

void llama_context::opt_epoch_iter(
        ggml_opt_dataset_t               dataset,
        ggml_opt_result_t                result,
        const std::vector<llama_token> & tokens,
        const std::vector<llama_token> & labels_sparse,
        llama_batch                    & batch,
        ggml_opt_epoch_callback          callback,
        bool                             train,
        int64_t                          idata_in_loop,
        int64_t                          ndata_in_loop,
        int64_t                          t_loop_start) {
    GGML_ASSERT(opt_ctx);
    const uint32_t n_ctx    = llama_model_n_ctx_train(&model);
    const uint32_t n_batch  = std::min(this->n_batch(),  n_ctx);
    const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch);

    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

    kv_self->clear();
    llama_kv_cache_guard kv_guard(kv_self);

    for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) {
        batch.n_tokens = n_batch;
        for (uint32_t pos_batch = 0; pos_batch < n_batch; ++pos_batch) {
            batch.token   [pos_batch]    = tokens[pos_ctx + pos_batch];
            batch.pos     [pos_batch]    = pos_ctx + pos_batch;
            batch.n_seq_id[pos_batch]    = 1;
            batch.seq_id  [pos_batch][0] = 0;
            batch.logits  [pos_batch]    = true;
        }

        const auto n_tokens_all = batch.n_tokens;

        n_queued_tokens += n_tokens_all;

        // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

        embd_seq.clear();

        int64_t n_outputs_all = n_tokens_all;

        llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true);

        // reserve output buffer
        if (output_reserve(n_outputs_all) < n_outputs_all) {
            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
            GGML_ABORT("TODO: handle this error");
        };

        for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) {
            llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);

            n_outputs = ubatch.n_tokens;

            // TODO: not sure if this is needed
            if (!kv_self->find_slot(ubatch)) {
                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
                GGML_ABORT("TODO: handle this error");
            }

            auto * gf = graph_init();
            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);

            struct ggml_context * ctx_compute_opt;
            {
                const size_t size_gf = ggml_graph_size(gf);
                const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
                struct ggml_init_params params = {
                    /*.mem_size   =*/ size_meta,
                    /*.mem_buffer =*/ nullptr,
                    /*.no_alloc   =*/ true,
                };
                ctx_compute_opt = ggml_init(params);
            }
            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
            ggml_opt_alloc(opt_ctx, train);
            res->set_inputs(&ubatch);
            {
                struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
                GGML_ASSERT(labels->ne[1] == n_ubatch);
                ggml_set_zero(labels);
                const float onef = 1.0f;
                for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) {
                    const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch;
                    GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]);
                    ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
                }
            }
            ggml_opt_eval(opt_ctx, result);
            if (callback) {
                callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
            }
            ggml_free(ctx_compute_opt);
        }
    }

    kv_guard.commit();
}

void llama_context::opt_epoch(
        ggml_opt_dataset_t      dataset,
        ggml_opt_result_t       result_train,
        ggml_opt_result_t       result_eval,
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
    const uint32_t n_ctx    = this->n_ctx();
    const uint32_t n_batch  = std::min(cparams.n_batch,  n_ctx);
    const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
    const int64_t  ndata    = ggml_opt_dataset_ndata(dataset);

    GGML_ASSERT(idata_split >= 0);
    GGML_ASSERT(idata_split <= ndata);

    const uint32_t ubatch_per_ctx = n_ctx / n_ubatch;

    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
    std::vector<llama_token> tokens(n_ctx);
    std::vector<llama_token> labels_sparse(n_ctx);

    int64_t idata = 0;

    int64_t t_loop_start  = ggml_time_us();
    int64_t ndata_in_loop = idata_split*ubatch_per_ctx;
    for (; idata < idata_split; ++idata) {
        constexpr bool train = true;
        const int64_t idata_in_loop = idata*ubatch_per_ctx;

        ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
        opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch,
            callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start);
    }

    t_loop_start  = ggml_time_us();
    ndata_in_loop = (ndata - idata_split)*ubatch_per_ctx;
    for (; idata < ndata; ++idata) {
        constexpr bool train = false;
        const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx;

        ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
        opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch,
            callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start);
    }

    llama_batch_free(batch);
}

//
// interface implementation
//
...
...
@@ -2193,14 +2086,14 @@ llama_context_params llama_context_default_params() {
        /*.cb_eval_user_data   =*/ nullptr,
        /*.type_k              =*/ GGML_TYPE_F16,
        /*.type_v              =*/ GGML_TYPE_F16,
        /*.logits_all          =*/ false,
        /*.abort_callback      =*/ nullptr,
        /*.abort_callback_data =*/ nullptr,
        /*.embeddings          =*/ false,
        /*.offload_kqv         =*/ true,
        /*.flash_attn          =*/ false,
        /*.no_perf             =*/ true,
        /*.op_offload          =*/ true,
        /*.cross_attn          =*/ false,
        /*.abort_callback      =*/ nullptr,
        /*.abort_callback_data =*/ nullptr,
    };

    return result;
...
...
@@ -2498,7 +2391,7 @@ void llama_kv_cache_seq_cp(
                        llama_seq_id   seq_id_dst,
                           llama_pos   p0,
                           llama_pos   p1) {
    return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
    llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
}

void llama_kv_self_seq_cp(
...
...
@@ -2512,14 +2405,14 @@ void llama_kv_self_seq_cp(
        return;
    }

    return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
    kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}

// deprecated
void llama_kv_cache_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
    return llama_kv_self_seq_keep(ctx, seq_id);
    llama_kv_self_seq_keep(ctx, seq_id);
}

void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
...
...
@@ -2528,7 +2421,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
        return;
    }

    return kv->seq_keep(seq_id);
    kv->seq_keep(seq_id);
}

// deprecated
...
...
@@ -2538,7 +2431,7 @@ void llama_kv_cache_seq_add(
                           llama_pos   p0,
                           llama_pos   p1,
                           llama_pos   delta) {
    return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
    llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
}

void llama_kv_self_seq_add(
...
...
@@ -2552,7 +2445,7 @@ void llama_kv_self_seq_add(
        return;
    }

    return kv->seq_add(seq_id, p0, p1, delta);
    kv->seq_add(seq_id, p0, p1, delta);
}

// deprecated
...
...
@@ -2562,7 +2455,7 @@ void llama_kv_cache_seq_div(
                           llama_pos   p0,
                           llama_pos   p1,
                                 int   d) {
    return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
    llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
}

void llama_kv_self_seq_div(
...
...
@@ -2576,7 +2469,7 @@ void llama_kv_self_seq_div(
        return;
    }

    return kv->seq_div(seq_id, p0, p1, d);
    kv->seq_div(seq_id, p0, p1, d);
}

// deprecated
...
...
@@ -2595,7 +2488,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
// deprecated
void llama_kv_cache_defrag(llama_context * ctx) {
    return llama_kv_self_defrag(ctx);
    llama_kv_self_defrag(ctx);
}

void llama_kv_self_defrag(llama_context * ctx) {
...
...
@@ -2604,7 +2497,8 @@ void llama_kv_self_defrag(llama_context * ctx) {
        return;
    }

    return kv->defrag();
    // force defrag
    kv->defrag_sched(-1.0f);
}

// deprecated
...
...
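A minimal sketch of what the deprecated wrappers above mean for callers: the llama_kv_cache_* entry points simply forward to the llama_kv_self_* functions, so migration is a rename. Not part of the commit; sequence ids and positions below are made up for illustration.

#include "llama.h"

static void shift_history(llama_context * ctx) {
    // old: llama_kv_cache_seq_rm(ctx, 0, 0, 32);
    llama_kv_self_seq_rm (ctx, /*seq_id=*/0, /*p0=*/0, /*p1=*/32);                  // drop the first 32 positions
    llama_kv_self_seq_add(ctx, /*seq_id=*/0, /*p0=*/32, /*p1=*/-1, /*delta=*/-32);  // shift the remaining positions down
    llama_kv_self_defrag (ctx);                                                     // queue a defrag of the cache
}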
@@ -2788,3 +2682,34 @@ void llama_perf_context_print(const llama_context * ctx) {
void llama_perf_context_reset(llama_context * ctx) {
    ctx->perf_reset();
}

//
// training
//

bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata) {
    GGML_UNUSED(tensor);
    GGML_UNUSED(userdata);
    return true;
}

void llama_opt_init(struct llama_context * ctx, struct llama_model * model, struct llama_opt_params lopt_params) {
    ctx->opt_init(model, lopt_params);
}

void llama_opt_epoch(
        struct llama_context  * ctx,
        ggml_opt_dataset_t      dataset,
        ggml_opt_result_t       result_train,
        ggml_opt_result_t       result_eval,
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
    ctx->opt_epoch(dataset, result_train, result_eval, idata_split, callback_train, callback_eval);
}
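For orientation, a sketch of how a caller might drive the new llama_opt_init/llama_opt_epoch entry points. This is not part of the commit; the ggml_opt_* helpers used for the result objects and default optimizer parameters are assumptions about the ggml-opt API, and the dataset is assumed to be built elsewhere.

static void finetune_pass(llama_context * ctx, llama_model * model,
                          ggml_opt_dataset_t dataset, int64_t idata_split) {
    llama_opt_params lopt_params {};
    lopt_params.n_ctx_train     = 0;                          // 0 -> use the context size
    lopt_params.param_filter    = llama_opt_param_filter_all; // train every F32 tensor
    lopt_params.param_filter_ud = nullptr;
    lopt_params.get_opt_pars    = ggml_opt_get_default_optimizer_params; // assumed ggml-opt helper
    lopt_params.get_opt_pars_ud = nullptr;

    llama_opt_init(ctx, model, lopt_params);

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval  = ggml_opt_result_init();

    // one pass over the dataset: [0, idata_split) trains, the rest evaluates
    llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
                    ggml_opt_epoch_callback_progress_bar,
                    ggml_opt_epoch_callback_progress_bar);

    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);
}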
llama/llama.cpp/src/llama-context.h
View file @
0cefd46f
...
...
@@ -8,6 +8,7 @@
#include "llama-kv-cache.h"
#include "ggml-cpp.h"
#include "ggml-opt.h"
#include <map>
#include <vector>
...
...
@@ -28,7 +29,12 @@ struct llama_context {
    void synchronize();

    const llama_model & get_model() const;
    const llama_model   & get_model()   const;
    const llama_cparams & get_cparams() const;

    ggml_backend_sched_t get_sched() const;

    ggml_context * get_ctx_compute() const;

    uint32_t n_ctx()         const;
    uint32_t n_ctx_per_seq() const;
...
...
@@ -130,6 +136,32 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

    //
    // training
    //

    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);

    void opt_epoch(
            ggml_opt_dataset_t      dataset,
            ggml_opt_result_t       result_train,
            ggml_opt_result_t       result_eval,
            int64_t                 idata_split,
            ggml_opt_epoch_callback callback_train,
            ggml_opt_epoch_callback callback_eval);

    void opt_epoch_iter(
            ggml_opt_dataset_t               dataset,
            ggml_opt_result_t                result,
            const std::vector<llama_token> & tokens,
            const std::vector<llama_token> & labels_sparse,
            llama_batch                    & batch,
            ggml_opt_epoch_callback          callback,
            bool                             train,
            int64_t                          idata_in_loop,
            int64_t                          ndata_in_loop,
            int64_t                          t_loop_start);

private:
    //
    // output
...
...
@@ -139,50 +171,30 @@ private:
    // Returns max number of outputs for which space was reserved.
    int32_t output_reserve(int32_t n_outputs);

    // make the outputs have the same order they had in the user-provided batch
    // TODO: maybe remove this
    void output_reorder();

    //
    // graph
    //

public:
    int32_t graph_max_nodes() const;

    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();

    // returns the result of ggml_backend_sched_graph_compute_async execution
    ggml_status graph_compute(ggml_cgraph * gf, bool batched);

private:
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
             ggml_cgraph * gf,
      const llama_ubatch & ubatch,
          llm_graph_type   gtype);

    // returns the result of ggml_backend_sched_graph_compute_async execution
    ggml_status graph_compute(ggml_cgraph * gf, bool batched);

    llm_graph_cb graph_get_cb() const;

    // used by kv_self_update()
    ggml_tensor * build_rope_shift(
            ggml_context * ctx0,
             ggml_tensor * cur,
             ggml_tensor * shift,
             ggml_tensor * factors,
                   float   freq_base,
                   float   freq_scale) const;

    llm_graph_result_ptr build_kv_self_shift(
            ggml_context * ctx0,
             ggml_cgraph * gf) const;

    llm_graph_result_ptr build_kv_self_defrag(
            ggml_context * ctx0,
             ggml_cgraph * gf,
            const std::vector<struct llama_kv_defrag_move> & moves) const;

    // TODO: read/write lora adapters and cvec
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i  & io);
...
...
@@ -199,14 +211,10 @@ private:
    llama_cparams       cparams;
    llama_adapter_cvec  cvec;
    llama_adapter_loras loras;
    llama_sbatch        sbatch;
    llama_cross         cross; // TODO: tmp for handling cross-attention - need something better probably

    std::unique_ptr<llama_kv_cache_unified> kv_self;

    // TODO: remove
    bool logits_all = false;

    std::unique_ptr<llama_memory_i> memory;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t logits_size = 0; // capacity (of floats) for logits
...
...
@@ -233,6 +241,9 @@ private:
    ggml_context_ptr ctx_compute;

    // training
    ggml_opt_context_t opt_ctx = nullptr;

    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;
...
...
llama/llama.cpp/src/llama-cparams.h
View file @
0cefd46f
...
...
@@ -29,8 +29,9 @@ struct llama_cparams {
    bool offload_kqv;
    bool flash_attn;
    bool no_perf;
    bool cross_attn;
    bool warmup;
    bool op_offload;
    bool cross_attn;

    enum llama_pooling_type pooling_type;
...
...
llama/llama.cpp/src/llama-graph.cpp
View file @
0cefd46f
...
...
@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
        for (uint32_t i = 0; i < n_kv; ++i) {
            const uint32_t cell_id = i + kv_self->head;

            //////////////////////////////////////////////
            // TODO: this should not mutate the KV cache !
            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];

            // prevent out-of-bound sources
            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
                kv_cell.src = cell_id;
            }

            data[i] = kv_cell.src;

            // TODO: do not mutate the KV cache
            // ensure copy only happens once
            if (kv_cell.src != (int32_t) cell_id) {
                kv_cell.src = cell_id;
            }
            data[i] = kv_self->s_copy(i);
        }
    }
}
...
...
@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
        // clear unused states
        for (int i = 0; i < n_kv; ++i) {
            const uint32_t cell_id = i + kv_self->head;

            //////////////////////////////////////////////
            // TODO: this should not mutate the KV cache !
            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];

            data[i] = (float) (kv_cell.src >= 0);

            // only clear once
            if (kv_cell.src < 0) {
                kv_cell.src = cell_id;
            }
            data[i] = kv_self->s_mask(i);
        }
    }
}
...
...
@@ -816,7 +788,7 @@ ggml_tensor * llm_graph_context::build_ffn(
            } break;
    }

    if (type_gate == LLM_FFN_PAR) {
    if (gate && type_gate == LLM_FFN_PAR) {
        cur = ggml_mul(ctx0, cur, tmp);
        cb(cur, "ffn_gate_par", il);
    }
...
...
@@ -1005,6 +977,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        //cb(inp->tokens, "inp_tokens", -1);
        ggml_set_input(inp->tokens);
        res->t_tokens = inp->tokens;

        cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
...
...
@@ -1111,7 +1084,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
}

ggml_tensor * llm_graph_context::build_inp_s_copy() const {
    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

    auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
...
...
@@ -1128,7 +1101,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
}

ggml_tensor * llm_graph_context::build_inp_s_mask() const {
    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
...
...
@@ -1261,8 +1234,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

        if (v_mla) {
#if 0
            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
            cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
#else
            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
            // The permutations are noops and only change how the tensor data is interpreted.
            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
#endif
        }

        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
...
...
@@ -1442,8 +1426,6 @@ ggml_tensor * llm_graph_context::build_attn(
    // store to KV cache
    {
        GGML_ASSERT(!kv_self->recurrent);

        const auto kv_head = kv_self->head;

        GGML_ASSERT(kv_self->size == n_ctx);
...
...
@@ -1612,7 +1594,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
             int32_t   n_state,
             int32_t   n_seqs) const {
    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

    const auto n_kv    = kv_self->n;
    const auto kv_head = kv_self->head;
...
...
@@ -1644,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
  const llama_ubatch & ubatch,
                 int   il) const {
    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

    const auto token_shift_count = hparams.token_shift_count;
...
...
@@ -1665,7 +1647,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
  const llama_ubatch & ubatch,
                 int   il) const {
    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

    const auto token_shift_count = hparams.token_shift_count;
    const auto n_embd            = hparams.n_embd;
...
...
llama/llama.cpp/src/llama-graph.h
View file @
0cefd46f
...
...
@@ -19,6 +19,7 @@ struct llama_cparams;
class llama_memory_i;
class llama_kv_cache_unified;
class llama_kv_cache_recurrent;

// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {
...
...
@@ -187,26 +188,26 @@ public:

class llm_graph_input_s_copy : public llm_graph_input_i {
public:
    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_copy; // I32 [kv_size]

    const llama_kv_cache_unified * kv_self;
    const llama_kv_cache_recurrent * kv_self;
};

class llm_graph_input_s_mask : public llm_graph_input_i {
public:
    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_mask; // F32 [1, n_kv]

    const llama_kv_cache_unified * kv_self;
    const llama_kv_cache_recurrent * kv_self;
};

class llm_graph_input_cross_embd : public llm_graph_input_i {
...
...
@@ -308,6 +309,7 @@ class llm_graph_result_i {
public:
    virtual ~llm_graph_result_i() = default;

    virtual ggml_tensor * get_tokens()      = 0;
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
...
...
@@ -322,6 +324,7 @@ class llm_graph_result : public llm_graph_result_i {
public:
    virtual ~llm_graph_result() = default;

    ggml_tensor * get_tokens()      override { return t_tokens; }
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
...
...
@@ -338,6 +341,7 @@ public:
    }

    // important graph nodes
    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
...
...
@@ -361,8 +365,8 @@ struct llm_graph_params {
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;

    ggml_backend_sched * sched;
    ggml_backend * backend_cpu;
    ggml_backend_sched_t sched;
    ggml_backend_t backend_cpu;

    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
...
...
@@ -413,9 +417,9 @@ struct llm_graph_context {
    ggml_context * ctx0 = nullptr;

    ggml_backend_sched * sched;
    ggml_backend_sched_t sched;

    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
...
...
llama/llama.cpp/src/llama-kv-cache.cpp
View file @
0cefd46f
...
...
@@ -4,33 +4,41 @@
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-model.h"
#include "llama-context.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>
#include <map>
#include <stdexcept>

llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
//
// llama_kv_cache_unified
//

uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
    // the FA kernels require padding to avoid extra runtime boundary checks
    return cparams.flash_attn ? 256u : 32u;
}

bool llama_kv_cache_unified::init(
llama_kv_cache_unified::llama_kv_cache_unified(
        const llama_model & model,
      const llama_cparams & cparams,
                ggml_type   type_k,
                ggml_type   type_v,
                     bool   v_trans,
                     bool   offload,
                 uint32_t   kv_size,
                     bool   offload) {
                 uint32_t   padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
    const int32_t n_layer = hparams.n_layer;

    has_shift = false;
    can_shift = true;

    recurrent = llama_model_is_recurrent(&model);
    v_trans   = !recurrent && !cparams.flash_attn;
    can_shift = !recurrent;

    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n",
            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding);
    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
            __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);

    GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");

    head = 0;
    size = kv_size;
...
...
@@ -76,23 +84,20 @@ bool llama_kv_cache_unified::init(
        const char * dev_name = "CPU";

        ggml_backend_buffer_type_t buft;
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        if (offload) {
            auto * dev = model.dev_layer(i);
            buft = ggml_backend_dev_buffer_type(dev);

            dev_name = ggml_backend_dev_name(dev);
        } else {
            buft = ggml_backend_cpu_buffer_type();
        }

        LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
        LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name);

        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
            return false;
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

        ggml_tensor * k, * v;
...
...
@@ -118,55 +123,28 @@ bool llama_kv_cache_unified::init(
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
            return false;
            throw std::runtime_error("failed to allocate buffer for kv cache");
        }
        ggml_backend_buffer_clear(buf, 0);
        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
        bufs.emplace_back(buf);
    }

    return true;
}

int32_t llama_kv_cache_unified::get_n_tokens() const {
    int32_t result = 0;

    for (uint32_t i = 0; i < size; i++) {
        result += cells[i].seq_id.size();
    }

    return result;
}

int32_t llama_kv_cache_unified::get_used_cells() const {
    return used;
}

size_t llama_kv_cache_unified::total_size() const {
    size_t size = 0;
    for (const auto & buf : bufs) {
        size += ggml_backend_buffer_get_size(buf.get());
    }
    return size;
}

    {
        const size_t memory_size_k = size_k_bytes();
        const size_t memory_size_v = size_v_bytes();

llama_pos llama_kv_cache_unified::pos_max() const {
    llama_pos pos_max = -1;
    for (const auto & cell : cells) {
        pos_max = std::max(pos_max, cell.pos);
        LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
    }
    return pos_max;
}

void llama_kv_cache_unified::clear() {
    for (int32_t i = 0; i < (int32_t) size; ++i) {
        cells[i].pos = -1;
        cells[i].seq_id.clear();
        cells[i].src = -1;
        cells[i].tail = -1;
    }
    head = 0;
    used = 0;
...
...
@@ -187,35 +165,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
        p1 = std::numeric_limits<llama_pos>::max();
    }

    // models like Mamba or RWKV can't have a state partially erased
    if (recurrent) {
        if (seq_id >= (int64_t) size) {
            // could be fatal
            return false;
        }
        if (0 <= seq_id) {
            int32_t & tail_id = cells[seq_id].tail;
            if (tail_id >= 0) {
                const llama_kv_cell & cell = cells[tail_id];
                // partial intersection is invalid
                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
                    return false;
                }
                // invalidate tails which will be cleared
                if (p0 <= cell.pos && cell.pos < p1) {
                    tail_id = -1;
                }
            }
        } else {
            // seq_id is negative, then the range should include everything or nothing
            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
                return false;
            }
        }

        return true;
    }

    for (uint32_t i = 0; i < size; ++i) {
        if (cells[i].pos >= p0 && cells[i].pos < p1) {
            if (seq_id < 0) {
...
@@ -232,7 +181,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
}
cells
[
i
].
pos
=
-
1
;
cells
[
i
].
src
=
-
1
;
if
(
new_head
==
size
)
{
new_head
=
i
;
...
...
@@ -262,34 +210,6 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
        p1 = std::numeric_limits<llama_pos>::max();
    }

    if (recurrent) {
        if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
            llama_kv_cell & tail_src = cells[seq_id_src];
            llama_kv_cell & tail_dst = cells[seq_id_dst];
            if (tail_dst.tail >= 0) {
                // clear destination seq_id if it wasn't empty
                llama_kv_cell & cell_dst = cells[tail_dst.tail];

                cell_dst.seq_id.erase(seq_id_dst);
                tail_dst.tail = -1;
                if (cell_dst.seq_id.empty()) {
                    cell_dst.pos   = -1;
                    cell_dst.delta = -1;
                    cell_dst.src   = -1;
                    used -= 1;
                }
            }
            if (tail_src.tail >= 0) {
                llama_kv_cell & cell_src = cells[tail_src.tail];

                cell_src.seq_id.insert(seq_id_dst);
                tail_dst.tail = tail_src.tail;
            }
        }

        return;
    }

    // otherwise, this is the KV of a Transformer-like model
    head = 0;
...
...
@@ -304,17 +224,12 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
    uint32_t new_head = size;

    for (uint32_t i = 0; i < size; ++i) {
        if (recurrent && (llama_seq_id) i != seq_id) {
            cells[i].tail = -1;
        }

        if (!cells[i].has_seq_id(seq_id)) {
            if (cells[i].pos >= 0) {
                used--;
            }

            cells[i].pos = -1;
            cells[i].src = -1;
            cells[i].seq_id.clear();

            if (new_head == size){
...
...
@@ -352,20 +267,6 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
        return;
    }

    if (recurrent) {
        // for Mamba-like or RWKV models, only the pos needs to be shifted
        if (0 <= seq_id && seq_id < (int64_t) size) {
            const int32_t tail_id = cells[seq_id].tail;
            if (tail_id >= 0) {
                llama_kv_cell & cell = cells[tail_id];
                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
                    cell.pos += delta;
                }
            }
        }
        return;
    }

    for (uint32_t i = 0; i < size; ++i) {
        if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
            has_shift = true;
...
...
@@ -408,21 +309,6 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
        return;
    }

    if (recurrent) {
        // for Mamba-like or RWKV models, only the pos needs to be changed
        if (0 <= seq_id && seq_id < (int64_t) size) {
            const int32_t tail_id = cells[seq_id].tail;
            if (tail_id >= 0) {
                llama_kv_cell & cell = cells[tail_id];
                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
                    cell.pos /= d;
                }
            }
        }
        return;
    }

    for (uint32_t i = 0; i < size; ++i) {
        if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
            has_shift = true;
...
...
@@ -448,23 +334,11 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
    return result;
}

void llama_kv_cache_unified::defrag() {
    if (!recurrent) {
        do_defrag = true;
    }
}

void llama_kv_cache_unified::restore() {
    if (pending.ranges.empty()) {
        return;
    }

    // TODO: tmp - move to llama_kv_cache_recurrent
    if (recurrent) {
        seq_rm(-1, -1, -1);
        return;
    }

    uint32_t new_head = size;

    for (auto & range : pending.ranges) {
...
@@ -477,7 +351,6 @@ void llama_kv_cache_unified::restore() {
}
cells
[
i
].
pos
=
-
1
;
cells
[
i
].
src
=
-
1
;
}
new_head
=
std
::
min
(
new_head
,
range
.
c0
);
...
...
@@ -489,11 +362,6 @@ void llama_kv_cache_unified::restore() {
}

void llama_kv_cache_unified::commit() {
    // TODO: tmp - move to llama_kv_cache_recurrent
    if (recurrent) {
        return;
    }

    if (pending.ranges.empty()) {
        LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
            __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
...
...
@@ -503,183 +371,115 @@ void llama_kv_cache_unified::commit() {
    pending.ranges.clear();
}

bool llama_kv_cache_unified::get_can_shift() const {
    return can_shift;
}

bool llama_kv_cache_unified::update(llama_context & lctx) {
    auto * sched = lctx.get_sched();
bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
    const uint32_t n_tokens     = ubatch.n_tokens;
    const uint32_t n_seqs       = ubatch.n_seqs;
    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

    if (has_shift) {
        if (!get_can_shift()) {
            GGML_ABORT("The current KV cache / model configuration does not support K-shift");
        }

    // if we have enough unused cells before the current head ->
    //   better to start searching from the beginning of the cache, hoping to fill it
    if (head > used + 2*ubatch.n_tokens) {
        head = 0;
    }
        LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);

    if (recurrent) {
        // For recurrent state architectures (like Mamba or RWKV),
        // each cache cell can store the state for a whole sequence.
        // A slot should be always be contiguous.
        // apply K-shift if needed
        if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
            ggml_backend_sched_reset(sched);

        // can only process batches with an equal number of new tokens in each sequence
        GGML_ASSERT(ubatch.equal_seqs);
            auto * gf = lctx.graph_init();

        int32_t min = size - 1;
        int32_t max = 0;
            auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);

        // everything should fit if all seq_ids are smaller than the max
        for (uint32_t s = 0; s < n_seqs; ++s) {
            const uint32_t n_seq_id = ubatch.n_seq_id[s];
            for (uint32_t j = 0; j < n_seq_id; ++j) {
                const llama_seq_id seq_id = ubatch.seq_id[s][j];
            ggml_backend_sched_alloc_graph(sched, gf);

                if (seq_id < 0 || (uint32_t) seq_id >= size) {
                    // too big seq_id
                    // TODO: would it be possible to resize the cache instead?
                    LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
                    return false;
                }
                if (j > 0) {
                    llama_kv_cell & seq = cells[seq_id];
                    if (seq.tail >= 0) {
                        llama_kv_cell & cell = cells[seq.tail];
                        // clear cells from seq_ids that become shared
                        // (should not normally happen, but let's handle it anyway)
                        cell.seq_id.erase(seq_id);
                        seq.tail = -1;
                        if (cell.seq_id.empty()) {
                            cell.pos = -1;
                            cell.src = -1;
                            used -= 1;
                        }
                    }
                }
            }
            res->set_inputs(nullptr);

            lctx.graph_compute(gf, false);
        }

#ifndef NDEBUG
        {
            std::vector<int32_t> tails_verif;
            tails_verif.assign(size, -1);
            for (uint32_t i = 0; i < size; ++i) {
                llama_kv_cell & cell = cells[i];
                for (llama_seq_id seq_id : cell.seq_id) {
                    if (tails_verif[seq_id] != -1) {
                        LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
                    }
                    tails_verif[seq_id] = i;
                }
            }
        has_shift = false;

            for (uint32_t i = 0; i < size; ++i) {
                if (tails_verif[i] != cells[i].tail) {
                    LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
                }
        for (uint32_t i = 0; i < size; ++i) {
            cells[i].delta = 0;
        }
        }
#endif

        // find next empty cell
        uint32_t next_empty_cell = head;
    }

        for (uint32_t i = 0; i < size; ++i) {
            if (next_empty_cell >= size) {
                next_empty_cell -= size;
            }
            llama_kv_cell & cell = cells[next_empty_cell];
            if (cell.is_empty()) {
                break;
            }
            next_empty_cell += 1;
    if (do_defrag) {
        LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);

        const uint32_t n_max_nodes = lctx.graph_max_nodes();
        const uint32_t max_moves   = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
        if (!defrag_prepare(n_max_nodes)) {
            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
            return false;
        }

        // find usable cell range
        for (uint32_t s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];
            llama_kv_cell & seq_meta = cells[seq_id];
            bool has_cell = false;
            if (seq_meta.tail >= 0) {
                llama_kv_cell & cell = cells[seq_meta.tail];
                GGML_ASSERT(cell.has_seq_id(seq_id));
                // does this seq_id "own" the cell?
                if (cell.seq_id.size() == 1) {
                    has_cell = true;
                }
            }
            if (!has_cell) {
                llama_kv_cell & empty_cell = cells[next_empty_cell];
                GGML_ASSERT(empty_cell.is_empty());
                // copy old tail into the empty cell
                if (seq_meta.tail >= 0) {
                    llama_kv_cell & orig_cell = cells[seq_meta.tail];
                    empty_cell.pos = orig_cell.pos;
                    empty_cell.src = orig_cell.src;
                    orig_cell.seq_id.erase(seq_id);
                    empty_cell.seq_id.insert(seq_id); // will be overwritten
                }
                seq_meta.tail = next_empty_cell;
                // find next empty cell
                if (s + 1 < n_seqs) {
                    next_empty_cell += 1;
                    for (uint32_t i = 0; i < size; ++i) {
                        if (next_empty_cell >= size) {
                            next_empty_cell -= size;
                        }
                        llama_kv_cell & cell = cells[next_empty_cell];
                        if (cell.is_empty()) {
                            break;
                        }
                        next_empty_cell += 1;
                    }
                }
            }
            if (min > seq_meta.tail) { min = seq_meta.tail; }
            if (max < seq_meta.tail) { max = seq_meta.tail; }
        }

        for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) {
            std::vector<struct llama_kv_defrag_move> chunk;
            auto end = std::min(i + max_moves, defrag_info.moves.size());
            chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end);

        // gather and re-order
        for (uint32_t s = 0; s < n_seqs; ++s) {
            int32_t dst_id = s + min;
            int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
            if (dst_id != src_id) {
                llama_kv_cell & dst_cell = cells[dst_id];
                llama_kv_cell & src_cell = cells[src_id];
            ggml_backend_sched_reset(sched);

                std::swap(dst_cell.pos,    src_cell.pos);
                std::swap(dst_cell.src,    src_cell.src);
                std::swap(dst_cell.seq_id, src_cell.seq_id);
            auto * gf = lctx.graph_init();

                // swap tails (assuming they NEVER overlap)
                for (const llama_seq_id seq_id : src_cell.seq_id) {
                    cells[seq_id].tail = src_id;
                }
                for (const llama_seq_id seq_id : dst_cell.seq_id) {
                    cells[seq_id].tail = dst_id;
                }
            }
        }
            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk);

        // update the pos of the used seqs
        for (uint32_t s = 0; s < n_seqs; ++s) {
            const llama_pos last_pos = ubatch.pos[n_seq_tokens*s + n_seq_tokens - 1];
            int32_t cell_id = s + min;
            llama_kv_cell & cell = cells[cell_id];
            ggml_backend_sched_alloc_graph(sched, gf);

            if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
                // What should happen when the pos backtracks or skips a value?
                // Clearing the state mid-batch would require special-casing which isn't done.
                LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
                    __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
            }
            cell.pos = last_pos;
            cell.seq_id.clear();
            for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
                const llama_seq_id seq_id = ubatch.seq_id[s][j];
                cell.seq_id.insert(seq_id);
                cells[seq_id].tail = cell_id;
            }
            res->set_inputs(nullptr);

            lctx.graph_compute(gf, false);
        }

        // allow getting the range of used cells, from head to head + n
        head = min;
        n    = max - min + 1;
        used = std::count_if(cells.begin(), cells.end(),
            [](const llama_kv_cell & cell){ return !cell.is_empty(); });

        do_defrag = false;
    }

    // we never need to reserve a worst case graph
    return false;
}

void llama_kv_cache_unified::defrag_sched(float thold) {
    // - do not defrag small contexts (i.e. < 2048 tokens)
    // - count the padding towards the number of used tokens
    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;

    // queue defragmentation for next llama_kv_cache_update
    if (fragmentation > thold) {
        LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);

        do_defrag = true;
    }
}

void llama_kv_cache_unified::set_full() {
    n = size;
}

llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
    return llama_sbatch(batch, batch.n_embd, true, logits_all);
}

llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
    GGML_UNUSED(embd_pooled);
    return sbatch.split_simple(n_ubatch);
}

bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
    const uint32_t n_tokens     = ubatch.n_tokens;
    const uint32_t n_seqs       = ubatch.n_seqs;
    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

        // sanity check
        return n >= n_seqs;
    // if we have enough unused cells before the current head ->
    //   better to start searching from the beginning of the cache, hoping to fill it
    if (head > used + 2*ubatch.n_tokens) {
        head = 0;
    }

    // otherwise, one cell per token.
...
...
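A worked example (not from the diff) of the fragmentation heuristic in defrag_sched() above; the cell counts are made up for illustration.

#include <algorithm>
#include <cstdio>

int main() {
    const float n       = 4096.0f; // cells in the active [0, n) window
    const float used    = 2048.0f; // non-empty cells
    const float padding = 32.0f;   // padding counted as used

    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (used + padding)/n) : 0.0f;

    // prints ~0.49 -> above a typical threshold such as 0.1, so a defrag would be queued
    std::printf("fragmentation = %.2f\n", fragmentation);
}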
@@ -733,24 +533,50 @@ bool llama_kv_cache_unified::find_slot(
    pending.ranges.push_back({head, head + n_tokens});

    // a heuristic, to avoid attending the full cache if it is not yet utilized
    // after enough generations, the benefit from this heuristic disappears
    // if we start defragmenting the cache, the benefit from this will be more important
    n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));

    //printf("n = %5d, used = %5d, head = %5d\n", n, used, head);

    return true;
}

uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const {
    // the FA kernels require padding to avoid extra runtime boundary checks
    return cparams.flash_attn ? 256u : 32u;
int32_t llama_kv_cache_unified::get_n_tokens() const {
    int32_t result = 0;

    for (uint32_t i = 0; i < size; i++) {
        result += cells[i].seq_id.size();
    }

    return result;
}

uint32_t llama_kv_cache_unified::cell_max() const {
    for (uint32_t i = size; i > 0; --i) {
        const llama_kv_cell & cell = cells[i - 1];
int32_t llama_kv_cache_unified::get_used_cells() const {
    return used;
}

        if (cell.pos >= 0 && !cell.is_empty()) {
            return i;
        }
bool llama_kv_cache_unified::get_can_shift() const {
    return can_shift;
}

llama_pos llama_kv_cache_unified::get_pos_max() const {
    llama_pos pos_max = -1;
    for (const auto & cell : cells) {
        pos_max = std::max(pos_max, cell.pos);
    }
    return 0;
    return pos_max;
}

size_t llama_kv_cache_unified::total_size() const {
    size_t size = 0;
    for (const auto & buf : bufs) {
        size += ggml_backend_buffer_get_size(buf.get());
    }
    return size;
}

size_t llama_kv_cache_unified::size_k_bytes() const {
...
@@ -773,73 +599,321 @@ size_t llama_kv_cache_unified::size_v_bytes() const {
return
size_v_bytes
;
}
bool
llama_kv_cache_unified
::
defrag_prepare
(
int32_t
n_max_nodes
)
{
const
uint32_t
n_layer
=
hparams
.
n_layer
;
ggml_tensor
*
llama_kv_cache_unified
::
build_rope_shift
(
const
llama_cparams
&
cparams
,
ggml_context
*
ctx
,
ggml_tensor
*
cur
,
ggml_tensor
*
shift
,
ggml_tensor
*
factors
,
float
freq_base
,
float
freq_scale
)
const
{
const
auto
&
n_ctx_orig
=
cparams
.
n_ctx_orig_yarn
;
const
uint32_t
n_kv
=
cell_max
();
const
uint32_t
n_used
=
used
;
const
auto
&
yarn_ext_factor
=
cparams
.
yarn_ext_factor
;
const
auto
&
yarn_beta_fast
=
cparams
.
yarn_beta_fast
;
const
auto
&
yarn_beta_slow
=
cparams
.
yarn_beta_slow
;
assert
(
n_used
<=
n_kv
);
const
auto
&
n_rot
=
hparams
.
n_rot
;
const
auto
&
rope_type
=
hparams
.
rope_type
;
defrag_info
.
moves
.
clear
();
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const
float
yarn_attn_factor
=
model
.
arch
==
LLM_ARCH_DEEPSEEK2
?
1.0
f
/
(
1.0
f
+
0.1
f
*
logf
(
1.0
f
/
freq_scale
))
:
cparams
.
yarn_attn_factor
;
// determine which KV cells to move where
//
// cell i moves to ids[i]
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
std
::
vector
<
uint32_t
>
ids
(
n_kv
,
n_kv
);
ggml_tensor
*
tmp
;
for
(
uint32_t
i0
=
0
;
i0
<
n_used
;
++
i0
)
{
const
auto
&
cell0
=
cells
[
i0
];
if
(
ggml_is_quantized
(
cur
->
type
))
{
// dequantize to f32 -> RoPE -> quantize back
tmp
=
ggml_cast
(
ctx
,
cur
,
GGML_TYPE_F32
);
if
(
!
cell0
.
is_empty
())
{
ids
[
i0
]
=
i0
;
tmp
=
ggml_rope_ext
(
ctx
,
tmp
,
shift
,
factors
,
n_rot
,
rope_type
,
n_ctx_orig
,
freq_base
,
freq_scale
,
yarn_ext_factor
,
yarn_attn_factor
,
yarn_beta_fast
,
yarn_beta_slow
);
continue
;
}
tmp
=
ggml_cpy
(
ctx
,
tmp
,
cur
);
}
else
{
// we rotate only the first n_rot dimensions
tmp
=
ggml_rope_ext_inplace
(
ctx
,
cur
,
shift
,
factors
,
n_rot
,
rope_type
,
n_ctx_orig
,
freq_base
,
freq_scale
,
yarn_ext_factor
,
yarn_attn_factor
,
yarn_beta_fast
,
yarn_beta_slow
);
}
// found a hole - fill it with data from the end of the cache
return
tmp
;
}
uint32_t
nh
=
1
;
class
llm_graph_input_k_shift
:
public
llm_graph_input_i
{
public:
llm_graph_input_k_shift
(
const
llama_kv_cache_unified
*
kv_self
)
:
kv_self
(
kv_self
)
{}
virtual
~
llm_graph_input_k_shift
()
=
default
;
// determine the size of the hole
while
(
i0
+
nh
<
n_used
&&
cells
[
i0
+
nh
].
is_empty
())
{
nh
++
;
}
void
set_input
(
const
llama_ubatch
*
ubatch
)
override
;
uint32_t
nf
=
0
;
uint32_t
is
=
n_kv
-
1
;
ggml_tensor
*
k_shift
;
// I32 [kv_size]
// starting from the end, find nh non-empty cells
for
(;
is
>
i0
;
--
is
)
{
const
auto
&
cell1
=
cells
[
is
];
const
llama_kv_cache_unified
*
kv_self
;
};
if
(
cell1
.
is_empty
()
||
ids
[
is
]
!=
n_kv
)
{
continue
;
}
void
llm_graph_input_k_shift
::
set_input
(
const
llama_ubatch
*
ubatch
)
{
GGML_UNUSED
(
ubatch
);
// non-empty cell which is not yet moved
nf
++
;
if
(
k_shift
)
{
assert
(
ggml_backend_buffer_is_host
(
k_shift
->
buffer
))
;
if
(
nf
==
nh
)
{
break
;
}
int32_t
*
data
=
(
int32_t
*
)
k_shift
->
data
;
for
(
uint32_t
i
=
0
;
i
<
kv_self
->
size
;
++
i
)
{
data
[
i
]
=
kv_self
->
cells
[
i
].
delta
;
}
}
}
// this can only happen if `n_used` is not accurate, which would be a bug
GGML_ASSERT
(
nf
==
nh
&&
"KV defrag bug: nf != nh"
);
llm_graph_result_ptr
llama_kv_cache_unified
::
build_graph_shift
(
const
llama_cparams
&
cparams
,
ggml_context
*
ctx
,
ggml_cgraph
*
gf
)
const
{
auto
res
=
std
::
make_unique
<
llm_graph_result
>
();
nf
=
0
;
const
auto
&
n_layer
=
hparams
.
n_layer
;
uint32_t
i1
=
is
;
const
auto
&
n_embd_head_k
=
hparams
.
n_embd_head_k
;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
// are we moving a continuous block of memory?
bool
cont
=
false
;
const
uint32_t
n_ctx_per_seq
=
cparams
.
n_ctx
/
cparams
.
n_seq_max
;
// go back and move the nf cells to the hole
for
(;
i1
<
n_kv
;
++
i1
)
{
//GGML_ASSERT(kv_self->size == n_ctx);
auto
inp
=
std
::
make_unique
<
llm_graph_input_k_shift
>
(
this
);
inp
->
k_shift
=
ggml_new_tensor_1d
(
ctx
,
GGML_TYPE_I32
,
cparams
.
n_ctx
);
ggml_set_input
(
inp
->
k_shift
);
for
(
uint32_t
il
=
0
;
il
<
n_layer
;
++
il
)
{
const
int64_t
n_head_kv
=
hparams
.
n_head_kv
(
il
);
const
int64_t
n_embd_k_gqa
=
hparams
.
n_embd_k_gqa
(
il
);
const
bool
is_swa
=
hparams
.
is_swa
(
il
);
// note: the swa rope params could become part of the cparams in the future
// if we decide to make them configurable, like the non-sliding ones
const
float
freq_base_l
=
is_swa
?
hparams
.
rope_freq_base_train_swa
:
cparams
.
rope_freq_base
;
const
float
freq_scale_l
=
is_swa
?
hparams
.
rope_freq_scale_train_swa
:
cparams
.
rope_freq_scale
;
ggml_tensor
*
rope_factors
=
model
.
get_rope_factors
(
n_ctx_per_seq
,
il
);
ggml_tensor
*
k
=
ggml_view_3d
(
ctx
,
k_l
[
il
],
n_embd_head_k
,
n_head_kv
,
size
,
ggml_row_size
(
k_l
[
il
]
->
type
,
n_embd_head_k
),
ggml_row_size
(
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
0
);
ggml_tensor
*
cur
=
build_rope_shift
(
cparams
,
ctx
,
k
,
inp
->
k_shift
,
rope_factors
,
freq_base_l
,
freq_scale_l
);
ggml_build_forward_expand
(
gf
,
cur
);
}
res
->
add_input
(
std
::
move
(
inp
));
return
res
;
}
llm_graph_result_ptr
llama_kv_cache_unified
::
build_graph_defrag
(
const
llama_cparams
&
cparams
,
ggml_context
*
ctx
,
ggml_cgraph
*
gf
,
const
std
::
vector
<
struct
llama_kv_defrag_move
>
&
moves
)
const
{
auto
res
=
std
::
make_unique
<
llm_graph_result
>
();
#if 0
// CPU defrag
//
// TODO: optimizations are possible:
// - multiple threads
// - avoid copying to the host memory when already there
//
// likely not worth the effort, as we have ggml_graph based defrag
//
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t kv_size = size;
std::vector<uint8_t> buf_k;
std::vector<uint8_t> buf_v;
for (uint32_t il = 0; il < n_layer; ++il) {
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
const size_t v_size_el = ggml_type_size(v_l[il]->type);
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
buf_k.resize(k_size);
buf_v.resize(v_size);
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
// batch move [i, i+nm) to [id, id+nm)
// note: cells can move only to a lower index
for (uint32_t i = 0; i < n_kv; ++i) {
const uint32_t id = ids[i];
if (i == id || id == n_kv) {
continue;
}
uint32_t nm = 1;
while (i + nm < n_kv && ids[i + nm] == id + nm) {
nm++;
}
// move keys
{
const int64_t os = i*k_size_row;
const int64_t od = id*k_size_row;
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
}
// move values (note: they are transposed)
{
const int64_t os = i;
const int64_t od = id;
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
}
}
i += nm - 1;
}
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
    for (const auto & move : moves) {
        for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
                    n_embd_k_gqa, move.len,
                    ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src));

            ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
                    n_embd_k_gqa, move.len,
                    ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst));

            ggml_tensor * view_v_src;
            ggml_tensor * view_v_dst;

            if (cparams.flash_attn) {
                // NOTE: the V cache is not transposed when using flash attention
                view_v_src = ggml_view_2d(ctx, v_l[il],
                        n_embd_v_gqa, move.len,
                        ggml_row_size(v_l[il]->type, n_embd_v_gqa),
                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst));

                view_v_dst = ggml_view_2d(ctx, v_l[il],
                        move.len, n_embd_v_gqa,
                        ggml_row_size(v_l[il]->type, n_embd_v_gqa),
                        ggml_row_size(v_l[il]->type, move.src));
            } else {
                view_v_src = ggml_view_2d(ctx, v_l[il],
                        move.len, n_embd_v_gqa,
                        ggml_row_size(v_l[il]->type, size),
                        ggml_row_size(v_l[il]->type, move.src));

                view_v_dst = ggml_view_2d(ctx, v_l[il],
                        move.len, n_embd_v_gqa,
                        ggml_row_size(v_l[il]->type, size),
                        ggml_row_size(v_l[il]->type, move.dst));
            }

            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
        }
    }

    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif

    return res;
}

bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
    const uint32_t n_layer = hparams.n_layer;

    const uint32_t n_kv   = cell_max();
    const uint32_t n_used = used;

    assert(n_used <= n_kv);

    defrag_info.moves.clear();

    // determine which KV cells to move where
    //
    //  cell i moves to ids[i]
    //
    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
    //
    std::vector<uint32_t> ids(n_kv, n_kv);

    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
        const auto & cell0 = cells[i0];

        if (!cell0.is_empty()) {
            ids[i0] = i0;

            continue;
        }

        // found a hole - fill it with data from the end of the cache

        uint32_t nh = 1;

        // determine the size of the hole
        while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
            nh++;
        }

        uint32_t nf = 0;
        uint32_t is = n_kv - 1;

        // starting from the end, find nh non-empty cells
        for (; is > i0; --is) {
            const auto & cell1 = cells[is];

            if (cell1.is_empty() || ids[is] != n_kv) {
                continue;
            }

            // non-empty cell which is not yet moved
            nf++;

            if (nf == nh) {
                break;
            }
        }

        // this can only happen if `n_used` is not accurate, which would be a bug
        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");

        nf = 0;

        uint32_t i1 = is;

        // are we moving a continuous block of memory?
        bool cont = false;

        // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
            auto & cell1 = cells[i1];

            if (cell1.is_empty() || ids[i1] != n_kv) {
...
...
@@ -854,7 +928,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
            cells[i0 + nf] = cell1;

            // clear the old cell and move the head there
            cell1 = llama_kv_cell();
            cell1 = kv_cell();
            head = n_used;

            if (!cont) {
...
...
@@ -885,6 +959,18 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
    return true;
}

uint32_t llama_kv_cache_unified::cell_max() const {
    for (uint32_t i = size; i > 0; --i) {
        const kv_cell & cell = cells[i - 1];

        if (cell.pos >= 0 && !cell.is_empty()) {
            return i;
        }
    }

    return 0;
}

void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
    uint32_t cell_count = 0;
...
...
@@ -1093,7 +1179,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
    clear();

    for (uint32_t i = 0; i < cell_count; ++i) {
        llama_kv_cell & cell = cells[i];
        kv_cell & cell = cells[i];

        llama_pos pos;
        uint32_t  n_seq_id;
...
...
@@ -1116,15 +1202,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
            }

            cell.seq_id.insert(seq_id);

            if (recurrent) {
                int32_t & tail = cells[seq_id].tail;
                if (tail != -1) {
                    LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
                    return false;
                }
                tail = i;
            }
        }
    }
...
...
@@ -1132,18 +1209,1037 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
used
=
cell_count
;
}
if
(
recurrent
)
{
for
(
uint32_t
i
=
0
;
i
<
cell_count
;
++
i
)
{
uint32_t
cell_id
=
head
+
i
;
// make sure the recurrent states will keep their restored state
cells
[
cell_id
].
src
=
cell_id
;
return
true
;
}
bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
    uint32_t v_trans;
    uint32_t n_layer;
    io.read_to(&v_trans, sizeof(v_trans));
    io.read_to(&n_layer, sizeof(n_layer));

    if (n_layer != hparams.n_layer) {
        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
        return false;
    }

    if (cell_count > size) {
        LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
        return false;
    }

    if (this->v_trans != (bool) v_trans) {
        LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
        return false;
    }

    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
    for (uint32_t il = 0; il < n_layer; ++il) {
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();

        // Read type of key
        int32_t k_type_i_ref;
        io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
        const int32_t k_type_i = (int32_t) k_l[il]->type;
        if (k_type_i != k_type_i_ref) {
            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
            return false;
        }

        // Read row size of key
        uint64_t k_size_row_ref;
        io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
        if (k_size_row != k_size_row_ref) {
            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
            return false;
        }

        if (cell_count) {
            // Read and set the keys for the whole cell range
            ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
        }
    }

    if (!this->v_trans) {
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

            // Read type of value
            int32_t v_type_i_ref;
            io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
            const int32_t v_type_i = (int32_t) v_l[il]->type;
            if (v_type_i != v_type_i_ref) {
                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                return false;
            }

            // Read row size of value
            uint64_t v_size_row_ref;
            io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
            const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
            if (v_size_row != v_size_row_ref) {
                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
                return false;
            }

            if (cell_count) {
                // Read and set the values for the whole cell range
                ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
            }
        }
    } else {
        // For each layer, read the values for each cell (transposed)
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

            // Read type of value
            int32_t v_type_i_ref;
            io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
            const int32_t v_type_i = (int32_t) v_l[il]->type;
            if (v_type_i != v_type_i_ref) {
                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                return false;
            }

            // Read element size of value
            uint32_t v_size_el_ref;
            io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
            const size_t v_size_el = ggml_type_size(v_l[il]->type);
            if (v_size_el != v_size_el_ref) {
                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
                return false;
            }

            // Read GQA embedding size
            uint32_t n_embd_v_gqa_ref;
            io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
            if (n_embd_v_gqa != n_embd_v_gqa_ref) {
                LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
                return false;
            }

            if (cell_count) {
                // For each row in the transposed matrix, read the values for the whole cell range
                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                    const size_t dst_offset = (head + j * size) * v_size_el;
                    ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                }
            }
        }
    }

    return true;
}
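The offsets passed to ggml_backend_tensor_set() above follow a simple row-per-cell layout: the restored cells are written directly over the slot starting at the cache head. A minimal standalone sketch of just that arithmetic (example values only, not part of the diff):

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical example values
    const uint64_t head = 10, cell_count = 4, k_size_row = 4096;
    // one ggml_backend_tensor_set() call covers the byte range
    // [head * row_size, (head + cell_count) * row_size)
    const uint64_t offset = head * k_size_row;
    const uint64_t length = cell_count * k_size_row;
    std::printf("write bytes [%llu, %llu)\n",
            (unsigned long long) offset, (unsigned long long) (offset + length));
    return 0;
}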
bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {

//
// llama_kv_cache_recurrent
//

llama_kv_cache_recurrent::llama_kv_cache_recurrent(
        const llama_model & model,
                ggml_type   type_k,
                ggml_type   type_v,
                     bool   offload,
                 uint32_t   kv_size) : hparams(model.hparams) {
    const int32_t n_layer = hparams.n_layer;

    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);

    head = 0;
    size = kv_size;
    used = 0;

    this->type_k = type_k;
    this->type_v = type_v;

    cells.clear();
    cells.resize(kv_size);

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    k_l.reserve(n_layer);
    v_l.reserve(n_layer);

    for (int i = 0; i < n_layer; i++) {
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

        const char * dev_name = "CPU";

        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        if (offload) {
            auto * dev = model.dev_layer(i);
            buft = ggml_backend_dev_buffer_type(dev);

            dev_name = ggml_backend_dev_name(dev);
        }

        LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);

        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        k_l.push_back(k);
        v_l.push_back(v);
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    for (auto it : ctx_map) {
        auto * buft = it.first;
        auto * ctx  = it.second;

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            throw std::runtime_error("failed to allocate buffer for kv cache");
        }
        ggml_backend_buffer_clear(buf, 0);
        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
        bufs.emplace_back(buf);
    }

    {
        const size_t memory_size_k = size_k_bytes();
        const size_t memory_size_v = size_v_bytes();

        LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
    }
}
void llama_kv_cache_recurrent::clear() {
    for (int32_t i = 0; i < (int32_t) size; ++i) {
        cells[i].pos  = -1;
        cells[i].seq_id.clear();
        cells[i].src  = -1;
        cells[i].tail = -1;
    }
    head = 0;
    used = 0;

    for (auto & buf : bufs) {
        ggml_backend_buffer_clear(buf.get(), 0);
    }
}
bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    uint32_t new_head = size;

    if (p0 < 0) {
        p0 = 0;
    }

    if (p1 < 0) {
        p1 = std::numeric_limits<llama_pos>::max();
    }

    // models like Mamba or RWKV can't have a state partially erased
    if (seq_id >= (int64_t) size) {
        // could be fatal
        return false;
    }
    if (0 <= seq_id) {
        int32_t & tail_id = cells[seq_id].tail;
        if (tail_id >= 0) {
            const kv_cell & cell = cells[tail_id];
            // partial intersection is invalid
            if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
                return false;
            }
            // invalidate tails which will be cleared
            if (p0 <= cell.pos && cell.pos < p1) {
                tail_id = -1;
            }
        }
    } else {
        // seq_id is negative, then the range should include everything or nothing
        if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
            return false;
        }
    }

    for (uint32_t i = 0; i < size; ++i) {
        if (cells[i].pos >= p0 && cells[i].pos < p1) {
            if (seq_id < 0) {
                cells[i].seq_id.clear();
            } else if (cells[i].has_seq_id(seq_id)) {
                cells[i].seq_id.erase(seq_id);
            } else {
                continue;
            }
            if (cells[i].is_empty()) {
                // keep count of the number of used cells
                if (cells[i].pos >= 0) {
                    used--;
                }
                cells[i].pos = -1;
                cells[i].src = -1;
                if (new_head == size) {
                    new_head = i;
                }
            }
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    if (new_head != size && new_head < head) {
        head = new_head;
    }

    return true;
}
void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    if (seq_id_src == seq_id_dst) {
        return;
    }

    if (p0 < 0) {
        p0 = 0;
    }

    if (p1 < 0) {
        p1 = std::numeric_limits<llama_pos>::max();
    }

    if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
        kv_cell & tail_src = cells[seq_id_src];
        kv_cell & tail_dst = cells[seq_id_dst];
        if (tail_dst.tail >= 0) {
            // clear destination seq_id if it wasn't empty
            kv_cell & cell_dst = cells[tail_dst.tail];

            cell_dst.seq_id.erase(seq_id_dst);
            tail_dst.tail = -1;
            if (cell_dst.seq_id.empty()) {
                cell_dst.pos = -1;
                cell_dst.src = -1;
                used -= 1;
            }
        }
        if (tail_src.tail >= 0) {
            kv_cell & cell_src = cells[tail_src.tail];

            cell_src.seq_id.insert(seq_id_dst);
            tail_dst.tail = tail_src.tail;
        }
    }
}
void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
    uint32_t new_head = size;

    for (uint32_t i = 0; i < size; ++i) {
        if ((llama_seq_id) i != seq_id) {
            cells[i].tail = -1;
        }

        if (!cells[i].has_seq_id(seq_id)) {
            if (cells[i].pos >= 0) {
                used--;
            }

            cells[i].pos = -1;
            cells[i].src = -1;
            cells[i].seq_id.clear();

            if (new_head == size) {
                new_head = i;
            }
        } else {
            cells[i].seq_id.clear();
            cells[i].seq_id.insert(seq_id);
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    if (new_head != size && new_head < head) {
        head = new_head;
    }
}
void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
    if (delta == 0) {
        return;
    }

    if (p0 < 0) {
        p0 = 0;
    }

    if (p1 < 0) {
        p1 = std::numeric_limits<llama_pos>::max();
    }

    // If there is no range then return early to avoid looping over the cache.
    if (p0 == p1) {
        return;
    }

    // for Mamba-like or RWKV models, only the pos needs to be shifted
    if (0 <= seq_id && seq_id < (int64_t) size) {
        const int32_t tail_id = cells[seq_id].tail;
        if (tail_id >= 0) {
            kv_cell & cell = cells[tail_id];
            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
                cell.pos += delta;
            }
        }
    }
}
void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    if (d == 1) {
        return;
    }

    if (p0 < 0) {
        p0 = 0;
    }

    if (p1 < 0) {
        p1 = std::numeric_limits<llama_pos>::max();
    }

    // If there is no range then return early to avoid looping over the cache.
    if (p0 == p1) {
        return;
    }

    // for Mamba-like or RWKV models, only the pos needs to be changed
    if (0 <= seq_id && seq_id < (int64_t) size) {
        const int32_t tail_id = cells[seq_id].tail;
        if (tail_id >= 0) {
            kv_cell & cell = cells[tail_id];
            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
                cell.pos /= d;
            }
        }
    }
}
llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
    llama_pos result = 0;

    for (uint32_t i = 0; i < size; ++i) {
        if (cells[i].has_seq_id(seq_id)) {
            result = std::max(result, cells[i].pos);
        }
    }

    return result;
}

void llama_kv_cache_recurrent::restore() {
    if (pending.ranges.empty()) {
        return;
    }

    seq_rm(-1, -1, -1);
}

void llama_kv_cache_recurrent::commit() {
    pending.ranges.clear();
}

bool llama_kv_cache_recurrent::update(llama_context & lctx) {
    GGML_UNUSED(lctx);
    return false;
}

void llama_kv_cache_recurrent::defrag_sched(float thold) {
    GGML_UNUSED(thold);
    // noop
}

void llama_kv_cache_recurrent::set_full() {
    n = size;
}
llama_sbatch llama_kv_cache_recurrent::sbatch_init(const llama_batch & batch, bool logits_all) {
    return llama_sbatch(batch, hparams.n_embd, false, logits_all);
}

llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
    if (embd_pooled) {
        // Pooled embeddings cannot be split across ubatches (yet)
        return sbatch.split_seq(n_ubatch);
    }

    return sbatch.split_equal(n_ubatch);
}
bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
    const uint32_t n_tokens = ubatch.n_tokens;
    const uint32_t n_seqs   = ubatch.n_seqs;

    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

    // if we have enough unused cells before the current head ->
    //   better to start searching from the beginning of the cache, hoping to fill it
    if (head > used + 2*n_tokens) {
        head = 0;
    }

    // For recurrent state architectures (like Mamba or RWKV),
    // each cache cell can store the state for a whole sequence.
    // A slot should be always be contiguous.

    // can only process batches with an equal number of new tokens in each sequence
    GGML_ASSERT(ubatch.equal_seqs);

    int32_t min = size - 1;
    int32_t max = 0;

    // everything should fit if all seq_ids are smaller than the max
    for (uint32_t s = 0; s < n_seqs; ++s) {
        const uint32_t n_seq_id = ubatch.n_seq_id[s];
        for (uint32_t j = 0; j < n_seq_id; ++j) {
            const llama_seq_id seq_id = ubatch.seq_id[s][j];

            if (seq_id < 0 || (uint32_t) seq_id >= size) {
                // too big seq_id
                // TODO: would it be possible to resize the cache instead?
                LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
                return false;
            }
            if (j > 0) {
                kv_cell & seq = cells[seq_id];
                if (seq.tail >= 0) {
                    kv_cell & cell = cells[seq.tail];
                    // clear cells from seq_ids that become shared
                    // (should not normally happen, but let's handle it anyway)
                    cell.seq_id.erase(seq_id);
                    seq.tail = -1;
                    if (cell.seq_id.empty()) {
                        cell.pos = -1;
                        cell.src = -1;
                        used -= 1;
                    }
                }
            }
        }
    }

#ifndef NDEBUG
    {
        std::vector<int32_t> tails_verif;
        tails_verif.assign(size, -1);
        for (uint32_t i = 0; i < size; ++i) {
            kv_cell & cell = cells[i];
            for (llama_seq_id seq_id : cell.seq_id) {
                if (tails_verif[seq_id] != -1) {
                    LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
                }
                tails_verif[seq_id] = i;
            }
        }
        for (uint32_t i = 0; i < size; ++i) {
            if (tails_verif[i] != cells[i].tail) {
                LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
            }
        }
    }
#endif

    // find next empty cell
    uint32_t next_empty_cell = head;

    for (uint32_t i = 0; i < size; ++i) {
        if (next_empty_cell >= size) {
            next_empty_cell -= size;
        }
        kv_cell & cell = cells[next_empty_cell];
        if (cell.is_empty()) {
            break;
        }
        next_empty_cell += 1;
    }

    // find usable cell range
    for (uint32_t s = 0; s < n_seqs; ++s) {
        const llama_seq_id seq_id = ubatch.seq_id[s][0];
        kv_cell & seq_meta = cells[seq_id];
        bool has_cell = false;
        if (seq_meta.tail >= 0) {
            kv_cell & cell = cells[seq_meta.tail];
            GGML_ASSERT(cell.has_seq_id(seq_id));
            // does this seq_id "own" the cell?
            if (cell.seq_id.size() == 1) {
                has_cell = true;
            }
        }
        if (!has_cell) {
            kv_cell & empty_cell = cells[next_empty_cell];
            GGML_ASSERT(empty_cell.is_empty());
            // copy old tail into the empty cell
            if (seq_meta.tail >= 0) {
                kv_cell & orig_cell = cells[seq_meta.tail];
                empty_cell.pos = orig_cell.pos;
                empty_cell.src = orig_cell.src;
                orig_cell.seq_id.erase(seq_id);
                empty_cell.seq_id.insert(seq_id); // will be overwritten
            }
            seq_meta.tail = next_empty_cell;
            // find next empty cell
            if (s + 1 < n_seqs) {
                next_empty_cell += 1;
                for (uint32_t i = 0; i < size; ++i) {
                    if (next_empty_cell >= size) {
                        next_empty_cell -= size;
                    }
                    kv_cell & cell = cells[next_empty_cell];
                    if (cell.is_empty()) {
                        break;
                    }
                    next_empty_cell += 1;
                }
            }
        }
        if (min > seq_meta.tail) {
            min = seq_meta.tail;
        }
        if (max < seq_meta.tail) {
            max = seq_meta.tail;
        }
    }

    // gather and re-order
    for (uint32_t s = 0; s < n_seqs; ++s) {
        int32_t dst_id = s + min;
        int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
        if (dst_id != src_id) {
            kv_cell & dst_cell = cells[dst_id];
            kv_cell & src_cell = cells[src_id];

            std::swap(dst_cell.pos,    src_cell.pos);
            std::swap(dst_cell.src,    src_cell.src);
            std::swap(dst_cell.seq_id, src_cell.seq_id);

            // swap tails (assuming they NEVER overlap)
            for (const llama_seq_id seq_id : src_cell.seq_id) {
                cells[seq_id].tail = src_id;
            }
            for (const llama_seq_id seq_id : dst_cell.seq_id) {
                cells[seq_id].tail = dst_id;
            }
        }
    }

    // update the pos of the used seqs
    for (uint32_t s = 0; s < n_seqs; ++s) {
        const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
        int32_t cell_id = s + min;
        kv_cell & cell = cells[cell_id];

        if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
            // What should happen when the pos backtracks or skips a value?
            // Clearing the state mid-batch would require special-casing which isn't done.
            LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
                __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
        }
        cell.pos = last_pos;
        cell.seq_id.clear();
        for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
            const llama_seq_id seq_id = ubatch.seq_id[s][j];
            cell.seq_id.insert(seq_id);
            cells[seq_id].tail = cell_id;
        }
    }

    // allow getting the range of used cells, from head to head + n
    head = min;
    n    = max - min + 1;
    used = std::count_if(cells.begin(), cells.end(),
        [](const kv_cell & cell){ return !cell.is_empty(); });

    // sanity check
    return n >= n_seqs;
}
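As the comments in find_slot() note, a recurrent slot must be contiguous: after the gather/re-order step the per-sequence tail cells occupy the range [min, max], and head and n are derived from it. A small standalone sketch of just that bookkeeping, using hypothetical tail values (not part of the diff):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // hypothetical tail cell indices chosen for three sequences in one ubatch
    const std::vector<int32_t> tails = {7, 5, 6};
    const int32_t mn = *std::min_element(tails.begin(), tails.end());
    const int32_t mx = *std::max_element(tails.begin(), tails.end());
    // mirrors "head = min; n = max - min + 1;" at the end of find_slot()
    std::printf("head = %d, n = %d\n", mn, mx - mn + 1); // head = 5, n = 3
    return 0;
}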
int32_t llama_kv_cache_recurrent::get_n_tokens() const {
    int32_t result = 0;

    for (uint32_t i = 0; i < size; i++) {
        result += cells[i].seq_id.size();
    }

    return result;
}

int32_t llama_kv_cache_recurrent::get_used_cells() const {
    return used;
}

llama_pos llama_kv_cache_recurrent::get_pos_max() const {
    llama_pos pos_max = -1;
    for (const auto & cell : cells) {
        pos_max = std::max(pos_max, cell.pos);
    }

    return pos_max;
}

bool llama_kv_cache_recurrent::get_can_shift() const {
    return false;
}
int32_t llama_kv_cache_recurrent::s_copy(int i) const {
    const uint32_t cell_id = i + head;

    //////////////////////////////////////////////
    // TODO: this should not mutate the KV cache !
    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);

    // prevent out-of-bound sources
    if (cell.src < 0 || (uint32_t) cell.src >= size) {
        cell.src = cell_id;
    }

    int32_t res = cell.src;

    // TODO: do not mutate the KV cache
    // ensure copy only happens once
    if (cell.src != (int32_t) cell_id) {
        cell.src = cell_id;
    }

    return res;
}

float llama_kv_cache_recurrent::s_mask(int i) const {
    const uint32_t cell_id = i + head;

    //////////////////////////////////////////////
    // TODO: this should not mutate the KV cache !
    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);

    float res = (float) (cell.src >= 0);

    // only clear once
    if (cell.src < 0) {
        cell.src = cell_id;
    }

    return res;
}

uint32_t llama_kv_cache_recurrent::cell_max() const {
    for (uint32_t i = size; i > 0; --i) {
        const kv_cell & cell = cells[i - 1];

        if (cell.pos >= 0 && !cell.is_empty()) {
            return i;
        }
    }

    return 0;
}
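A reduced sketch of the behaviour that the const_cast/TODO comments in s_copy() describe, using toy types rather than the real cache: the first call for a cell returns the cell to copy the recurrent state from, and afterwards the cell is marked as its own source so the copy happens only once.

#include <cstdint>
#include <cstdio>

struct toy_cell { int32_t src = -1; };

static int32_t toy_s_copy(toy_cell & cell, uint32_t cell_id, uint32_t size) {
    if (cell.src < 0 || (uint32_t) cell.src >= size) {
        cell.src = cell_id; // out-of-bound source -> copy from itself (no-op)
    }
    const int32_t res = cell.src;
    if (cell.src != (int32_t) cell_id) {
        cell.src = cell_id; // subsequent calls return the cell itself
    }
    return res;
}

int main() {
    toy_cell c;
    c.src = 3; // the state should be copied from cell 3 into this cell (id 5)
    const int32_t first  = toy_s_copy(c, 5, 8);
    const int32_t second = toy_s_copy(c, 5, 8);
    std::printf("%d %d\n", first, second); // prints: 3 5
    return 0;
}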
size_t llama_kv_cache_recurrent::total_size() const {
    size_t size = 0;
    for (const auto & buf : bufs) {
        size += ggml_backend_buffer_get_size(buf.get());
    }

    return size;
}

size_t llama_kv_cache_recurrent::size_k_bytes() const {
    size_t size_k_bytes = 0;

    for (const auto & k : k_l) {
        size_k_bytes += ggml_nbytes(k);
    }

    return size_k_bytes;
}

size_t llama_kv_cache_recurrent::size_v_bytes() const {
    size_t size_v_bytes = 0;

    for (const auto & v : v_l) {
        size_v_bytes += ggml_nbytes(v);
    }

    return size_v_bytes;
}
void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
    uint32_t cell_count = 0;

    // Count the number of cells with the specified seq_id
    // Find all the ranges of cells with this seq id (or all, when -1)
    uint32_t cell_range_begin = size;
    for (uint32_t i = 0; i < size; ++i) {
        const auto & cell = cells[i];
        if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
            ++cell_count;
            if (cell_range_begin == size) {
                cell_range_begin = i;
            }
        } else {
            if (cell_range_begin != size) {
                cell_ranges.emplace_back(cell_range_begin, i);
                cell_range_begin = size;
            }
        }
    }
    if (cell_range_begin != size) {
        cell_ranges.emplace_back(cell_range_begin, size);
    }

    // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
    uint32_t cell_count_check = 0;
    for (const auto & range : cell_ranges) {
        cell_count_check += range.second - range.first;
    }
    GGML_ASSERT(cell_count == cell_count_check);

    io.write(&cell_count, sizeof(cell_count));

    state_write_meta(io, cell_ranges, seq_id);
    state_write_data(io, cell_ranges);
}

void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
    uint32_t cell_count;
    io.read_to(&cell_count, sizeof(cell_count));

    bool res = true;

    res = res && state_read_meta(io, cell_count, seq_id);
    res = res && state_read_data(io, cell_count);

    if (!res) {
        if (seq_id == -1) {
            clear();
        } else {
            seq_rm(seq_id, -1, -1);
        }
        throw std::runtime_error("failed to restore kv cache");
    }
}
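Read together with the state_write_meta()/state_write_data() helpers defined below, state_write() produces the following stream layout. This summary is only a reading aid derived from the io.write() calls in this file (field widths are the sizeof()s used there), not a normative format description:

// uint32_t cell_count
// repeated cell_count times (state_write_meta):
//     llama_pos    pos
//     uint32_t     n_seq_id          (0 when writing a single sequence)
//     llama_seq_id seq_id[n_seq_id]
// state_write_data:
//     uint32_t v_trans               (always 0 for the recurrent cache)
//     uint32_t n_layer
//     per layer: int32_t k_type, uint64_t k_size_row, then the key rows of every cell range
//     per layer: int32_t v_type, uint64_t v_size_row, then the value rows of every cell range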
void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
    for (const auto & range : cell_ranges) {
        for (uint32_t i = range.first; i < range.second; ++i) {
            const auto & cell = cells[i];
            const llama_pos pos      = cell.pos;
            const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;

            io.write(&pos,      sizeof(pos));
            io.write(&n_seq_id, sizeof(n_seq_id));

            if (n_seq_id) {
                for (auto seq_id : cell.seq_id) {
                    io.write(&seq_id, sizeof(seq_id));
                }
            }
        }
    }
}
void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
    const uint32_t v_trans = 0;
    const uint32_t n_layer = hparams.n_layer;

    io.write(&v_trans, sizeof(v_trans));
    io.write(&n_layer, sizeof(n_layer));

    std::vector<uint8_t> tmp_buf;

    // Iterate and write all the keys first, each row is a cell
    // Get whole range at a time
    for (uint32_t il = 0; il < n_layer; ++il) {
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();

        // Write key type
        const int32_t k_type_i = (int32_t)k_l[il]->type;
        io.write(&k_type_i, sizeof(k_type_i));

        // Write row size of key
        const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
        io.write(&k_size_row, sizeof(k_size_row));

        // Read each range of cells of k_size length each into tmp_buf and write out
        for (const auto & range : cell_ranges) {
            const size_t range_size = range.second - range.first;
            const size_t buf_size = range_size * k_size_row;
            io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
        }
    }

    if (!v_trans) {
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

            // Write value type
            const int32_t v_type_i = (int32_t)v_l[il]->type;
            io.write(&v_type_i, sizeof(v_type_i));

            // Write row size of value
            const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
            io.write(&v_size_row, sizeof(v_size_row));

            // Read each range of cells of v_size length each into tmp_buf and write out
            for (const auto & range : cell_ranges) {
                const size_t range_size = range.second - range.first;
                const size_t buf_size = range_size * v_size_row;
                io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
            }
        }
    } else {
        // When v is transposed, we also need the element size and get the element ranges from each row
        const uint32_t kv_size = size;
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

            // Write value type
            const int32_t v_type_i = (int32_t)v_l[il]->type;
            io.write(&v_type_i, sizeof(v_type_i));

            // Write element size
            const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
            io.write(&v_size_el, sizeof(v_size_el));

            // Write GQA embedding size
            io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));

            // For each row, we get the element values of each cell
            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                // Read each range of cells of v_size_el length each into tmp_buf and write out
                for (const auto & range : cell_ranges) {
                    const size_t range_size = range.second - range.first;
                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
                    const size_t buf_size = range_size * v_size_el;
                    io.write_tensor(v_l[il], src_offset, buf_size);
                }
            }
        }
    }
}
bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
    if (dest_seq_id != -1) {
        // single sequence

        seq_rm(dest_seq_id, -1, -1);

        llama_sbatch sbatch;
        llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);

        batch.n_tokens = cell_count;
        batch.n_seq_tokens = cell_count;
        batch.n_seqs = 1;

        for (uint32_t i = 0; i < cell_count; ++i) {
            llama_pos pos;
            uint32_t  n_seq_id;

            io.read_to(&pos,      sizeof(pos));
            io.read_to(&n_seq_id, sizeof(n_seq_id));

            if (n_seq_id != 0) {
                LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
                return false;
            }

            batch.pos[i] = pos;
        }
        batch.n_seq_id[0] = 1;
        batch.seq_id[0] = &dest_seq_id;
        if (!find_slot(batch)) {
            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
            return false;
        }
        commit();

        // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
        // Assume that this is one contiguous block of cells
        GGML_ASSERT(head + cell_count <= size);
        GGML_ASSERT(cells[head].pos == batch.pos[0]);
        GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
        GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
        GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
    } else {
        // whole KV cache restore

        if (cell_count > size) {
            LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
            return false;
        }

        clear();

        for (uint32_t i = 0; i < cell_count; ++i) {
            kv_cell & cell = cells[i];

            llama_pos pos;
            uint32_t  n_seq_id;

            io.read_to(&pos,      sizeof(pos));
            io.read_to(&n_seq_id, sizeof(n_seq_id));

            cell.pos = pos;

            for (uint32_t j = 0; j < n_seq_id; ++j) {
                llama_seq_id seq_id;
                io.read_to(&seq_id, sizeof(seq_id));

                // TODO: llama_kv_cache_recurrent should have a notion of max sequences
                //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
                if (seq_id < 0) {
                    //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
                    LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
                    return false;
                }

                cell.seq_id.insert(seq_id);

                int32_t & tail = cells[seq_id].tail;
                if (tail != -1) {
                    LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
                    return false;
                }
                tail = i;
            }
        }

        head = 0;
        used = cell_count;
    }

    for (uint32_t i = 0; i < cell_count; ++i) {
        uint32_t cell_id = head + i;
        // make sure the recurrent states will keep their restored state
        cells[cell_id].src = cell_id;
    }

    return true;
}
bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
    uint32_t v_trans;
    uint32_t n_layer;
    io.read_to(&v_trans, sizeof(v_trans));
...
...
@@ -1157,7 +2253,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
        LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
        return false;
    }

    if (v_trans != (bool) v_trans) {
    if (false != (bool) v_trans) {
        LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
        return false;
    }
...
...
@@ -1309,7 +2405,7 @@ void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache
        view->cells_sequences = (llama_seq_id *)p;
    }

    const std::vector<llama_kv_cell> & kv_cells = kvu->cells;
    const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
    llama_kv_cache_view_cell * c_curr = view->cells;
    llama_seq_id * cs_curr = view->cells_sequences;
    int32_t used_cells = 0;
...
...
llama/llama.cpp/src/llama-kv-cache.h
View file @ 0cefd46f
...
...
@@ -2,32 +2,72 @@
#include "llama.h"
#include "llama-io.h"
#include "llama-graph.h"
#include "llama-memory.h"
#include "ggml-cpp.h"
#include <functional>
#include <set>
#include <vector>
struct
llama_cparams
;
struct
llama_hparams
;
struct
llama_ubatch
;
struct
llama_sbatch
;
struct
llama_model
;
struct
llama_context
;
struct
llama_kv_cache
:
public
llama_memory_i
{
using
llama_memory_i
::
llama_memory_i
;
virtual
~
llama_kv_cache
()
=
default
;
virtual
void
restore
()
=
0
;
// call if batch processing fails - restores the cache state
virtual
void
commit
()
=
0
;
// call after successful batch processing - clears any pending state
// call if batch processing fails - restores the cache state
virtual
void
restore
()
=
0
;
virtual
int32_t
get_n_tokens
()
const
=
0
;
virtual
int32_t
get_used_cells
()
const
=
0
;
// TODO: remove, this is too-specific to the unified cache
// call after successful batch processing - clears any pending state
virtual
void
commit
()
=
0
;
virtual
bool
get_can_shift
()
const
=
0
;
// process any pending defrag/shift/etc. operations
// optionally call once before processing a new batch
virtual
bool
update
(
llama_context
&
lctx
)
=
0
;
// schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
virtual
void
defrag_sched
(
float
thold
)
=
0
;
// simulate full cache, used for allocating worst-case compute buffers
virtual
void
set_full
()
=
0
;
//
// batch processing
//
virtual
llama_sbatch
sbatch_init
(
const
llama_batch
&
batch
,
bool
logits_all
)
=
0
;
// different KV caches require different batch splitting strategies
virtual
llama_ubatch
ubatch_next
(
llama_sbatch
&
sbatch
,
uint32_t
n_ubatch
,
bool
embd_pooled
)
const
=
0
;
// find an empty slot of size "n_tokens" in the cache
virtual
bool
find_slot
(
const
llama_ubatch
&
batch
)
=
0
;
// getters
virtual
int32_t
get_n_tokens
()
const
=
0
;
virtual
int32_t
get_used_cells
()
const
=
0
;
// TODO: remove, this is too-specific to the unified cache
virtual
llama_pos
get_pos_max
()
const
=
0
;
virtual
bool
get_can_shift
()
const
=
0
;
bool
get_can_edit
()
const
override
{
return
get_can_shift
();
}
//
// state write/read
//
virtual
void
state_write
(
llama_io_write_i
&
io
,
llama_seq_id
seq_id
=
-
1
)
const
=
0
;
virtual
void
state_read
(
llama_io_read_i
&
io
,
llama_seq_id
seq_id
=
-
1
)
=
0
;
};
//
// llama_kv_cache_guard
//
struct
llama_kv_cache_guard
{
llama_kv_cache_guard
(
llama_kv_cache
*
kv
)
:
kv
(
kv
)
{}
...
...
@@ -42,7 +82,7 @@ struct llama_kv_cache_guard {
private:
    llama_kv_cache * kv;
};
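The body of the guard is collapsed in this diff; based on the commit()/restore() contract declared on llama_kv_cache above, a typical caller-side sketch would look like the following, assuming the guard forwards commit() and calls restore() from its destructor (a hypothetical illustration, not part of this change):

// hypothetical caller sketch
void decode_one_batch(llama_kv_cache & kv /*, ... */) {
    llama_kv_cache_guard kv_guard(&kv);

    // ... split the batch, find_slot(), run the compute graph ...
    // any early return before commit() lets the guard's destructor
    // call kv.restore() and roll back the pending slot ranges

    kv_guard.commit(); // success: clear pending state so restore() becomes a no-op
}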
// block of KV slots to move when defragging
struct llama_kv_defrag_move {
    uint32_t src;
...
...
@@ -50,65 +90,50 @@ struct llama_kv_defrag_move {
    uint32_t len;
};

struct llama_kv_cell {
    llama_pos pos   = -1;
    llama_pos delta =  0;
    int32_t   src   = -1; // used by recurrent state models to copy states
    int32_t   tail  = -1;

//
// llama_kv_cache_unified
//

    std::set<llama_seq_id> seq_id;

// TODO: add notion of max sequences
class llama_kv_cache_unified : public llama_kv_cache {
public:
    struct kv_cell {
        llama_pos pos   = -1;
        llama_pos delta =  0;

    bool has_seq_id(const llama_seq_id & id) const {
        return seq_id.find(id) != seq_id.end();
    }

        std::set<llama_seq_id> seq_id;

    bool is_empty() const {
        return seq_id.empty();
    }

        bool has_seq_id(const llama_seq_id & id) const {
            return seq_id.find(id) != seq_id.end();
        }

    bool is_same_seq(const llama_kv_cell & other) const {
        return seq_id == other.seq_id;
    }
};

        bool is_empty() const {
            return seq_id.empty();
        }

// ring-buffer of cached KV data
// TODO: pimpl
// TODO: add notion of max sequences
class llama_kv_cache_unified : public llama_kv_cache {
public:
    // can be used to query data from the model if needed
    struct callbacks {
        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;

        bool is_same_seq(const kv_cell & other) const {
            return seq_id == other.seq_id;
        }
    };

    llama_kv_cache_unified(
            const llama_hparams & hparams,
            callbacks cbs);

    static uint32_t get_padding(const llama_cparams & cparams);

    virtual ~llama_kv_cache_unified() = default;

    // TODO: become constructor
    bool init(
            const llama_model & model,   // TODO: do not reference the model
            const llama_cparams & cparams,
    llama_kv_cache_unified(
            const llama_model & model,
                    ggml_type   type_k,
                    ggml_type   type_v,
                         bool   v_trans,
                         bool   offload,
                     uint32_t   kv_size,
                         bool   offload);
                     uint32_t   padding);

    int32_t get_n_tokens()   const override;
    int32_t get_used_cells() const override;

    ~llama_kv_cache_unified() = default;

    size_t total_size() const;

    // TODO: better data structures to reduce the cost of this operation
    llama_pos pos_max() const;

    //
    // llama_memory_i
    //

    void clear() override;
    void defrag() override;

    virtual void restore() override;
    virtual void commit() override;

    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
...
...
@@ -118,25 +143,76 @@ public:
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

    bool get_can_shift() const override;

    //
    // llama_kv_cache
    //

    void restore() override;
    void commit()  override;

    bool update(llama_context & ctx) override;

    void defrag_sched(float thold) override;

    void set_full() override;

    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;

    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;

    // find an empty slot of size "n_tokens" in the cache
    // updates the cache head
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
    bool find_slot(const llama_ubatch & batch);
    bool find_slot(const llama_ubatch & batch) override;

    // TODO: maybe not needed
    uint32_t get_padding(const llama_cparams & cparams) const;

    int32_t get_n_tokens()   const override;
    int32_t get_used_cells() const override;

    // find how many cells are currently in use
    uint32_t cell_max() const;

    // TODO: better data structures to reduce the cost of this operation
    llama_pos get_pos_max() const override;

    size_t size_k_bytes() const;
    size_t size_v_bytes() const;

    bool get_can_shift() const override;

    // defrag
    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_impl also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;
    uint32_t used = 0; // used cells (i.e. at least one seq_id)

    // computed before each graph build
    uint32_t n = 0;

    std::vector<kv_cell> cells;

    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;

private:
    const llama_model & model;
    const llama_hparams & hparams;

    bool has_shift = false;
    bool do_defrag = false;

    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;

    // required padding
    uint32_t padding = 1;

    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // defrag
    struct {
        std::vector<llama_kv_defrag_move> moves;
    } defrag_info;
...
...
@@ -145,7 +221,6 @@ public:
    bool defrag_prepare(int32_t n_max_nodes);

    // commit/restore cache
    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
...
...
@@ -156,25 +231,125 @@ public:
        std::vector<slot_range> ranges;
    } pending;

    // state write/load
    // find how many cells are currently in use
    uint32_t cell_max() const;

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);

    size_t total_size() const;

    // members
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;

    const llama_hparams & hparams;

    ggml_tensor * build_rope_shift(
            const llama_cparams & cparams,
                   ggml_context * ctx,
                    ggml_tensor * cur,
                    ggml_tensor * shift,
                    ggml_tensor * factors,
                          float   freq_base,
                          float   freq_scale) const;

    llm_graph_result_ptr build_graph_shift(
            const llama_cparams & cparams,
                   ggml_context * ctx,
                    ggml_cgraph * gf) const;

    llm_graph_result_ptr build_graph_defrag(
            const llama_cparams & cparams,
                   ggml_context * ctx,
                    ggml_cgraph * gf,
            const std::vector<llama_kv_defrag_move> & moves) const;

    callbacks cbs;

    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;

    bool has_shift = false;
    bool do_defrag = false;

    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};

    // TODO: remove this and implement llama_kv_cache_recurrent instead
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token

//
// llama_kv_cache_recurrent
//

    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;

class llama_kv_cache_recurrent : public llama_kv_cache {
public:
    struct kv_cell {
        llama_pos pos  = -1;
        int32_t   src  = -1; // used to copy states
        int32_t   tail = -1;

        std::set<llama_seq_id> seq_id;

        bool has_seq_id(const llama_seq_id & id) const {
            return seq_id.find(id) != seq_id.end();
        }

        bool is_empty() const {
            return seq_id.empty();
        }

        bool is_same_seq(const kv_cell & other) const {
            return seq_id == other.seq_id;
        }
    };

    llama_kv_cache_recurrent(
            const llama_model & model,
                    ggml_type   type_k,
                    ggml_type   type_v,
                         bool   offload,
                     uint32_t   kv_size);

    ~llama_kv_cache_recurrent() = default;

    //
    // llama_memory_i
    //

    void clear() override;

    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
    void seq_keep(llama_seq_id seq_id)                                                          override;
    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;

    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

    //
    // llama_kv_cache
    //

    void restore() override;
    void commit()  override;

    bool update(llama_context & lctx) override;

    void defrag_sched(float thold) override;

    void set_full() override;

    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;

    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;

    bool find_slot(const llama_ubatch & batch) override;

    int32_t get_n_tokens()   const override;
    int32_t get_used_cells() const override;

    // TODO: better data structures to reduce the cost of this operation
    llama_pos get_pos_max() const override;

    bool get_can_shift() const override;

    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
    int32_t s_copy(int i) const;
    float   s_mask(int i) const;

    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_impl also uses it, so it
...
...
@@ -186,18 +361,41 @@ public:
    // computed before each graph build
    uint32_t n = 0;

    std::vector<llama_kv_cell> cells;
    std::vector<kv_cell> cells;

    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;

private:
    //const llama_model & model;
    const llama_hparams & hparams;

    // commit/restore cache
    // TODO: rework for recurrent cache
    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
    };

    // pending cell updates that are not yet committed
    struct {
        std::vector<slot_range> ranges;
    } pending;

    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // find how many cells are currently in use
    uint32_t cell_max() const;

    size_t total_size() const;

    size_t size_k_bytes() const;
    size_t size_v_bytes() const;

    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
...
...
@@ -205,11 +403,6 @@ private:
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};

// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
//public:
//    using llama_kv_cache_unified::llama_kv_cache_unified;
//};

//
// kv cache view
...
...
llama/llama.cpp/src/llama-memory.h
View file @ 0cefd46f
...
...
@@ -2,12 +2,22 @@
#include "llama.h"
struct
llama_memory_params
{
// kv cache
ggml_type
type_k
;
ggml_type
type_v
;
// parameters for other types of memory
// ...
};
// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
class
llama_memory_i
{
public:
virtual
~
llama_memory_i
()
=
default
;
virtual
void
clear
()
=
0
;
virtual
void
defrag
()
=
0
;
virtual
bool
seq_rm
(
llama_seq_id
seq_id
,
llama_pos
p0
,
llama_pos
p1
)
=
0
;
virtual
void
seq_cp
(
llama_seq_id
seq_id_src
,
llama_seq_id
seq_id_dst
,
llama_pos
p0
,
llama_pos
p1
)
=
0
;
...
...