OpenDAS / ollama — commit 0cefd46f (unverified)

Authored May 12, 2025 by Jeffrey Morgan; committed via GitHub on May 12, 2025.

llama: update to commit de4c07f93 (#10655)

Parent commit: ad035ad5
113 files changed in total. This page shows the first 20 changed files, with 2426 additions and 1109 deletions (+2426 / -1109).
Makefile.sync                              +1     -1
llama/build-info.cpp                       +1     -1
llama/llama.cpp/.rsync-filter              +5     -5
llama/llama.cpp/common/common.cpp          +18    -1
llama/llama.cpp/common/common.h            +13    -5
llama/llama.cpp/common/sampling.cpp        +57    -50
llama/llama.cpp/include/llama.h            +51    -10
llama/llama.cpp/src/llama-adapter.cpp      +6     -0
llama/llama.cpp/src/llama-batch.cpp        +5     -1
llama/llama.cpp/src/llama-batch.h          +2     -1
llama/llama.cpp/src/llama-chat.cpp         +17    -7
llama/llama.cpp/src/llama-chat.h           +1     -0
llama/llama.cpp/src/llama-context.cpp      +401   -476
llama/llama.cpp/src/llama-context.h        +44    -33
llama/llama.cpp/src/llama-cparams.h        +2     -1
llama/llama.cpp/src/llama-graph.cpp        +20    -38
llama/llama.cpp/src/llama-graph.h          +12    -8
llama/llama.cpp/src/llama-kv-cache.cpp     +1485  -389
llama/llama.cpp/src/llama-kv-cache.h       +274   -81
llama/llama.cpp/src/llama-memory.h         +11    -1
Makefile.sync

 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5
+FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618

 .PHONY: help
 help:
...
llama/build-info.cpp

 int LLAMA_BUILD_NUMBER = 0;
-char const * LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
+char const * LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
 char const * LLAMA_COMPILER = "";
 char const * LLAMA_BUILD_TARGET = "";
llama/llama.cpp/.rsync-filter

@@ -10,11 +10,11 @@ include common/stb_image.*
 include include/
 include include/llama.*
 include include/llama-*.*
-include examples/
-include examples/llava/
-include examples/llava/clip.*
-include examples/llava/clip-impl.*
-include examples/llava/llava.*
+include tools/
+include tools/mtmd/
+include tools/mtmd/clip.*
+include tools/mtmd/clip-impl.*
+include tools/mtmd/llava.*
 include src/
 include src/llama.*
 include src/llama-*.*
...
llama/llama.cpp/common/common.cpp

@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                     params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
...
@@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn  = params.flash_attn;
     cparams.no_perf     = params.no_perf;
+    cparams.op_offload  = !params.no_op_offload;

     if (params.reranking) {
         cparams.embeddings = true;
...
@@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result  = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data  (result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
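Note: the new common_opt_dataset_init helper slices a flat token stream into (input, label) windows shifted by one token — the usual next-token-prediction setup. A minimal sketch of how a caller might feed it is shown below; the file path, the use of common_tokenize, and the non-overlapping stride are illustrative assumptions, not part of this commit.

    // Hypothetical usage sketch: build a ggml-opt dataset from a tokenized text file.
    #include "common.h"

    #include <fstream>
    #include <sstream>
    #include <vector>

    static ggml_opt_dataset_t build_dataset(llama_context * ctx, const std::string & path) {
        std::ifstream f(path);
        std::stringstream ss;
        ss << f.rdbuf();

        // common_tokenize is an existing helper declared in common.h
        std::vector<llama_token> tokens = common_tokenize(ctx, ss.str(), /*add_special=*/true);

        // one datapoint per context window; a stride of one full context gives
        // non-overlapping windows (an arbitrary choice for this sketch)
        const int64_t stride = llama_n_ctx(ctx);
        return common_opt_dataset_init(ctx, tokens, stride);
    }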
llama/llama.cpp/common/common.h

@@ -66,7 +66,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
...
@@ -96,6 +95,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
     COMMON_SAMPLER_TYPE_PENALTIES   = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };

 // dimensionality reduction methods, used by cvector-generator
...
@@ -161,6 +161,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
...
@@ -323,7 +324,6 @@ struct common_params {
     bool ctx_shift        = true;  // context shift on inifinite text generation
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all       = false; // return logits for all tokens in the batch
     bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory
     bool verbose_prompt   = false; // print prompt tokens before generation
...
@@ -332,6 +332,7 @@ struct common_params {
     bool no_kv_offload    = false; // disable KV offloading
     bool warmup           = true;  // warmup run
     bool check_tensors    = false; // validate tensor data
+    bool no_op_offload    = false; // globally disable offload host tensor operations to device

     bool single_turn      = false; // single turn chat conversation
...
@@ -340,7 +341,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
     bool mmproj_use_gpu = true;  // use GPU for multimodal model
     bool no_mmproj      = false; // explicitly disable multimodal model
...
@@ -409,13 +410,14 @@ struct common_params {
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
+    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization

     // cvector-generator params
     int n_pca_batch      = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill
...
@@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
llama/llama.cpp/common/sampling.cpp

 #include "sampling.h"

 #include "common.h"
 #include "log.h"

 #include <cmath>
 #include <unordered_map>
...
@@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.logit_bias.data()));

     if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp        (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-        } else {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
-                        {
-                            std::vector<const char *> c_breakers;
-                            c_breakers.reserve(params.dry_sequence_breakers.size());
-                            for (const auto & str : params.dry_sequence_breakers) {
-                                c_breakers.push_back(str.c_str());
-                            }
-                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                        }
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p(params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p(params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical(params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill(vocab));
-                        break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
-            }
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_DRY:
+                    {
+                        std::vector<const char *> c_breakers;
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
+                        for (const auto & str : params.dry_sequence_breakers) {
+                            c_breakers.push_back(str.c_str());
+                        }
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    }
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p(params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma(params.top_n_sigma));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p(params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical(params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill(vocab));
+                    break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
+            }
         }
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
...
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
...
@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
...
@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "dry",         COMMON_SAMPLER_TYPE_DRY },
         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
...
@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
         { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
         { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
         { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
...
@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         auto sampler = sampler_canonical_name_map.find(name);
         if (sampler != sampler_canonical_name_map.end()) {
             samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
-                }
-            }
+            continue;
+        }
+        if (allow_alt_names) {
+            sampler = sampler_alt_name_map.find(name);
+            if (sampler != sampler_alt_name_map.end()) {
+                samplers.push_back(sampler->second);
+                continue;
+            }
         }
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
     }

     return samplers;
...
@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
...
@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         const auto sampler = sampler_name_map.find(c);
         if (sampler != sampler_name_map.end()) {
             samplers.push_back(sampler->second);
+        } else {
+            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
         }
     }
...
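Note: with this change top-n-sigma is no longer a special case that bypasses the configured sampler order; it is just another entry in the chain. A small sketch of building an equivalent chain directly against the public llama.h sampler API (parameter values are made up for illustration):

    // Illustrative only: a llama_sampler chain that includes the top-n-sigma step
    // alongside the usual samplers, mirroring the flattened switch above.
    #include "llama.h"

    static llama_sampler * make_chain() {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(1.5f)); // assumed threshold
        llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

        return chain; // caller releases it with llama_sampler_free()
    }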
llama/llama.cpp/include/llama.h

@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
+#include "ggml-opt.h"

 #include <stddef.h>
 #include <stdint.h>
...
@@ -112,6 +113,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
     };

     enum llama_rope_type {
...
@@ -352,20 +354,19 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

-        // Abort callback
-        // if it returns true, execution of llama_decode() will be aborted
-        // currently works only with CPU execution
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
+        bool op_offload;  // whether to offload host tensor operations to device
         bool cross_attn;  // whether to use cross attention
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };

     // model quantization parameters
...
@@ -447,6 +448,10 @@ extern "C" {
             size_t n_paths,
             struct llama_model_params params);

+    LLAMA_API void llama_model_save_to_file(
+            const struct llama_model * model,
+                          const char * path_model);
+
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");
...
@@ -930,14 +935,19 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);

-    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
-    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    // Process a batch of tokens.
+    // In contrast to llama_decode() - this call does not use KV cache.
+    // For encode-decoder contexts, processes the batch using the encoder.
+    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     // 0 - success
     // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(struct llama_context * ctx, struct llama_batch batch);

+    // Process a batch of tokens.
+    // Requires KV cache.
+    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
...
@@ -1434,6 +1444,37 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

+    //
+    // training
+    //
+
+    // function that returns whether or not a given tensor contains trainable parameters
+    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+
+    // always returns true
+    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+
+    struct llama_opt_params {
+        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+    };
+
+    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+    LLAMA_API void llama_opt_epoch(
+            struct llama_context    * lctx,
+            ggml_opt_dataset_t        dataset,
+            ggml_opt_result_t         result_train,
+            ggml_opt_result_t         result_eval,
+            int64_t                   idata_split,
+            ggml_opt_epoch_callback   callback_train,
+            ggml_opt_epoch_callback   callback_eval);
+
 #ifdef __cplusplus
 }
 #endif
...
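Note: together with common_opt_dataset_init above, the new llama_opt_* entry points suggest a fine-tuning loop along the following lines. This is a sketch under the assumption that the usual ggml-opt helpers (ggml_opt_get_default_optimizer_params, ggml_opt_result_init/free, ggml_opt_dataset_ndata) are available as in upstream ggml; it is not code from this commit.

    // Hypothetical fine-tuning loop wiring the new llama_opt_* API to a dataset
    // built with common_opt_dataset_init (see common.cpp above).
    #include "llama.h"

    static void train(llama_context * lctx, llama_model * model, ggml_opt_dataset_t dataset) {
        llama_opt_params params = {};
        params.n_ctx_train  = 0;                                     // 0 = use the context size of lctx
        params.param_filter = llama_opt_param_filter_all;            // train every tensor
        params.get_opt_pars = ggml_opt_get_default_optimizer_params; // assumed ggml-opt default

        llama_opt_init(lctx, model, params);

        // hold out the last ~10% of datapoints for evaluation (arbitrary split)
        const int64_t ndata       = ggml_opt_dataset_ndata(dataset);
        const int64_t idata_split = ndata * 9 / 10;

        for (int epoch = 0; epoch < 2; ++epoch) {
            ggml_opt_result_t result_train = ggml_opt_result_init();
            ggml_opt_result_t result_eval  = ggml_opt_result_init();

            llama_opt_epoch(lctx, dataset, result_train, result_eval, idata_split,
                            /*callback_train=*/nullptr, /*callback_eval=*/nullptr);

            ggml_opt_result_free(result_train);
            ggml_opt_result_free(result_eval);
        }
    }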
llama/llama.cpp/src/llama-adapter.cpp

@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
...
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                }
                 buft = ggml_backend_dev_buffer_type(cpu_dev);

                 break;
...
llama/llama.cpp/src/llama-batch.cpp

@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
...
@@ -203,6 +203,7 @@
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
...
@@ -212,6 +213,7 @@
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
...
@@ -239,6 +241,7 @@
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;
...
@@ -262,6 +265,7 @@
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
...
llama/llama.cpp/src/llama-batch.h

@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
...
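Note: call sites now build an sbatch in one step instead of default-constructing and then calling from_batch. A small illustrative sketch (not taken from this diff):

    // Illustrative call site; inside llama.cpp the batch would come from llama_decode().
    #include "llama-batch.h"

    static llama_sbatch make_sbatch(const llama_batch & batch, size_t n_embd) {
        // previously: llama_sbatch s; s.from_batch(batch, n_embd, true, false); return s;
        return llama_sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);
    }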
llama/llama.cpp/src/llama-chat.cpp

@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3",              LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4",              LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3 },
...
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
             } else {
-                ss << " " << content << "</s>";
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
...
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
...
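Note: the only difference between the v7 and v7-tekken variants is the space after the [SYSTEM_PROMPT]/[INST] markers. A hedged sketch of rendering the new variant through the public llama_chat_apply_template API, assuming the short template name is accepted there as it is for the other built-in templates:

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main() {
        const llama_chat_message msgs[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello" },
        };

        std::vector<char> buf(1024);
        const int32_t n = llama_chat_apply_template("mistral-v7-tekken", msgs, 2,
                                                    /*add_ass=*/false, buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            // expected (no space after the markers):
            // [SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]
            printf("%.*s\n", n, buf.data());
        }
        return 0;
    }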
llama/llama.cpp/src/llama-chat.h

@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
...
llama/llama.cpp/src/llama-context.cpp

(Diff collapsed in this view: 401 additions, 476 deletions.)
llama/llama.cpp/src/llama-context.h

@@ -8,6 +8,7 @@
 #include "llama-kv-cache.h"

 #include "ggml-cpp.h"
+#include "ggml-opt.h"

 #include <map>
 #include <vector>
...
@@ -28,7 +29,12 @@ struct llama_context {
     void synchronize();

-    const llama_model & get_model() const;
+    const llama_model   & get_model()   const;
+    const llama_cparams & get_cparams() const;
+
+    ggml_backend_sched_t get_sched() const;
+
+    ggml_context * get_ctx_compute() const;

     uint32_t n_ctx()         const;
     uint32_t n_ctx_per_seq() const;
...
@@ -130,6 +136,32 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();

+    //
+    // training
+    //
+
+    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+    void opt_epoch(
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,
+            ggml_opt_result_t       result_eval,
+            int64_t                 idata_split,
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    void opt_epoch_iter(
+            ggml_opt_dataset_t               dataset,
+            ggml_opt_result_t                result,
+            const std::vector<llama_token> & tokens,
+            const std::vector<llama_token> & labels_sparse,
+            llama_batch                    & batch,
+            ggml_opt_epoch_callback          callback,
+            bool                             train,
+            int64_t                          idata_in_loop,
+            int64_t                          ndata_in_loop,
+            int64_t                          t_loop_start);
+
 private:
     //
     // output
...
@@ -139,50 +171,30 @@ private:
     // Returns max number of outputs for which space was reserved.
     int32_t output_reserve(int32_t n_outputs);

     // make the outputs have the same order they had in the user-provided batch
     // TODO: maybe remove this
     void output_reorder();

     //
     // graph
     //

+public:
     int32_t graph_max_nodes() const;

     // zero-out inputs and create the ctx_compute for the compute graph
     ggml_cgraph * graph_init();

+    // returns the result of ggml_backend_sched_graph_compute_async execution
+    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
+
+private:
     llm_graph_result_ptr graph_build(
             ggml_context * ctx,
             ggml_cgraph  * gf,
             const llama_ubatch & ubatch,
             llm_graph_type gtype);

-    // returns the result of ggml_backend_sched_graph_compute_async execution
-    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
-
     llm_graph_cb graph_get_cb() const;

-    // used by kv_self_update()
-    ggml_tensor * build_rope_shift(
-            ggml_context * ctx0,
-            ggml_tensor  * cur,
-            ggml_tensor  * shift,
-            ggml_tensor  * factors,
-            float          freq_base,
-            float          freq_scale) const;
-
-    llm_graph_result_ptr build_kv_self_shift(
-            ggml_context * ctx0,
-            ggml_cgraph  * gf) const;
-
-    llm_graph_result_ptr build_kv_self_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph  * gf,
-            const std::vector<struct llama_kv_defrag_move> & moves) const;
-
     // TODO: read/write lora adapters and cvec
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);
...
@@ -199,14 +211,10 @@ private:
     llama_cparams       cparams;
     llama_adapter_cvec  cvec;
     llama_adapter_loras loras;
-    llama_sbatch        sbatch;

     llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

-    std::unique_ptr<llama_kv_cache_unified> kv_self;
-
-    // TODO: remove
-    bool logits_all = false;
+    std::unique_ptr<llama_memory_i> memory;

     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
...
@@ -233,6 +241,9 @@ private:
     ggml_context_ptr ctx_compute;

+    // training
+    ggml_opt_context_t opt_ctx = nullptr;
+
     ggml_threadpool_t threadpool       = nullptr;
     ggml_threadpool_t threadpool_batch = nullptr;
...
llama/llama.cpp/src/llama-cparams.h

@@ -29,8 +29,9 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
-    bool cross_attn;
     bool warmup;
+    bool op_offload;
+    bool cross_attn;

     enum llama_pooling_type pooling_type;
...
llama/llama.cpp/src/llama-graph.cpp

@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
         for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-            // prevent out-of-bound sources
-            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
-                kv_cell.src = cell_id;
-            }
-
-            data[i] = kv_cell.src;
-
-            // TODO: do not mutate the KV cache
-            // ensure copy only happens once
-            if (kv_cell.src != (int32_t) cell_id) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_copy(i);
         }
     }
 }
...
@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
         // clear unused states
         for (int i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-            data[i] = (float) (kv_cell.src >= 0);
-
-            // only clear once
-            if (kv_cell.src < 0) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_mask(i);
         }
     }
 }
...
@@ -816,7 +788,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
     }

-    if (type_gate == LLM_FFN_PAR) {
+    if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
     }
...
@@ -1005,6 +977,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         //cb(inp->tokens, "inp_tokens", -1);
         ggml_set_input(inp->tokens);
+        res->t_tokens = inp->tokens;

         cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
...
@@ -1111,7 +1084,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

     auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
...
@@ -1128,7 +1101,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

     auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
...
@@ -1261,8 +1234,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

         if (v_mla) {
+#if 0
+            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
             cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
+#else
+            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+            // The permutations are noops and only change how the tensor data is interpreted.
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_mul_mat(ctx0, v_mla, cur);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
+#endif
         }

         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
...
@@ -1442,8 +1426,6 @@ ggml_tensor * llm_graph_context::build_attn(
     // store to KV cache
     {
-        GGML_ASSERT(!kv_self->recurrent);
-
         const auto kv_head = kv_self->head;

         GGML_ASSERT(kv_self->size == n_ctx);
...
@@ -1612,7 +1594,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
             int32_t   n_state,
             int32_t   n_seqs) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

     const auto n_kv    = kv_self->n;
     const auto kv_head = kv_self->head;
...
@@ -1644,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

     const auto token_shift_count = hparams.token_shift_count;
...
@@ -1665,7 +1647,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

     const auto token_shift_count = hparams.token_shift_count;

     const auto n_embd = hparams.n_embd;
...
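Note: the new build_attn_mha branch keeps the MLA projection as a matrix-matrix multiply with n_tokens in dimension 1, using permutes that only reinterpret the data layout. The following standalone ggml sketch shows just that tensor-shape bookkeeping; it is not the actual attention code, and all sizes are made up for illustration.

    // Shape-only illustration of the permute / mul_mat / permute / cont pattern.
    #include "ggml.h"

    int main() {
        ggml_init_params ip;
        ip.mem_size   = 64*1024*1024;
        ip.mem_buffer = nullptr;
        ip.no_alloc   = false;
        ggml_context * ctx = ggml_init(ip);

        const int n_lora = 512, n_embd_head_v = 128, n_head = 8, n_tokens = 4; // hypothetical sizes

        ggml_tensor * cur   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_lora, n_head, n_tokens);
        ggml_tensor * v_mla = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd_head_v);

        cur = ggml_permute(ctx, cur, 0, 2, 1, 3); // [n_lora, n_tokens, n_head] - n_tokens now in dim 1
        cur = ggml_mul_mat(ctx, v_mla, cur);      // [n_embd_head_v, n_tokens, n_head]
        cur = ggml_permute(ctx, cur, 0, 2, 1, 3); // [n_embd_head_v, n_head, n_tokens]
        cur = ggml_cont(ctx, cur);                // make contiguous before the final reshape
        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);

        ggml_free(ctx);
        return 0;
    }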
llama/llama.cpp/src/llama-graph.h

@@ -19,6 +19,7 @@ struct llama_cparams;

 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_recurrent;

 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
...
@@ -187,26 +188,26 @@ public:
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_s_copy() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * s_copy; // I32 [kv_size]

-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };

 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_s_mask() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * s_mask; // F32 [1, n_kv]

-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {
...
@@ -308,6 +309,7 @@ class llm_graph_result_i {
 public:
     virtual ~llm_graph_result_i() = default;

+    virtual ggml_tensor * get_tokens()      = 0;
     virtual ggml_tensor * get_logits()      = 0;
     virtual ggml_tensor * get_embd()        = 0;
     virtual ggml_tensor * get_embd_pooled() = 0;
...
@@ -322,6 +324,7 @@ class llm_graph_result : public llm_graph_result_i {
 public:
     virtual ~llm_graph_result() = default;

+    ggml_tensor * get_tokens()      override { return t_tokens; }
     ggml_tensor * get_logits()      override { return t_logits; }
     ggml_tensor * get_embd()        override { return t_embd; }
     ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
...
@@ -338,6 +341,7 @@ public:
     }

     // important graph nodes
+    ggml_tensor * t_tokens      = nullptr;
     ggml_tensor * t_logits      = nullptr;
     ggml_tensor * t_embd        = nullptr;
     ggml_tensor * t_embd_pooled = nullptr;
...
@@ -361,8 +365,8 @@ struct llm_graph_params {
     const llama_cparams & cparams;
     const llama_ubatch  & ubatch;

-    ggml_backend_sched * sched;
-    ggml_backend * backend_cpu;
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;

     const llama_adapter_cvec  * cvec;
     const llama_adapter_loras * loras;
...
@@ -413,9 +417,9 @@ struct llm_graph_context {
     ggml_context * ctx0 = nullptr;

-    ggml_backend_sched * sched;
+    ggml_backend_sched_t sched;

-    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

     const llama_adapter_cvec  * cvec;
     const llama_adapter_loras * loras;
...
llama/llama.cpp/src/llama-kv-cache.cpp

(Diff collapsed in this view: 1485 additions, 389 deletions.)
llama/llama.cpp/src/llama-kv-cache.h

@@ -2,32 +2,72 @@
 #include "llama.h"
 #include "llama-io.h"
+#include "llama-graph.h"
 #include "llama-memory.h"

 #include "ggml-cpp.h"

 #include <functional>
 #include <set>
 #include <vector>

 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
+struct llama_sbatch;
+struct llama_model;
+struct llama_context;

 struct llama_kv_cache : public llama_memory_i {
-    using llama_memory_i::llama_memory_i;
+    virtual ~llama_kv_cache() = default;

-    virtual void restore() = 0; // call if batch processing fails - restores the cache state
-    virtual void commit()  = 0; // call after successful batch processing - clears any pending state
+    // call if batch processing fails - restores the cache state
+    virtual void restore() = 0;

-    virtual int32_t get_n_tokens()   const = 0;
-    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    // call after successful batch processing - clears any pending state
+    virtual void commit() = 0;

-    virtual bool get_can_shift() const = 0;
+    // process any pending defrag/shift/etc. operations
+    // optionally call once before processing a new batch
+    virtual bool update(llama_context & lctx) = 0;
+
+    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+    virtual void defrag_sched(float thold) = 0;
+
+    // simulate full cache, used for allocating worst-case compute buffers
+    virtual void set_full() = 0;
+
+    //
+    // batch processing
+    //
+
+    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
+
+    // different KV caches require different batch splitting strategies
+    virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
+
+    // find an empty slot of size "n_tokens" in the cache
+    virtual bool find_slot(const llama_ubatch & batch) = 0;
+
+    // getters
+    virtual int32_t   get_n_tokens()   const = 0;
+    virtual int32_t   get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    virtual llama_pos get_pos_max()    const = 0;
+    virtual bool      get_can_shift()  const = 0;

     bool get_can_edit() const override { return get_can_shift(); }
+
+    //
+    // state write/read
+    //
+
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
+    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
 };

 //
 // llama_kv_cache_guard
 //

 struct llama_kv_cache_guard {
     llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
...
@@ -42,7 +82,7 @@ struct llama_kv_cache_guard {
 private:
     llama_kv_cache * kv;
 };

 // block of KV slots to move when defragging
 struct llama_kv_defrag_move {
     uint32_t src;
...
@@ -50,65 +90,50 @@ struct llama_kv_defrag_move {
     uint32_t len;
 };

-struct llama_kv_cell {
-    llama_pos pos   = -1;
-    llama_pos delta =  0;
-    int32_t   src   = -1; // used by recurrent state models to copy states
-    int32_t   tail  = -1;
-
-    std::set<llama_seq_id> seq_id;
-
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-
-    bool is_empty() const {
-        return seq_id.empty();
-    }
-
-    bool is_same_seq(const llama_kv_cell & other) const {
-        return seq_id == other.seq_id;
-    }
-};
-
-// ring-buffer of cached KV data
-// TODO: pimpl
+//
+// llama_kv_cache_unified
+//
+
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    // can be used to query data from the model if needed
-    struct callbacks {
-        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
-    };
-
-    llama_kv_cache_unified(
-            const llama_hparams & hparams,
-            callbacks cbs);
-
-    virtual ~llama_kv_cache_unified() = default;
-
-    // TODO: become constructor
-    bool init(
-            const llama_model   & model,   // TODO: do not reference the model
-            const llama_cparams & cparams,
-                      ggml_type   type_k,
-                      ggml_type   type_v,
-                       uint32_t   kv_size,
-                           bool   offload);
-
-    int32_t get_n_tokens()   const override;
-    int32_t get_used_cells() const override;
-
-    size_t total_size() const;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos pos_max() const;
+    struct kv_cell {
+        llama_pos pos   = -1;
+        llama_pos delta =  0;
+
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    static uint32_t get_padding(const llama_cparams & cparams);
+
+    llama_kv_cache_unified(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                     uint32_t   kv_size,
+                     uint32_t   padding);
+
+    ~llama_kv_cache_unified() = default;
+
+    //
+    // llama_memory_i
+    //

     void clear() override;
     void defrag() override;

-    virtual void restore() override;
-    virtual void commit()  override;
-
     bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
     void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
...
@@ -118,25 +143,76 @@ public:
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;

-    bool get_can_shift() const override;
+    //
+    // llama_kv_cache
+    //
+
+    void restore() override;
+    void commit()  override;
+
+    bool update(llama_context & ctx) override;
+
+    void defrag_sched(float thold) override;
+
+    void set_full() override;
+
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;

     // find an empty slot of size "n_tokens" in the cache
     // updates the cache head
     // Note: On success, it's important that cache.head points
     // to the first cell of the slot.
-    bool find_slot(const llama_ubatch & batch);
+    bool find_slot(const llama_ubatch & batch) override;

-    // TODO: maybe not needed
-    uint32_t get_padding(const llama_cparams & cparams) const;
+    int32_t get_n_tokens()   const override;
+    int32_t get_used_cells() const override;

-    // find how many cells are currently in use
-    uint32_t cell_max() const;
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;

-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
+    bool get_can_shift() const override;

-    // defrag
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_impl also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<kv_cell> cells;
+
+    std::vector<ggml_tensor *> k_l; // per layer
+    std::vector<ggml_tensor *> v_l;
+
+private:
+    const llama_model   & model;
+    const llama_hparams & hparams;
+
+    bool has_shift = false;
+    bool do_defrag = false;
+
+    bool v_trans   = true;  // the value tensor is transposed
+    bool can_shift = false;
+
+    // required padding
+    uint32_t padding = 1;
+
+    ggml_type type_k = GGML_TYPE_F16;
+    ggml_type type_v = GGML_TYPE_F16;
+
+    std::vector<ggml_context_ptr>        ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;

+    // defrag
     struct {
         std::vector<llama_kv_defrag_move> moves;
     } defrag_info;
...
@@ -145,7 +221,6 @@ public:
     bool defrag_prepare(int32_t n_max_nodes);

-    // commit/restore cache
     struct slot_range {
         uint32_t c0 = 0; // note: these are cell indices, not sequence positions
         uint32_t c1 = 0;
...
@@ -156,25 +231,125 @@ public:
         std::vector<slot_range> ranges;
     } pending;

-    // state write/load
-
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
-
-    // members
-
-    const llama_hparams & hparams;
-
-    callbacks cbs;
-
-    bool has_shift = false;
-    bool do_defrag = false;
-
-    // TODO: remove this and implement llama_kv_cache_recurrent instead
-    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
-
-    bool v_trans   = true;  // the value tensor is transposed
-    bool can_shift = false;
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    ggml_tensor * build_rope_shift(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_tensor * cur,
+                    ggml_tensor * shift,
+                    ggml_tensor * factors,
+                          float   freq_base,
+                          float   freq_scale) const;
+
+    llm_graph_result_ptr build_graph_shift(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_graph_defrag(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_cgraph * gf,
+            const std::vector<llama_kv_defrag_move> & moves) const;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+//
+// llama_kv_cache_recurrent
+//
+
+class llama_kv_cache_recurrent : public llama_kv_cache {
+public:
+    struct kv_cell {
+        llama_pos pos  = -1;
+        int32_t   src  = -1; // used to copy states
+        int32_t   tail = -1;
+
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    llama_kv_cache_recurrent(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   offload,
+                     uint32_t   kv_size);
+
+    ~llama_kv_cache_recurrent() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    void clear() override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta)             override;
+    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d)                       override;
+
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    //
+    // llama_kv_cache
+    //
+
+    void restore() override;
+    void commit()  override;
+
+    bool update(llama_context & lctx) override;
+
+    void defrag_sched(float thold) override;
+
+    void set_full() override;
+
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+    bool find_slot(const llama_ubatch & batch) override;
+
+    int32_t get_n_tokens()   const override;
+    int32_t get_used_cells() const override;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
+
+    bool get_can_shift() const override;
+
+    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
+    int32_t s_copy(int i) const;
+    float   s_mask(int i) const;
+
+    // state write/load
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;

     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_impl also uses it, so it
...
@@ -186,18 +361,41 @@ public:
     // computed before each graph build
     uint32_t n = 0;

-    std::vector<llama_kv_cell> cells;
+    std::vector<kv_cell> cells;

     std::vector<ggml_tensor *> k_l; // per layer
     std::vector<ggml_tensor *> v_l;

 private:
-    //const llama_model & model;
     const llama_hparams & hparams;

+    // commit/restore cache
+    // TODO: rework for recurrent cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    // pending cell updates that are not yet committed
+    struct {
+        std::vector<slot_range> ranges;
+    } pending;
+
     ggml_type type_k = GGML_TYPE_F16;
     ggml_type type_v = GGML_TYPE_F16;

     std::vector<ggml_context_ptr>        ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
...
@@ -205,11 +403,6 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };

-// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
-//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-//public:
-//    using llama_kv_cache_unified::llama_kv_cache_unified;
-//};
-
 //
 // kv cache view
...
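Note: the abstract llama_kv_cache interface now owns batch splitting (sbatch_init/ubatch_next), slot allocation (find_slot) and the commit/restore transaction, with llama_kv_cache_unified and llama_kv_cache_recurrent as the two concrete implementations. A rough, hypothetical sketch of how a decode loop could drive this interface follows; the real driver lives in the collapsed llama-context.cpp diff and certainly differs in detail.

    // Sketch against the interface declared above; graph building, output copying
    // and most error handling are omitted.
    #include "llama-kv-cache.h"
    #include "llama-batch.h"

    static bool process_batch(llama_context & lctx, llama_kv_cache & kv,
                              const llama_batch & batch, uint32_t n_ubatch) {
        kv.update(lctx);                 // flush any scheduled shift/defrag work

        llama_sbatch sbatch = kv.sbatch_init(batch, /*logits_all=*/false);

        while (sbatch.n_tokens > 0) {
            llama_ubatch ubatch = kv.ubatch_next(sbatch, n_ubatch, /*embd_pooled=*/false);

            if (!kv.find_slot(ubatch)) {
                kv.restore();            // roll back any partially reserved slots
                return false;
            }

            // ... build and run the compute graph for `ubatch` here ...
        }

        kv.commit();                     // make the reserved slots permanent
        return true;
    }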
llama/llama.cpp/src/llama-memory.h

@@ -2,12 +2,22 @@

 #include "llama.h"

+struct llama_memory_params {
+    // kv cache
+    ggml_type type_k;
+    ggml_type type_v;
+
+    // parameters for other types of memory
+    // ...
+};
+
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 class llama_memory_i {
 public:
+    virtual ~llama_memory_i() = default;
+
     virtual void clear() = 0;
     virtual void defrag() = 0;

     virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
     virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
...