Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
b2b270ad
Commit
b2b270ad
authored
Jun 23, 2025
by
Devon Rifkin
Browse files
Merge branch 'main' into drifkin/array-head-count-simple
parents
20c5fd39
2bb69b40
Changes
288
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
675 additions
and
764 deletions
+675
-764
kvcache/causal_test.go
kvcache/causal_test.go
+10
-10
llama/build-info.cpp
llama/build-info.cpp
+1
-1
llama/llama.cpp/.rsync-filter
llama/llama.cpp/.rsync-filter
+5
-5
llama/llama.cpp/common/common.cpp
llama/llama.cpp/common/common.cpp
+18
-1
llama/llama.cpp/common/common.h
llama/llama.cpp/common/common.h
+13
-5
llama/llama.cpp/common/sampling.cpp
llama/llama.cpp/common/sampling.cpp
+57
-50
llama/llama.cpp/include/llama.h
llama/llama.cpp/include/llama.h
+51
-16
llama/llama.cpp/src/llama-adapter.cpp
llama/llama.cpp/src/llama-adapter.cpp
+6
-0
llama/llama.cpp/src/llama-arch.cpp
llama/llama.cpp/src/llama-arch.cpp
+0
-44
llama/llama.cpp/src/llama-arch.h
llama/llama.cpp/src/llama-arch.h
+0
-10
llama/llama.cpp/src/llama-batch.cpp
llama/llama.cpp/src/llama-batch.cpp
+5
-4
llama/llama.cpp/src/llama-batch.h
llama/llama.cpp/src/llama-batch.h
+2
-1
llama/llama.cpp/src/llama-chat.cpp
llama/llama.cpp/src/llama-chat.cpp
+17
-7
llama/llama.cpp/src/llama-chat.h
llama/llama.cpp/src/llama-chat.h
+1
-0
llama/llama.cpp/src/llama-context.cpp
llama/llama.cpp/src/llama-context.cpp
+412
-488
llama/llama.cpp/src/llama-context.h
llama/llama.cpp/src/llama-context.h
+44
-34
llama/llama.cpp/src/llama-cparams.h
llama/llama.cpp/src/llama-cparams.h
+1
-1
llama/llama.cpp/src/llama-graph.cpp
llama/llama.cpp/src/llama-graph.cpp
+20
-63
llama/llama.cpp/src/llama-graph.h
llama/llama.cpp/src/llama-graph.h
+12
-20
llama/llama.cpp/src/llama-hparams.cpp
llama/llama.cpp/src/llama-hparams.cpp
+0
-4
No files found.
kvcache/causal_test.go
View file @
b2b270ad
...
...
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
}
cache
.
SetLayer
(
0
)
tensor
,
_
:=
context
.
FromFloatSlice
(
test
.
in
,
test
.
inShape
...
)
tensor
:=
context
.
FromFloatSlice
(
test
.
in
,
test
.
inShape
...
)
cache
.
Put
(
context
,
tensor
,
tensor
)
out
,
_
,
mask
:=
cache
.
Get
(
context
)
...
...
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
}
cache
.
SetLayer
(
0
)
tensor
,
_
:=
context
.
FromFloatSlice
([]
float32
{
1
,
2
,
3
,
4
},
1
,
1
,
4
)
tensor
:=
context
.
FromFloatSlice
([]
float32
{
1
,
2
,
3
,
4
},
1
,
1
,
4
)
cache
.
Put
(
context
,
tensor
,
tensor
)
// with window size 4, nothing has slid out of the window yet
...
...
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
}
cache
.
SetLayer
(
0
)
tensor
,
_
=
context
.
FromFloatSlice
([]
float32
{
5
,
6
},
1
,
1
,
2
)
tensor
=
context
.
FromFloatSlice
([]
float32
{
5
,
6
},
1
,
1
,
2
)
cache
.
Put
(
context
,
tensor
,
tensor
)
// only the latest position has overlapping windows
...
...
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
return
c
.
Empty
(
dtype
,
shape
...
)
}
func
(
c
*
testContext
)
FromFloatSlice
(
s
[]
float32
,
shape
...
int
)
(
ml
.
Tensor
,
error
)
{
func
(
c
*
testContext
)
FromFloatSlice
(
s
[]
float32
,
shape
...
int
)
ml
.
Tensor
{
t
:=
c
.
Empty
(
ml
.
DTypeF32
,
shape
...
)
.
(
*
testTensor
)
copy
(
t
.
data
,
s
)
return
t
,
nil
return
t
}
func
(
c
*
testContext
)
FromIntSlice
(
s
[]
int32
,
shape
...
int
)
(
ml
.
Tensor
,
error
)
{
func
(
c
*
testContext
)
FromIntSlice
(
s
[]
int32
,
shape
...
int
)
ml
.
Tensor
{
f
:=
make
([]
float32
,
len
(
s
))
for
i
:=
range
f
{
f
[
i
]
=
float32
(
s
[
i
])
}
out
,
_
:=
c
.
FromFloatSlice
(
f
,
shape
...
)
out
:=
c
.
FromFloatSlice
(
f
,
shape
...
)
out
.
(
*
testTensor
)
.
dtype
=
ml
.
DTypeI32
return
out
,
nil
return
out
}
func
(
c
*
testContext
)
Arange
(
start
,
stop
,
step
float32
,
dtype
ml
.
DType
)
ml
.
Tensor
{
...
...
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
s
=
append
(
s
,
i
)
}
out
,
_
:=
c
.
FromFloatSlice
(
s
,
len
(
s
))
out
:=
c
.
FromFloatSlice
(
s
,
len
(
s
))
out
.
(
*
testTensor
)
.
dtype
=
dtype
return
out
}
...
...
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
func
(
c
*
testContext
)
Compute
(
...
ml
.
Tensor
)
{}
func
(
c
*
testContext
)
Reserve
()
error
{
return
nil
}
func
(
c
*
testContext
)
Reserve
()
{
}
func
(
c
*
testContext
)
MaxGraphNodes
()
int
{
return
10
...
...
llama/build-info.cpp
View file @
b2b270ad
int
LLAMA_BUILD_NUMBER
=
0
;
char
const
*
LLAMA_COMMIT
=
"
e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5
"
;
char
const
*
LLAMA_COMMIT
=
"
de4c07f93783a1a96456a44dc16b9db538ee1618
"
;
char
const
*
LLAMA_COMPILER
=
""
;
char
const
*
LLAMA_BUILD_TARGET
=
""
;
llama/llama.cpp/.rsync-filter
View file @
b2b270ad
...
...
@@ -10,11 +10,11 @@ include common/stb_image.*
include include/
include include/llama.*
include include/llama-*.*
include
example
s/
include
examples/llava
/
include
examples/llava
/clip.*
include
examples/llava
/clip-impl.*
include
examples/llava
/llava.*
include
tool
s/
include
tools/mtmd
/
include
tools/mtmd
/clip.*
include
tools/mtmd
/clip-impl.*
include
tools/mtmd
/llava.*
include src/
include src/llama.*
include src/llama-*.*
...
...
llama/llama.cpp/common/common.cpp
View file @
b2b270ad
...
...
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams
.
n_threads
=
params
.
cpuparams
.
n_threads
;
cparams
.
n_threads_batch
=
params
.
cpuparams_batch
.
n_threads
==
-
1
?
params
.
cpuparams
.
n_threads
:
params
.
cpuparams_batch
.
n_threads
;
cparams
.
logits_all
=
params
.
logits_all
;
cparams
.
embeddings
=
params
.
embedding
;
cparams
.
rope_scaling_type
=
params
.
rope_scaling_type
;
cparams
.
rope_freq_base
=
params
.
rope_freq_base
;
...
...
@@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams
.
offload_kqv
=
!
params
.
no_kv_offload
;
cparams
.
flash_attn
=
params
.
flash_attn
;
cparams
.
no_perf
=
params
.
no_perf
;
cparams
.
op_offload
=
!
params
.
no_op_offload
;
if
(
params
.
reranking
)
{
cparams
.
embeddings
=
true
;
...
...
@@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return
result
;
}
// Build a next-token-prediction dataset from a flat token stream.
//
// Each datapoint is a window of llama_n_ctx(ctx) consecutive tokens starting at
// offset idata*stride; its labels are the same window shifted forward by one token.
//
// ctx    - context whose llama_n_ctx() value determines the window length
// tokens - token stream to slice into windows
// stride - distance (in tokens) between the starts of consecutive windows, must be > 0
//
// Returns the dataset handle produced by ggml_opt_dataset_init() with data and labels filled in.
// Aborts via GGML_ASSERT if stride is not positive or if the token stream is too short to
// yield at least one datapoint.
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
    // a zero stride would divide by zero below, a negative one makes no sense
    GGML_ASSERT(stride > 0 && "stride must be positive");

    const int64_t ne_datapoint = llama_n_ctx(ctx);

    // tokens.size() is unsigned: do the subtraction in signed arithmetic so that a stream
    // shorter than the context yields a non-positive count instead of wrapping around
    const int64_t n_tokens = static_cast<int64_t>(tokens.size());
    const int64_t ndata    = (n_tokens - ne_datapoint - 1) / stride;
    GGML_ASSERT(ndata > 0 && "not enough tokens to build a single datapoint");

    ggml_opt_dataset_t result = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

    llama_token * data   = static_cast<llama_token *>(ggml_opt_dataset_data(result)->data);
    llama_token * labels = static_cast<llama_token *>(ggml_opt_dataset_labels(result)->data);

    const size_t window_bytes = ne_datapoint * sizeof(llama_token);
    for (int64_t idata = 0; idata < ndata; ++idata) {
        const llama_token * window = tokens.data() + idata * stride;

        // inputs: tokens [i, i + ne_datapoint), labels: tokens [i + 1, i + 1 + ne_datapoint)
        memcpy(data   + idata * ne_datapoint, window + 0, window_bytes);
        memcpy(labels + idata * ne_datapoint, window + 1, window_bytes);
    }

    return result;
}
llama/llama.cpp/common/common.h
View file @
b2b270ad
...
...
@@ -66,7 +66,6 @@ enum llama_example {
LLAMA_EXAMPLE_COMMON
,
LLAMA_EXAMPLE_SPECULATIVE
,
LLAMA_EXAMPLE_MAIN
,
LLAMA_EXAMPLE_INFILL
,
LLAMA_EXAMPLE_EMBEDDING
,
LLAMA_EXAMPLE_PERPLEXITY
,
LLAMA_EXAMPLE_RETRIEVAL
,
...
...
@@ -96,6 +95,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_XTC
=
8
,
COMMON_SAMPLER_TYPE_INFILL
=
9
,
COMMON_SAMPLER_TYPE_PENALTIES
=
10
,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
=
11
,
};
// dimensionality reduction methods, used by cvector-generator
...
...
@@ -161,6 +161,7 @@ struct common_params_sampling {
std
::
vector
<
enum
common_sampler_type
>
samplers
=
{
COMMON_SAMPLER_TYPE_PENALTIES
,
COMMON_SAMPLER_TYPE_DRY
,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
,
COMMON_SAMPLER_TYPE_TOP_K
,
COMMON_SAMPLER_TYPE_TYPICAL_P
,
COMMON_SAMPLER_TYPE_TOP_P
,
...
...
@@ -323,7 +324,6 @@ struct common_params {
bool
ctx_shift
=
true
;
// context shift on infinite text generation
bool
input_prefix_bos
=
false
;
// prefix BOS to user inputs, preceding input_prefix
bool
logits_all
=
false
;
// return logits for all tokens in the batch
bool
use_mmap
=
true
;
// use mmap for faster loads
bool
use_mlock
=
false
;
// use mlock to keep model in memory
bool
verbose_prompt
=
false
;
// print prompt tokens before generation
...
...
@@ -332,6 +332,7 @@ struct common_params {
bool
no_kv_offload
=
false
;
// disable KV offloading
bool
warmup
=
true
;
// warmup run
bool
check_tensors
=
false
;
// validate tensor data
bool
no_op_offload
=
false
;
// globally disable offload host tensor operations to device
bool
single_turn
=
false
;
// single turn chat conversation
...
...
@@ -340,7 +341,7 @@ struct common_params {
common_conversation_mode
conversation_mode
=
COMMON_CONVERSATION_MODE_AUTO
;
// multimodal models (see
examples/llava
)
// multimodal models (see
tools/mtmd
)
struct
common_params_model
mmproj
;
bool
mmproj_use_gpu
=
true
;
// use GPU for multimodal model
bool
no_mmproj
=
false
;
// explicitly disable multimodal model
...
...
@@ -409,13 +410,14 @@ struct common_params {
bool
process_output
=
false
;
// collect data for the output tensor
bool
compute_ppl
=
true
;
// whether to compute perplexity
bool
parse_special
=
false
;
// whether to parse special tokens during imatrix tokenization
// cvector-generator params
int
n_pca_batch
=
100
;
int
n_pca_iterations
=
1000
;
dimre_method
cvector_dimre_method
=
DIMRE_METHOD_PCA
;
std
::
string
cvector_positive_file
=
"
example
s/cvector-generator/positive.txt"
;
std
::
string
cvector_negative_file
=
"
example
s/cvector-generator/negative.txt"
;
std
::
string
cvector_positive_file
=
"
tool
s/cvector-generator/positive.txt"
;
std
::
string
cvector_negative_file
=
"
tool
s/cvector-generator/negative.txt"
;
bool
spm_infill
=
false
;
// suffix/prefix/middle pattern for infill
...
...
@@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
const
char
*
const
LLM_KV_SPLIT_TENSORS_COUNT
=
"split.tensors.count"
;
}
//
// training utils
//
ggml_opt_dataset_t
common_opt_dataset_init
(
struct
llama_context
*
ctx
,
const
std
::
vector
<
llama_token
>
&
tokens
,
int64_t
stride
);
llama/llama.cpp/common/sampling.cpp
View file @
b2b270ad
#include "sampling.h"
#include "common.h"
#include "log.h"
#include <cmath>
#include <unordered_map>
...
...
@@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
params
.
logit_bias
.
data
()));
if
(
params
.
mirostat
==
0
)
{
if
(
params
.
top_n_sigma
>=
0
)
{
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_k
(
params
.
top_k
));
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_temp
(
params
.
temp
));
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_n_sigma
(
params
.
top_n_sigma
));
}
else
{
for
(
const
auto
&
cnstr
:
params
.
samplers
)
{
switch
(
cnstr
)
{
case
COMMON_SAMPLER_TYPE_DRY
:
{
std
::
vector
<
const
char
*>
c_breakers
;
c_breakers
.
reserve
(
params
.
dry_sequence_breakers
.
size
());
for
(
const
auto
&
str
:
params
.
dry_sequence_breakers
)
{
c_breakers
.
push_back
(
str
.
c_str
());
}
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_dry
(
vocab
,
llama_model_n_ctx_train
(
model
),
params
.
dry_multiplier
,
params
.
dry_base
,
params
.
dry_allowed_length
,
params
.
dry_penalty_last_n
,
c_breakers
.
data
(),
c_breakers
.
size
()));
for
(
const
auto
&
cnstr
:
params
.
samplers
)
{
switch
(
cnstr
)
{
case
COMMON_SAMPLER_TYPE_DRY
:
{
std
::
vector
<
const
char
*>
c_breakers
;
c_breakers
.
reserve
(
params
.
dry_sequence_breakers
.
size
());
for
(
const
auto
&
str
:
params
.
dry_sequence_breakers
)
{
c_breakers
.
push_back
(
str
.
c_str
());
}
break
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_k
(
params
.
top_k
));
break
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_p
(
params
.
top_p
,
params
.
min_keep
));
break
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_min_p
(
params
.
min_p
,
params
.
min_keep
));
break
;
case
COMMON_SAMPLER_TYPE_XTC
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_xtc
(
params
.
xtc_probability
,
params
.
xtc_threshold
,
params
.
min_keep
,
params
.
seed
));
break
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_typical
(
params
.
typ_p
,
params
.
min_keep
));
break
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_temp_ext
(
params
.
temp
,
params
.
dynatemp_range
,
params
.
dynatemp_exponent
));
break
;
case
COMMON_SAMPLER_TYPE_INFILL
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_infill
(
vocab
));
break
;
case
COMMON_SAMPLER_TYPE_PENALTIES
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_penalties
(
params
.
penalty_last_n
,
params
.
penalty_repeat
,
params
.
penalty_freq
,
params
.
penalty_present
));
break
;
default:
GGML_ASSERT
(
false
&&
"unknown sampler type"
);
}
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_dry
(
vocab
,
llama_model_n_ctx_train
(
model
),
params
.
dry_multiplier
,
params
.
dry_base
,
params
.
dry_allowed_length
,
params
.
dry_penalty_last_n
,
c_breakers
.
data
(),
c_breakers
.
size
()));
}
break
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_k
(
params
.
top_k
));
break
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_p
(
params
.
top_p
,
params
.
min_keep
));
break
;
case
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_n_sigma
(
params
.
top_n_sigma
));
break
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_min_p
(
params
.
min_p
,
params
.
min_keep
));
break
;
case
COMMON_SAMPLER_TYPE_XTC
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_xtc
(
params
.
xtc_probability
,
params
.
xtc_threshold
,
params
.
min_keep
,
params
.
seed
));
break
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_typical
(
params
.
typ_p
,
params
.
min_keep
));
break
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_temp_ext
(
params
.
temp
,
params
.
dynatemp_range
,
params
.
dynatemp_exponent
));
break
;
case
COMMON_SAMPLER_TYPE_INFILL
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_infill
(
vocab
));
break
;
case
COMMON_SAMPLER_TYPE_PENALTIES
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_penalties
(
params
.
penalty_last_n
,
params
.
penalty_repeat
,
params
.
penalty_freq
,
params
.
penalty_present
));
break
;
default:
GGML_ASSERT
(
false
&&
"unknown sampler type"
);
}
}
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_dist
(
params
.
seed
));
...
...
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case
COMMON_SAMPLER_TYPE_TOP_K
:
return
'k'
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
return
'y'
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
return
'p'
;
case
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
:
return
's'
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
return
'm'
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
return
't'
;
case
COMMON_SAMPLER_TYPE_XTC
:
return
'x'
;
...
...
@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case
COMMON_SAMPLER_TYPE_TOP_K
:
return
"top_k"
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
return
"typ_p"
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
return
"top_p"
;
case
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
:
return
"top_n_sigma"
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
return
"min_p"
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
return
"temperature"
;
case
COMMON_SAMPLER_TYPE_XTC
:
return
"xtc"
;
...
...
@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{
"dry"
,
COMMON_SAMPLER_TYPE_DRY
},
{
"top_k"
,
COMMON_SAMPLER_TYPE_TOP_K
},
{
"top_p"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"top_n_sigma"
,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
},
{
"typ_p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"min_p"
,
COMMON_SAMPLER_TYPE_MIN_P
},
{
"temperature"
,
COMMON_SAMPLER_TYPE_TEMPERATURE
},
...
...
@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
std
::
unordered_map
<
std
::
string
,
common_sampler_type
>
sampler_alt_name_map
{
{
"top-k"
,
COMMON_SAMPLER_TYPE_TOP_K
},
{
"top-p"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"top-n-sigma"
,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
},
{
"nucleus"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"typical-p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typical"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
...
...
@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
auto
sampler
=
sampler_canonical_name_map
.
find
(
name
);
if
(
sampler
!=
sampler_canonical_name_map
.
end
())
{
samplers
.
push_back
(
sampler
->
second
);
}
else
{
if
(
allow_alt_names
)
{
sampler
=
sampler_alt_name_map
.
find
(
name
);
if
(
sampler
!=
sampler_alt_name_map
.
end
())
{
samplers
.
push_back
(
sampler
->
second
);
}
continue
;
}
if
(
allow_alt_names
)
{
sampler
=
sampler_alt_name_map
.
find
(
name
);
if
(
sampler
!=
sampler_alt_name_map
.
end
())
{
samplers
.
push_back
(
sampler
->
second
);
continue
;
}
}
LOG_WRN
(
"%s: unable to match sampler by name '%s'
\n
"
,
__func__
,
name
.
c_str
());
}
return
samplers
;
...
...
@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_K
),
COMMON_SAMPLER_TYPE_TOP_K
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TYPICAL_P
),
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_P
),
COMMON_SAMPLER_TYPE_TOP_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
),
COMMON_SAMPLER_TYPE_TOP_N_SIGMA
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_MIN_P
),
COMMON_SAMPLER_TYPE_MIN_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TEMPERATURE
),
COMMON_SAMPLER_TYPE_TEMPERATURE
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_XTC
),
COMMON_SAMPLER_TYPE_XTC
},
...
...
@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
const
auto
sampler
=
sampler_name_map
.
find
(
c
);
if
(
sampler
!=
sampler_name_map
.
end
())
{
samplers
.
push_back
(
sampler
->
second
);
}
else
{
LOG_WRN
(
"%s: unable to match sampler by char '%c'
\n
"
,
__func__
,
c
);
}
}
...
...
llama/llama.cpp/include/llama.h
View file @
b2b270ad
...
...
@@ -4,6 +4,7 @@
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include "ggml-opt.h"
#include <stddef.h>
#include <stdint.h>
...
...
@@ -112,6 +113,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE
=
32
,
LLAMA_VOCAB_PRE_TYPE_LLAMA4
=
33
,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL
=
34
,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER
=
35
,
};
enum
llama_rope_type
{
...
...
@@ -256,7 +258,6 @@ extern "C" {
llama_token
*
token
;
float
*
embd
;
int32_t
n_embd
;
llama_pos
*
pos
;
int32_t
*
n_seq_id
;
llama_seq_id
**
seq_id
;
...
...
@@ -352,20 +353,18 @@ extern "C" {
enum
ggml_type
type_k
;
// data type for K cache [EXPERIMENTAL]
enum
ggml_type
type_v
;
// data type for V cache [EXPERIMENTAL]
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
// TODO: move at the end of the struct
bool
logits_all
;
// the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool
embeddings
;
// if true, extract embeddings (together with logits)
bool
offload_kqv
;
// whether to offload the KQV ops (including the KV cache) to GPU
bool
flash_attn
;
// whether to use flash attention [EXPERIMENTAL]
bool
no_perf
;
// whether to measure performance timings
bool
cross_attn
;
// whether to use cross attention
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
// currently works only with CPU execution
ggml_abort_callback
abort_callback
;
void
*
abort_callback_data
;
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool
embeddings
;
// if true, extract embeddings (together with logits)
bool
offload_kqv
;
// whether to offload the KQV ops (including the KV cache) to GPU
bool
flash_attn
;
// whether to use flash attention [EXPERIMENTAL]
bool
no_perf
;
// whether to measure performance timings
bool
op_offload
;
// whether to offload host tensor operations to device
};
// model quantization parameters
...
...
@@ -447,6 +446,10 @@ extern "C" {
size_t
n_paths
,
struct
llama_model_params
params
);
LLAMA_API
void
llama_model_save_to_file
(
const
struct
llama_model
*
model
,
const
char
*
path_model
);
DEPRECATED
(
LLAMA_API
void
llama_free_model
(
struct
llama_model
*
model
),
"use llama_model_free instead"
);
...
...
@@ -461,10 +464,6 @@ extern "C" {
struct
llama_context_params
params
),
"use llama_init_from_model instead"
);
// TODO (jmorganca): this should most likely be passed in as part of a batch
// and not set on the context for all batches.
LLAMA_API
void
llama_set_cross_attention
(
struct
llama_context
*
ctx
,
bool
cross_attn_state
);
// Frees all allocated memory
LLAMA_API
void
llama_free
(
struct
llama_context
*
ctx
);
...
...
@@ -930,14 +929,19 @@ extern "C" {
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API
void
llama_batch_free
(
struct
llama_batch
batch
);
// Processes a batch of tokens with the ecoder part of the encoder-decoder model.
// Stores the encoder output internally for later use by the decoder cross-attention layers.
// Process a batch of tokens.
// In contrast to llama_decode() - this call does not use KV cache.
// For encoder-decoder contexts, processes the batch using the encoder.
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
// 0 - success
// < 0 - error. the KV cache state is restored to the state before this call
LLAMA_API
int32_t
llama_encode
(
struct
llama_context
*
ctx
,
struct
llama_batch
batch
);
// Process a batch of tokens.
// Requires KV cache.
// For encoder-decoder contexts, processes the batch using the decoder.
// Positive return values do not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
...
...
@@ -1434,6 +1438,37 @@ extern "C" {
LLAMA_API
void
llama_perf_sampler_print
(
const
struct
llama_sampler
*
chain
);
LLAMA_API
void
llama_perf_sampler_reset
(
struct
llama_sampler
*
chain
);
//
// training
//
// function that returns whether or not a given tensor contains trainable parameters
typedef
bool
(
*
llama_opt_param_filter
)(
const
struct
ggml_tensor
*
tensor
,
void
*
userdata
);
// always returns true
LLAMA_API
bool
llama_opt_param_filter_all
(
const
struct
ggml_tensor
*
tensor
,
void
*
userdata
);
// Training options handed by value to llama_opt_init().
// Bundles the assumed post-training context size with two callback/userdata pairs:
// one selecting which tensors are trainable, one computing optimizer parameters.
// NOTE(review): param_filter_ud is presumably forwarded as the `userdata` argument of
// param_filter, and get_opt_pars_ud likewise to get_opt_pars - confirm in the implementation.
struct
llama_opt_params
{
uint32_t
n_ctx_train
;
// assumed context size post training, use context size specified in llama_context if 0
llama_opt_param_filter
param_filter
;
// callback for determining which tensors contain trainable parameters
void
*
param_filter_ud
;
// userdata for determining which tensors contain trainable parameters
ggml_opt_get_optimizer_params
get_opt_pars
;
// callback for calculating optimizer parameters
void
*
get_opt_pars_ud
;
// userdata for calculating optimizer parameters
};
LLAMA_API
void
llama_opt_init
(
struct
llama_context
*
lctx
,
struct
llama_model
*
model
,
struct
llama_opt_params
lopt_params
);
LLAMA_API
void
llama_opt_epoch
(
struct
llama_context
*
lctx
,
ggml_opt_dataset_t
dataset
,
ggml_opt_result_t
result_train
,
ggml_opt_result_t
result_eval
,
int64_t
idata_split
,
ggml_opt_epoch_callback
callback_train
,
ggml_opt_epoch_callback
callback_eval
);
#ifdef __cplusplus
}
#endif
...
...
llama/llama.cpp/src/llama-adapter.cpp
View file @
b2b270ad
...
...
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
std
::
vector
<
ggml_backend_buffer_type_t
>
buft_extra
;
{
auto
*
cpu_dev
=
ggml_backend_dev_by_type
(
GGML_BACKEND_DEVICE_TYPE_CPU
);
if
(
!
cpu_dev
)
{
throw
std
::
runtime_error
(
format
(
"%s: no CPU backend found"
,
__func__
));
}
auto
*
cpu_reg
=
ggml_backend_dev_backend_reg
(
cpu_dev
);
auto
ggml_backend_dev_get_extra_bufts_fn
=
(
ggml_backend_dev_get_extra_bufts_t
)
...
...
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
LLAMA_LOG_WARN
(
"%s: lora for '%s' cannot use buft '%s', fallback to CPU
\n
"
,
__func__
,
model_tensor
->
name
,
ggml_backend_buft_name
(
buft
));
auto
*
cpu_dev
=
ggml_backend_dev_by_type
(
GGML_BACKEND_DEVICE_TYPE_CPU
);
if
(
!
cpu_dev
)
{
throw
std
::
runtime_error
(
format
(
"%s: no CPU backend found"
,
__func__
));
}
buft
=
ggml_backend_dev_buffer_type
(
cpu_dev
);
break
;
...
...
llama/llama.cpp/src/llama-arch.cpp
View file @
b2b270ad
...
...
@@ -6,7 +6,6 @@
static
const
std
::
map
<
llm_arch
,
const
char
*>
LLM_ARCH_NAMES
=
{
{
LLM_ARCH_LLAMA
,
"llama"
},
{
LLM_ARCH_MLLAMA
,
"mllama"
},
{
LLM_ARCH_LLAMA4
,
"llama4"
},
{
LLM_ARCH_DECI
,
"deci"
},
{
LLM_ARCH_FALCON
,
"falcon"
},
...
...
@@ -145,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{
LLM_KV_ATTENTION_SLIDING_WINDOW
,
"%s.attention.sliding_window"
},
{
LLM_KV_ATTENTION_SCALE
,
"%s.attention.scale"
},
{
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION
,
"%s.attention.block_skip_connection"
},
{
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS
,
"%s.attention.cross_attention_layers"
},
{
LLM_KV_ATTENTION_KEY_LENGTH_MLA
,
"%s.attention.key_length_mla"
},
{
LLM_KV_ATTENTION_VALUE_LENGTH_MLA
,
"%s.attention.value_length_mla"
},
...
...
@@ -275,40 +273,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{
LLM_TENSOR_FFN_UP_SHEXP
,
"blk.%d.ffn_up_shexp"
},
},
},
{
LLM_ARCH_MLLAMA
,
{
{
LLM_TENSOR_TOKEN_EMBD
,
"token_embd"
},
{
LLM_TENSOR_OUTPUT_NORM
,
"output_norm"
},
{
LLM_TENSOR_OUTPUT
,
"output"
},
{
LLM_TENSOR_ROPE_FREQS
,
"rope_freqs"
},
{
LLM_TENSOR_ATTN_NORM
,
"blk.%d.attn_norm"
},
{
LLM_TENSOR_ATTN_Q
,
"blk.%d.attn_q"
},
{
LLM_TENSOR_ATTN_K
,
"blk.%d.attn_k"
},
{
LLM_TENSOR_ATTN_V
,
"blk.%d.attn_v"
},
{
LLM_TENSOR_ATTN_OUT
,
"blk.%d.attn_output"
},
{
LLM_TENSOR_ATTN_ROT_EMBD
,
"blk.%d.attn_rot_embd"
},
{
LLM_TENSOR_FFN_GATE_INP
,
"blk.%d.ffn_gate_inp"
},
{
LLM_TENSOR_FFN_NORM
,
"blk.%d.ffn_norm"
},
{
LLM_TENSOR_FFN_GATE
,
"blk.%d.ffn_gate"
},
{
LLM_TENSOR_FFN_DOWN
,
"blk.%d.ffn_down"
},
{
LLM_TENSOR_FFN_UP
,
"blk.%d.ffn_up"
},
{
LLM_TENSOR_FFN_GATE_EXP
,
"blk.%d.ffn_gate.%d"
},
{
LLM_TENSOR_FFN_DOWN_EXP
,
"blk.%d.ffn_down.%d"
},
{
LLM_TENSOR_FFN_UP_EXP
,
"blk.%d.ffn_up.%d"
},
{
LLM_TENSOR_FFN_GATE_EXPS
,
"blk.%d.ffn_gate_exps"
},
{
LLM_TENSOR_FFN_DOWN_EXPS
,
"blk.%d.ffn_down_exps"
},
{
LLM_TENSOR_FFN_UP_EXPS
,
"blk.%d.ffn_up_exps"
},
{
LLM_TENSOR_CROSS_ATTN_K_NORM
,
"blk.%d.cross_attn_k_norm"
},
{
LLM_TENSOR_CROSS_ATTN_K_PROJ
,
"blk.%d.cross_attn_k_proj"
},
{
LLM_TENSOR_CROSS_ATTN_O_PROJ
,
"blk.%d.cross_attn_o_proj"
},
{
LLM_TENSOR_CROSS_ATTN_Q_NORM
,
"blk.%d.cross_attn_q_norm"
},
{
LLM_TENSOR_CROSS_ATTN_Q_PROJ
,
"blk.%d.cross_attn_q_proj"
},
{
LLM_TENSOR_CROSS_ATTN_V_PROJ
,
"blk.%d.cross_attn_v_proj"
},
{
LLM_TENSOR_CROSS_ATTN_ATTN_GATE
,
"blk.%d.cross_attn_attn_gate"
},
{
LLM_TENSOR_CROSS_ATTN_MLP_GATE
,
"blk.%d.cross_attn_mlp_gate"
},
},
},
{
LLM_ARCH_DECI
,
{
...
...
@@ -1737,14 +1701,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{
LLM_TENSOR_DEC_CROSS_ATTN_REL_B
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_NONE
}},
{
LLM_TENSOR_BSKCN_TV
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
{
LLM_TENSOR_CROSS_ATTN_K_NORM
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
{
LLM_TENSOR_CROSS_ATTN_K_PROJ
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL_MAT
}},
{
LLM_TENSOR_CROSS_ATTN_O_PROJ
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL_MAT
}},
{
LLM_TENSOR_CROSS_ATTN_Q_NORM
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
{
LLM_TENSOR_CROSS_ATTN_Q_PROJ
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL_MAT
}},
{
LLM_TENSOR_CROSS_ATTN_V_PROJ
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL_MAT
}},
{
LLM_TENSOR_CROSS_ATTN_ATTN_GATE
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
{
LLM_TENSOR_CROSS_ATTN_MLP_GATE
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
{
LLM_TENSOR_CONV1D
,
{
LLM_TENSOR_LAYER_INPUT
,
GGML_OP_IM2COL
}},
{
LLM_TENSOR_POS_NET_NORM
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
{
LLM_TENSOR_POS_NET_NORM1
,
{
LLM_TENSOR_LAYER_REPEATING
,
GGML_OP_MUL
}},
...
...
llama/llama.cpp/src/llama-arch.h
View file @
b2b270ad
...
...
@@ -11,7 +11,6 @@
enum
llm_arch
{
LLM_ARCH_LLAMA
,
LLM_ARCH_LLAMA4
,
LLM_ARCH_MLLAMA
,
LLM_ARCH_DECI
,
LLM_ARCH_FALCON
,
LLM_ARCH_BAICHUAN
,
...
...
@@ -149,7 +148,6 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW
,
LLM_KV_ATTENTION_SCALE
,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION
,
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS
,
LLM_KV_ATTENTION_KEY_LENGTH_MLA
,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA
,
...
...
@@ -351,14 +349,6 @@ enum llm_tensor {
LLM_TENSOR_CLS
,
LLM_TENSOR_CLS_OUT
,
LLM_TENSOR_BSKCN_TV
,
LLM_TENSOR_CROSS_ATTN_K_NORM
,
LLM_TENSOR_CROSS_ATTN_K_PROJ
,
LLM_TENSOR_CROSS_ATTN_O_PROJ
,
LLM_TENSOR_CROSS_ATTN_Q_NORM
,
LLM_TENSOR_CROSS_ATTN_Q_PROJ
,
LLM_TENSOR_CROSS_ATTN_V_PROJ
,
LLM_TENSOR_CROSS_ATTN_ATTN_GATE
,
LLM_TENSOR_CROSS_ATTN_MLP_GATE
,
LLM_TENSOR_CONV1D
,
LLM_TENSOR_CONVNEXT_DW
,
LLM_TENSOR_CONVNEXT_NORM
,
...
...
llama/llama.cpp/src/llama-batch.cpp
View file @
b2b270ad
...
...
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
return
ubatch
;
}
void
llama_sbatch
::
from_
batch
(
const
llama_batch
&
batch
,
size_t
n_embd
,
bool
simple_split
,
bool
logits_all
)
{
llama_sbatch
::
llama_s
batch
(
const
llama_batch
&
batch
,
size_t
n_embd
,
bool
simple_split
,
bool
logits_all
)
{
GGML_ASSERT
(
batch
.
n_tokens
>=
0
);
this
->
batch
=
&
batch
;
this
->
n_embd
=
n_embd
;
...
...
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
for
(
size_t
i
=
0
;
i
<
n_tokens
;
++
i
)
{
ids
[
i
]
=
i
;
}
if
(
simple_split
)
{
seq
.
resize
(
1
);
llama_sbatch_seq
&
s
=
seq
[
0
];
...
...
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
s
.
length
=
n_tokens
;
return
;
}
std
::
sort
(
ids
.
begin
(),
ids
.
end
(),
[
&
batch
](
size_t
a
,
size_t
b
)
{
int32_t
n_seq_a
=
batch
.
n_seq_id
?
batch
.
n_seq_id
[
a
]
:
1
;
...
...
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
return
n_seq_a
>
n_seq_b
;
}
);
// init seq
llama_sbatch_seq
*
last_seq
=
nullptr
;
...
...
@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
seq
.
push_back
(
new_seq
);
last_seq
=
&
seq
.
back
();
}
// keep shared prompts first at the end, then sort by length descending.
std
::
sort
(
seq
.
begin
(),
seq
.
end
(),
[](
llama_sbatch_seq
&
a
,
llama_sbatch_seq
&
b
)
{
...
...
@@ -316,7 +320,6 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/
n_tokens
,
/*tokens =*/
tokens
,
/*embd =*/
nullptr
,
/*n_embd =*/
0
,
/*pos =*/
nullptr
,
/*n_seq_id =*/
nullptr
,
/*seq_id =*/
nullptr
,
...
...
@@ -329,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/
0
,
/*tokens =*/
nullptr
,
/*embd =*/
nullptr
,
/*n_embd =*/
0
,
/*pos =*/
nullptr
,
/*n_seq_id =*/
nullptr
,
/*seq_id =*/
nullptr
,
...
...
@@ -338,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if
(
embd
)
{
batch
.
embd
=
(
float
*
)
malloc
(
sizeof
(
float
)
*
n_tokens_alloc
*
embd
);
batch
.
n_embd
=
embd
;
}
else
{
batch
.
token
=
(
llama_token
*
)
malloc
(
sizeof
(
llama_token
)
*
n_tokens_alloc
);
}
...
...
llama/llama.cpp/src/llama-batch.h
View file @
b2b270ad
...
...
@@ -70,7 +70,8 @@ struct llama_sbatch {
// sequence-wise split
llama_ubatch
split_seq
(
size_t
n_ubatch
);
void
from_batch
(
const
llama_batch
&
batch
,
size_t
n_embd
,
bool
simple_split
=
false
,
bool
logits_all
=
false
);
llama_sbatch
()
=
default
;
llama_sbatch
(
const
llama_batch
&
batch
,
size_t
n_embd
,
bool
simple_split
=
false
,
bool
logits_all
=
false
);
};
// temporary allocate memory for the input batch if needed
...
...
llama/llama.cpp/src/llama-chat.cpp
View file @
b2b270ad
...
...
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{
"mistral-v3"
,
LLM_CHAT_TEMPLATE_MISTRAL_V3
},
{
"mistral-v3-tekken"
,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN
},
{
"mistral-v7"
,
LLM_CHAT_TEMPLATE_MISTRAL_V7
},
{
"mistral-v7-tekken"
,
LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN
},
{
"phi3"
,
LLM_CHAT_TEMPLATE_PHI_3
},
{
"phi4"
,
LLM_CHAT_TEMPLATE_PHI_4
},
{
"falcon3"
,
LLM_CHAT_TEMPLATE_FALCON_3
},
...
...
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
if
(
add_ass
)
{
ss
<<
"<|im_start|>assistant
\n
"
;
}
}
else
if
(
tmpl
==
LLM_CHAT_TEMPLATE_MISTRAL_V7
)
{
}
else
if
(
tmpl
==
LLM_CHAT_TEMPLATE_MISTRAL_V7
||
tmpl
==
LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN
)
{
// Official mistral 'v7' template
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
// https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
const
char
*
trailing_space
=
tmpl
==
LLM_CHAT_TEMPLATE_MISTRAL_V7
?
" "
:
""
;
for
(
auto
message
:
chat
)
{
std
::
string
role
(
message
->
role
);
std
::
string
content
(
message
->
content
);
if
(
role
==
"system"
)
{
ss
<<
"[SYSTEM_PROMPT]
"
<<
content
<<
"[/SYSTEM_PROMPT]"
;
ss
<<
"[SYSTEM_PROMPT]"
<<
trailing_space
<<
content
<<
"[/SYSTEM_PROMPT]"
;
}
else
if
(
role
==
"user"
)
{
ss
<<
"[INST] "
<<
content
<<
"[/INST]"
;
}
else
{
ss
<<
" "
<<
content
<<
"</s>"
;
ss
<<
"[INST]"
<<
trailing_space
<<
content
<<
"[/INST]"
;
}
else
{
ss
<<
trailing_space
<<
content
<<
"</s>"
;
}
}
}
else
if
(
tmpl
==
LLM_CHAT_TEMPLATE_MISTRAL_V1
...
...
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
if
(
add_ass
)
{
ss
<<
"<|assistant|>"
;
}
}
else
if
(
tmpl
==
LLM_CHAT_TEMPLATE_CHATGLM_4
||
tmpl
==
LLM_CHAT_TEMPLATE_GLMEDGE
)
{
}
else
if
(
tmpl
==
LLM_CHAT_TEMPLATE_CHATGLM_4
)
{
ss
<<
"[gMASK]"
<<
"<sop>"
;
for
(
auto
message
:
chat
)
{
std
::
string
role
(
message
->
role
);
ss
<<
"<|"
<<
role
<<
"|>"
<<
"
\n
"
<<
message
->
content
;
}
if
(
add_ass
)
{
ss
<<
"<|assistant|>
\n
"
;
}
}
else
if
(
tmpl
==
LLM_CHAT_TEMPLATE_GLMEDGE
)
{
for
(
auto
message
:
chat
)
{
std
::
string
role
(
message
->
role
);
ss
<<
"<|"
<<
role
<<
"|>"
<<
"
\n
"
<<
message
->
content
;
...
...
llama/llama.cpp/src/llama-chat.h
View file @
b2b270ad
...
...
@@ -14,6 +14,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MISTRAL_V3
,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN
,
LLM_CHAT_TEMPLATE_MISTRAL_V7
,
LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN
,
LLM_CHAT_TEMPLATE_PHI_3
,
LLM_CHAT_TEMPLATE_PHI_4
,
LLM_CHAT_TEMPLATE_FALCON_3
,
...
...
llama/llama.cpp/src/llama-context.cpp
View file @
b2b270ad
...
...
@@ -6,11 +6,9 @@
#include "llama-model.h"
#include "llama-kv-cache.h"
#include <cassert>
#include <cstring>
#include <stdexcept>
#include <cinttypes>
#include <cmath>
//
// llama_context
...
...
@@ -95,6 +93,7 @@ llama_context::llama_context(
}
cparams
.
n_ubatch
=
std
::
min
(
cparams
.
n_batch
,
params
.
n_ubatch
==
0
?
params
.
n_batch
:
params
.
n_ubatch
);
cparams
.
op_offload
=
params
.
op_offload
;
const
uint32_t
n_ctx_per_seq
=
cparams
.
n_ctx
/
cparams
.
n_seq_max
;
...
...
@@ -118,8 +117,6 @@ llama_context::llama_context(
__func__
,
n_ctx_per_seq
,
hparams
.
n_ctx_train
);
}
logits_all
=
params
.
logits_all
;
if
(
!
hparams
.
vocab_only
)
{
// GPU backends
for
(
auto
*
dev
:
model
.
devices
)
{
...
...
@@ -177,44 +174,13 @@ llama_context::llama_context(
}
// init the memory module
// TODO: for now, always create a unified KV cache
if
(
!
hparams
.
vocab_only
)
{
kv_self
.
reset
(
static_cast
<
llama_kv_cache_unified
*>
(
model
.
create_memory
()));
LLAMA_LOG_DEBUG
(
"%s: n_ctx = %u
\n
"
,
__func__
,
cparams
.
n_ctx
);
cparams
.
n_ctx
=
GGML_PAD
(
cparams
.
n_ctx
,
kv_self
->
get_padding
(
cparams
));
LLAMA_LOG_DEBUG
(
"%s: n_ctx = %u (padded)
\n
"
,
__func__
,
cparams
.
n_ctx
);
uint32_t
kv_size
=
cparams
.
n_ctx
;
ggml_type
type_k
=
params
.
type_k
;
ggml_type
type_v
=
params
.
type_v
;
if
(
llama_model_is_recurrent
(
&
model
))
{
// Mamba needs at least as many KV cells as there are sequences kept at any time
kv_size
=
std
::
max
((
uint32_t
)
1
,
params
.
n_seq_max
);
// it's probably best to keep as much precision as possible for the states
type_k
=
GGML_TYPE_F32
;
// required by ggml_ssm_conv for Mamba's conv_states
type_v
=
GGML_TYPE_F32
;
// required by ggml_ssm_scan for Mamba's ssm_states
}
GGML_ASSERT
(
hparams
.
n_embd_head_k
%
ggml_blck_size
(
type_k
)
==
0
);
GGML_ASSERT
(
hparams
.
n_embd_head_v
%
ggml_blck_size
(
type_v
)
==
0
);
if
(
!
kv_self
->
init
(
model
,
cparams
,
type_k
,
type_v
,
kv_size
,
cparams
.
offload_kqv
))
{
throw
std
::
runtime_error
(
"failed to initialize self-attention cache"
);
}
{
const
size_t
memory_size_k
=
kv_self
->
size_k_bytes
();
const
size_t
memory_size_v
=
kv_self
->
size_v_bytes
();
llama_memory_params
params_mem
=
{
/*.type_k =*/
params
.
type_k
,
/*.type_v =*/
params
.
type_v
,
};
LLAMA_LOG_INFO
(
"%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB
\n
"
,
__func__
,
(
float
)(
memory_size_k
+
memory_size_v
)
/
(
1024.0
f
*
1024.0
f
),
ggml_type_name
(
type_k
),
(
float
)
memory_size_k
/
(
1024.0
f
*
1024.0
f
),
ggml_type_name
(
type_v
),
(
float
)
memory_size_v
/
(
1024.0
f
*
1024.0
f
));
}
memory
.
reset
(
model
.
create_memory
(
params_mem
,
cparams
));
}
// init backends
...
...
@@ -278,7 +244,7 @@ llama_context::llama_context(
}
}
sched
.
reset
(
ggml_backend_sched_new
(
backend_ptrs
.
data
(),
backend_buft
.
data
(),
backend_ptrs
.
size
(),
max_nodes
,
pipeline_parallel
));
sched
.
reset
(
ggml_backend_sched_new
(
backend_ptrs
.
data
(),
backend_buft
.
data
(),
backend_ptrs
.
size
(),
max_nodes
,
pipeline_parallel
,
cparams
.
op_offload
));
if
(
pipeline_parallel
)
{
LLAMA_LOG_INFO
(
"%s: pipeline parallelism enabled (n_copies=%d)
\n
"
,
__func__
,
ggml_backend_sched_get_n_copies
(
sched
.
get
()));
...
...
@@ -286,7 +252,7 @@ llama_context::llama_context(
}
// reserve worst-case graph
if
(
!
hparams
.
vocab_only
)
{
if
(
!
hparams
.
vocab_only
&&
memory
)
{
const
uint32_t
n_seqs
=
1
;
// TODO: worst-case number of sequences
const
uint32_t
n_tokens
=
std
::
min
(
cparams
.
n_ctx
,
cparams
.
n_ubatch
);
...
...
@@ -305,7 +271,9 @@ llama_context::llama_context(
int
n_nodes_tg
=
-
1
;
// simulate full KV cache
kv_self
->
n
=
kv_self
->
size
;
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
set_full
();
cross
.
v_embd
.
clear
();
...
...
@@ -391,7 +359,9 @@ llama_context::llama_context(
}
}
llama_context
::~
llama_context
()
=
default
;
llama_context
::~
llama_context
()
{
ggml_opt_free
(
opt_ctx
);
}
void
llama_context
::
synchronize
()
{
ggml_backend_sched_synchronize
(
sched
.
get
());
...
...
@@ -427,6 +397,18 @@ const llama_model & llama_context::get_model() const {
return
model
;
}
const
llama_cparams
&
llama_context
::
get_cparams
()
const
{
return
cparams
;
}
ggml_backend_sched_t
llama_context
::
get_sched
()
const
{
return
sched
.
get
();
}
ggml_context
*
llama_context
::
get_ctx_compute
()
const
{
return
ctx_compute
.
get
();
}
uint32_t
llama_context
::
n_ctx
()
const
{
return
cparams
.
n_ctx
;
}
...
...
@@ -456,318 +438,44 @@ uint32_t llama_context::n_threads_batch() const {
}
llama_kv_cache
*
llama_context
::
get_kv_self
()
{
return
kv_self
.
get
();
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
return
kv_self
;
}
const
llama_kv_cache
*
llama_context
::
get_kv_self
()
const
{
return
kv_self
.
get
();
}
ggml_tensor
*
llama_context
::
build_rope_shift
(
ggml_context
*
ctx0
,
ggml_tensor
*
cur
,
ggml_tensor
*
shift
,
ggml_tensor
*
factors
,
float
freq_base
,
float
freq_scale
)
const
{
const
auto
&
n_ctx_orig
=
cparams
.
n_ctx_orig_yarn
;
const
auto
&
yarn_ext_factor
=
cparams
.
yarn_ext_factor
;
const
auto
&
yarn_beta_fast
=
cparams
.
yarn_beta_fast
;
const
auto
&
yarn_beta_slow
=
cparams
.
yarn_beta_slow
;
const
auto
&
hparams
=
model
.
hparams
;
const
auto
&
n_rot
=
hparams
.
n_rot
;
const
auto
&
rope_type
=
hparams
.
rope_type
;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const
float
yarn_attn_factor
=
model
.
arch
==
LLM_ARCH_DEEPSEEK2
?
1.0
f
/
(
1.0
f
+
0.1
f
*
logf
(
1.0
f
/
freq_scale
))
:
cparams
.
yarn_attn_factor
;
ggml_tensor
*
tmp
;
if
(
ggml_is_quantized
(
cur
->
type
))
{
// dequantize to f32 -> RoPE -> quantize back
tmp
=
ggml_cast
(
ctx0
,
cur
,
GGML_TYPE_F32
);
tmp
=
ggml_rope_ext
(
ctx0
,
tmp
,
shift
,
factors
,
n_rot
,
rope_type
,
n_ctx_orig
,
freq_base
,
freq_scale
,
yarn_ext_factor
,
yarn_attn_factor
,
yarn_beta_fast
,
yarn_beta_slow
);
tmp
=
ggml_cpy
(
ctx0
,
tmp
,
cur
);
}
else
{
// we rotate only the first n_rot dimensions
tmp
=
ggml_rope_ext_inplace
(
ctx0
,
cur
,
shift
,
factors
,
n_rot
,
rope_type
,
n_ctx_orig
,
freq_base
,
freq_scale
,
yarn_ext_factor
,
yarn_attn_factor
,
yarn_beta_fast
,
yarn_beta_slow
);
}
return
tmp
;
}
class
llm_graph_input_k_shift
:
public
llm_graph_input_i
{
public:
llm_graph_input_k_shift
(
const
llama_kv_cache_unified
*
kv_self
)
:
kv_self
(
kv_self
)
{}
virtual
~
llm_graph_input_k_shift
()
=
default
;
void
set_input
(
const
llama_ubatch
*
ubatch
)
override
;
ggml_tensor
*
k_shift
;
// I32 [kv_size]
const
llama_kv_cache_unified
*
kv_self
;
};
void
llm_graph_input_k_shift
::
set_input
(
const
llama_ubatch
*
ubatch
)
{
GGML_UNUSED
(
ubatch
);
if
(
k_shift
)
{
assert
(
ggml_backend_buffer_is_host
(
k_shift
->
buffer
));
int32_t
*
data
=
(
int32_t
*
)
k_shift
->
data
;
for
(
uint32_t
i
=
0
;
i
<
kv_self
->
size
;
++
i
)
{
data
[
i
]
=
kv_self
->
cells
[
i
].
delta
;
}
}
}
llm_graph_result_ptr
llama_context
::
build_kv_self_shift
(
ggml_context
*
ctx0
,
ggml_cgraph
*
gf
)
const
{
auto
res
=
std
::
make_unique
<
llm_graph_result
>
();
const
auto
&
hparams
=
model
.
hparams
;
const
auto
&
n_layer
=
hparams
.
n_layer
;
const
auto
&
n_embd_head_k
=
hparams
.
n_embd_head_k
;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
//GGML_ASSERT(kv_self->size == n_ctx);
auto
inp
=
std
::
make_unique
<
llm_graph_input_k_shift
>
(
kv_self
.
get
());
inp
->
k_shift
=
ggml_new_tensor_1d
(
ctx0
,
GGML_TYPE_I32
,
cparams
.
n_ctx
);
ggml_set_input
(
inp
->
k_shift
);
for
(
uint32_t
il
=
0
;
il
<
n_layer
;
++
il
)
{
const
int64_t
n_head_kv
=
hparams
.
n_head_kv
(
il
);
const
int64_t
n_embd_k_gqa
=
hparams
.
n_embd_k_gqa
(
il
);
const
bool
is_swa
=
hparams
.
is_swa
(
il
);
// note: the swa rope params could become part of the cparams in the future
// if we decide to make them configurable, like the non-sliding ones
const
float
freq_base_l
=
is_swa
?
hparams
.
rope_freq_base_train_swa
:
cparams
.
rope_freq_base
;
const
float
freq_scale_l
=
is_swa
?
hparams
.
rope_freq_scale_train_swa
:
cparams
.
rope_freq_scale
;
ggml_tensor
*
rope_factors
=
kv_self
->
cbs
.
get_rope_factors
(
n_ctx_per_seq
(),
il
);
ggml_tensor
*
k
=
ggml_view_3d
(
ctx0
,
kv_self
->
k_l
[
il
],
n_embd_head_k
,
n_head_kv
,
kv_self
->
size
,
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_head_k
),
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
0
);
ggml_tensor
*
cur
=
build_rope_shift
(
ctx0
,
k
,
inp
->
k_shift
,
rope_factors
,
freq_base_l
,
freq_scale_l
);
ggml_build_forward_expand
(
gf
,
cur
);
}
res
->
add_input
(
std
::
move
(
inp
));
return
res
;
}
llm_graph_result_ptr
llama_context
::
build_kv_self_defrag
(
ggml_context
*
ctx0
,
ggml_cgraph
*
gf
,
const
std
::
vector
<
struct
llama_kv_defrag_move
>
&
moves
)
const
{
auto
res
=
std
::
make_unique
<
llm_graph_result
>
();
const
auto
&
hparams
=
model
.
hparams
;
#if 0
// CPU defrag
//
// TODO: optimizations are possible:
// - multiple threads
// - avoid copying to the host memory when already there
//
// likely not worth the effort, as we have ggml_graph based defrag
//
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t kv_size = size;
std::vector<uint8_t> buf_k;
std::vector<uint8_t> buf_v;
for (uint32_t il = 0; il < n_layer; ++il) {
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
const size_t v_size_el = ggml_type_size(v_l[il]->type);
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
buf_k.resize(k_size);
buf_v.resize(v_size);
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
// batch move [i, i+nm) to [id, id+nm)
// note: cells can move only to a lower index
for (uint32_t i = 0; i < n_kv; ++i) {
const uint32_t id = ids[i];
if (i == id || id == n_kv) {
continue;
}
uint32_t nm = 1;
while (i + nm < n_kv && ids[i + nm] == id + nm) {
nm++;
}
// move keys
{
const int64_t os = i*k_size_row;
const int64_t od = id*k_size_row;
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
}
// move values (note: they are transposed)
{
const int64_t os = i;
const int64_t od = id;
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
}
}
i += nm - 1;
}
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
for
(
const
auto
&
move
:
moves
)
{
for
(
uint32_t
il
=
0
;
il
<
hparams
.
n_layer
;
++
il
)
{
// NOLINT
const
int64_t
n_embd_k_gqa
=
hparams
.
n_embd_k_gqa
(
il
);
const
int64_t
n_embd_v_gqa
=
hparams
.
n_embd_v_gqa
(
il
);
ggml_tensor
*
view_k_src
=
ggml_view_2d
(
ctx0
,
kv_self
->
k_l
[
il
],
n_embd_k_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
*
move
.
src
));
ggml_tensor
*
view_k_dst
=
ggml_view_2d
(
ctx0
,
kv_self
->
k_l
[
il
],
n_embd_k_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
),
ggml_row_size
(
kv_self
->
k_l
[
il
]
->
type
,
n_embd_k_gqa
*
move
.
dst
));
ggml_tensor
*
view_v_src
;
ggml_tensor
*
view_v_dst
;
if
(
cparams
.
flash_attn
)
{
// NOTE: the V cache is not transposed when using flash attention
view_v_src
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
n_embd_v_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
*
move
.
src
));
view_v_dst
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
n_embd_v_gqa
,
move
.
len
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
n_embd_v_gqa
*
move
.
dst
));
}
else
{
view_v_src
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
move
.
len
,
n_embd_v_gqa
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
kv_self
->
size
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
move
.
src
));
view_v_dst
=
ggml_view_2d
(
ctx0
,
kv_self
->
v_l
[
il
],
move
.
len
,
n_embd_v_gqa
,
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
kv_self
->
size
),
ggml_row_size
(
kv_self
->
v_l
[
il
]
->
type
,
move
.
dst
));
}
ggml_build_forward_expand
(
gf
,
ggml_cpy
(
ctx0
,
view_k_src
,
view_k_dst
));
ggml_build_forward_expand
(
gf
,
ggml_cpy
(
ctx0
,
view_v_src
,
view_v_dst
));
}
}
#endif
return
res
;
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
return
kv_self
;
}
void
llama_context
::
kv_self_update
()
{
auto
&
kv
=
kv_self
;
if
(
kv
->
has_shift
)
{
if
(
!
kv
->
get_can_shift
())
{
GGML_ABORT
(
"The current context does not support K-shift"
);
}
LLAMA_LOG_DEBUG
(
"%s: applying K-shift
\n
"
,
__func__
);
// apply K-shift if needed
if
(
model
.
hparams
.
rope_type
!=
LLAMA_ROPE_TYPE_NONE
)
{
ggml_backend_sched_reset
(
sched
.
get
());
auto
*
gf
=
graph_init
();
auto
res
=
build_kv_self_shift
(
ctx_compute
.
get
(),
gf
);
bool
need_reserve
=
false
;
ggml_backend_sched_alloc_graph
(
sched
.
get
()
,
gf
);
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
res
->
set_inputs
(
nullptr
);
need_reserve
=
kv_self
->
update
(
*
this
);
graph_compute
(
gf
,
false
);
}
// reserve a worst case graph if needed
if
(
need_reserve
)
{
LLAMA_LOG_DEBUG
(
"%s: reserving a worst case graph
\n
"
,
__func__
);
{
kv
->
has_shift
=
false
;
// build worst-case graph
uint32_t
n_seqs
=
1
;
// TODO: worst-case number of sequences
uint32_t
n_tokens
=
std
::
min
(
cparams
.
n_ctx
,
cparams
.
n_ubatch
);
for
(
uint32_t
i
=
0
;
i
<
kv
->
size
;
++
i
)
{
kv
->
cells
[
i
].
delta
=
0
;
}
}
}
// simulate full KV cache
kv_self
->
set_full
();
// defragment the KV cache if needed
if
(
kv
->
do_defrag
)
{
LLAMA_LOG_DEBUG
(
"%s: defragmenting KV cache
\n
"
,
__func__
);
const
uint32_t
n_max_nodes
=
graph_max_nodes
();
const
uint32_t
max_moves
=
(
n_max_nodes
-
2
*
model
.
hparams
.
n_layer
)
/
(
6
*
model
.
hparams
.
n_layer
);
if
(
!
kv
->
defrag_prepare
(
n_max_nodes
))
{
LLAMA_LOG_ERROR
(
"%s: failed to prepare defragmentation
\n
"
,
__func__
);
return
;
}
llama_token
token
=
model
.
vocab
.
token_bos
();
// not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_ubatch
ubatch
=
{
true
,
n_tokens
,
n_tokens
/
n_seqs
,
n_seqs
,
&
token
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
};
for
(
std
::
size_t
i
=
0
;
i
<
kv_self
->
defrag_info
.
moves
.
size
();
i
+=
max_moves
)
{
std
::
vector
<
struct
llama_kv_defrag_move
>
chunk
;
auto
end
=
std
::
min
(
i
+
max_moves
,
kv_self
->
defrag_info
.
moves
.
size
());
chunk
.
assign
(
kv_self
->
defrag_info
.
moves
.
begin
()
+
i
,
kv_self
->
defrag_info
.
moves
.
begin
()
+
end
);
auto
*
gf
=
graph_init
();
graph_build
(
ctx_compute
.
get
(),
gf
,
ubatch
,
LLM_GRAPH_TYPE_DEFAULT
);
ggml_backend_sched_reset
(
sched
.
get
());
auto
*
gf
=
graph_init
();
auto
res
=
build_kv_self_defrag
(
ctx_compute
.
get
(),
gf
,
chunk
);
ggml_backend_sched_alloc_graph
(
sched
.
get
(),
gf
);
res
->
set_inputs
(
nullptr
);
graph_compute
(
gf
,
false
);
// initialize scheduler with the worst-case graph
ggml_backend_sched_reset
(
sched
.
get
());
if
(
!
ggml_backend_sched_reserve
(
sched
.
get
(),
gf
))
{
LLAMA_LOG_ERROR
(
"%s: failed to allocate compute buffers
\n
"
,
__func__
);
}
kv
->
do_defrag
=
false
;
}
}
...
...
@@ -776,9 +484,6 @@ enum llama_pooling_type llama_context::pooling_type() const {
}
float
*
llama_context
::
get_logits
()
{
// reorder logits for backward compatibility
output_reorder
();
return
logits
;
}
...
...
@@ -809,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw
std
::
runtime_error
(
format
(
"corrupt output buffer (j=%d, n_outputs=%d)"
,
j
,
n_outputs
));
}
return
logits
+
j
*
model
.
hparams
.
n_vocab
;
return
logits
+
j
*
model
.
vocab
.
n_tokens
()
;
}
catch
(
const
std
::
exception
&
err
)
{
LLAMA_LOG_ERROR
(
"%s: invalid logits id %d, reason: %s
\n
"
,
__func__
,
i
,
err
.
what
());
#ifndef NDEBUG
...
...
@@ -821,9 +526,6 @@ float * llama_context::get_logits_ith(int32_t i) {
}
float
*
llama_context
::
get_embeddings
()
{
// reorder embeddings for backward compatibility
output_reorder
();
return
embd
;
}
...
...
@@ -930,10 +632,6 @@ void llama_context::set_warmup(bool value) {
cparams
.
warmup
=
value
;
}
void
llama_context
::
set_cross_attn
(
bool
value
)
{
cparams
.
cross_attn
=
value
;
}
void
llama_context
::
set_adapter_lora
(
llama_adapter_lora
*
adapter
,
float
scale
)
{
...
...
@@ -979,8 +677,8 @@ int llama_context::encode(llama_batch & inp_batch) {
}
// temporary allocate memory for the input batch if needed
//
TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
kv_self
->
pos_max
()
+
1
);
//
note: during encode, we always pass the full sequence starting from pos = 0
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
0
);
const
llama_batch
&
batch
=
batch_allocr
.
batch
;
const
int32_t
n_tokens
=
batch
.
n_tokens
;
...
...
@@ -1005,11 +703,13 @@ int llama_context::encode(llama_batch & inp_batch) {
t_compute_start_us
=
ggml_time_us
();
}
embd_seq
.
clear
();
n_queued_tokens
+=
n_tokens
;
const
int64_t
n_embd
=
hparams
.
n_embd
;
sbatch
.
from_
batch
(
batch
,
batch
.
n_embd
,
/* simple_split */
true
,
/* logits_all */
true
);
llama_sbatch
sbatch
=
llama_s
batch
(
batch
,
n_embd
,
/* simple_split */
true
,
/* logits_all */
true
);
const
llama_ubatch
ubatch
=
sbatch
.
split_simple
(
n_tokens
);
...
...
@@ -1066,12 +766,12 @@ int llama_context::encode(llama_batch & inp_batch) {
ggml_backend_t
backend_embd
=
ggml_backend_sched_get_tensor_backend
(
sched
.
get
(),
t_embd
);
GGML_ASSERT
(
backend_embd
!=
nullptr
);
GGML_ASSERT
(
embd
!=
nullptr
);
switch
(
cparams
.
pooling_type
)
{
case
LLAMA_POOLING_TYPE_NONE
:
{
// extract token embeddings
GGML_ASSERT
(
embd
!=
nullptr
);
GGML_ASSERT
(
n_tokens
*
n_embd
<=
(
int64_t
)
embd_size
);
ggml_backend_tensor_get_async
(
backend_embd
,
t_embd
,
embd
,
0
,
n_tokens
*
n_embd
*
sizeof
(
float
));
}
break
;
...
...
@@ -1096,11 +796,18 @@ int llama_context::encode(llama_batch & inp_batch) {
}
break
;
case
LLAMA_POOLING_TYPE_RANK
:
{
// TODO: this likely should be the same logic as in llama_decoder_internal, but better to
// wait for an encoder model that requires this pooling type in order to test it
// https://github.com/ggerganov/llama.cpp/pull/9510
GGML_ABORT
(
"RANK pooling not implemented yet"
);
}
// extract the rerank score - a single float per sequence
auto
&
embd_seq_out
=
embd_seq
;
for
(
uint32_t
s
=
0
;
s
<
ubatch
.
n_seqs
;
++
s
)
{
const
llama_seq_id
seq_id
=
ubatch
.
seq_id
[
s
][
0
];
if
(
embd_seq_out
.
find
(
seq_id
)
!=
embd_seq_out
.
end
())
{
continue
;
}
embd_seq_out
[
seq_id
].
resize
(
1
);
ggml_backend_tensor_get_async
(
backend_embd
,
t_embd
,
embd_seq_out
[
seq_id
].
data
(),
(
seq_id
)
*
sizeof
(
float
),
sizeof
(
float
));
}
}
break
;
case
LLAMA_POOLING_TYPE_UNSPECIFIED
:
{
GGML_ABORT
(
"unknown pooling type"
);
...
...
@@ -1138,25 +845,33 @@ int llama_context::encode(llama_batch & inp_batch) {
}
int
llama_context
::
decode
(
llama_batch
&
inp_batch
)
{
if
(
!
memory
)
{
LLAMA_LOG_WARN
(
"%s: cannot decode batches with this context (use llama_encode() instead)
\n
"
,
__func__
);
return
encode
(
inp_batch
);
}
if
(
inp_batch
.
n_tokens
==
0
)
{
LLAMA_LOG_ERROR
(
"%s: n_tokens == 0
\n
"
,
__func__
);
return
-
1
;
}
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
// temporary allocate memory for the input batch if needed
// TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
kv_self
->
pos_max
()
+
1
);
// TODO: this is incorrect for multiple sequences because
get_
pos_max() is the maximum across all sequences
llama_batch_allocr
batch_allocr
(
inp_batch
,
inp_batch
.
pos
?
-
1
:
kv_self
->
get_
pos_max
()
+
1
);
const
llama_batch
&
batch
=
batch_allocr
.
batch
;
const
auto
&
vocab
=
model
.
vocab
;
const
auto
&
hparams
=
model
.
hparams
;
const
int32_t
n_vocab
=
hparams
.
n_vocab
;
const
int32_t
n_vocab
=
vocab
.
n_tokens
()
;
const
int64_t
n_tokens_all
=
batch
.
n_tokens
;
const
int64_t
n_embd
=
hparams
.
n_embd
;
llama_kv_cache_guard
kv_guard
(
kv_self
.
get
()
);
llama_kv_cache_guard
kv_guard
(
kv_self
);
GGML_ASSERT
((
!
batch
.
token
&&
batch
.
embd
)
||
(
batch
.
token
&&
!
batch
.
embd
));
// NOLINT
...
...
@@ -1190,18 +905,14 @@ int llama_context::decode(llama_batch & inp_batch) {
for
(
uint32_t
i
=
0
;
i
<
n_tokens_all
;
++
i
)
{
n_outputs_all
+=
batch
.
logits
[
i
]
!=
0
;
}
}
else
if
(
logits_all
||
embd_pooled
)
{
}
else
if
(
embd_pooled
)
{
n_outputs_all
=
n_tokens_all
;
}
else
{
// keep last output only
n_outputs_all
=
1
;
}
const
bool
logits_all
=
n_outputs_all
==
n_tokens_all
;
sbatch
.
from_batch
(
batch
,
batch
.
n_embd
,
/* simple_split */
!
kv_self
->
recurrent
,
/* logits_all */
logits_all
);
llama_sbatch
sbatch
=
kv_self
->
sbatch_init
(
batch
,
/* logits_all */
n_outputs_all
==
n_tokens_all
);
// reserve output buffer
if
(
output_reserve
(
n_outputs_all
)
<
n_outputs_all
)
{
...
...
@@ -1215,22 +926,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t
n_outputs_prev
=
0
;
while
(
sbatch
.
n_tokens
>
0
)
{
llama_ubatch
ubatch
=
llama_ubatch
();
const
auto
&
n_ubatch
=
cparams
.
n_ubatch
;
if
(
kv_self
->
recurrent
)
{
if
(
embd_pooled
)
{
// Pooled embeddings cannot be split across ubatches (yet)
ubatch
=
sbatch
.
split_seq
(
cparams
.
n_ubatch
);
}
else
{
// recurrent model architectures are easier to implement
// with equal-length sequences
ubatch
=
sbatch
.
split_equal
(
cparams
.
n_ubatch
);
}
}
else
{
ubatch
=
sbatch
.
split_simple
(
n_ubatch
);
}
llama_ubatch
ubatch
=
kv_self
->
ubatch_next
(
sbatch
,
cparams
.
n_ubatch
,
embd_pooled
);
// count the outputs in this u_batch
{
...
...
@@ -1250,27 +946,15 @@ int llama_context::decode(llama_batch & inp_batch) {
}
// find KV slot
{
if
(
!
kv_self
->
find_slot
(
ubatch
))
{
kv_self
->
defrag_sched
(
-
1.0
f
);
kv_self
->
update
(
*
this
);
if
(
!
kv_self
->
find_slot
(
ubatch
))
{
kv_self
->
defrag
();
kv_self_update
();
if
(
!
kv_self
->
find_slot
(
ubatch
))
{
LLAMA_LOG_WARN
(
"%s: failed to find KV cache slot for ubatch of size %d
\n
"
,
__func__
,
ubatch
.
n_tokens
);
return
1
;
}
}
if
(
!
kv_self
->
recurrent
)
{
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
const
uint32_t
pad
=
kv_self
->
get_padding
(
cparams
);
kv_self
->
n
=
std
::
min
(
kv_self
->
size
,
std
::
max
(
pad
,
GGML_PAD
(
kv_self
->
cell_max
(),
pad
)));
LLAMA_LOG_WARN
(
"%s: failed to find KV cache slot for ubatch of size %d
\n
"
,
__func__
,
ubatch
.
n_tokens
);
return
1
;
}
}
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head);
ggml_backend_sched_reset
(
sched
.
get
());
ggml_backend_sched_set_eval_callback
(
sched
.
get
(),
cparams
.
cb_eval
,
cparams
.
cb_eval_user_data
);
...
...
@@ -1384,43 +1068,68 @@ int llama_context::decode(llama_batch & inp_batch) {
// finalize the batch processing
kv_guard
.
commit
();
// set to total number of outputs in the batch, for use in llama_get_logits_ith
n_outputs
=
n_outputs_all
;
// set output mappings
{
bool
sorted_output
=
true
;
GGML_ASSERT
(
sbatch
.
out_ids
.
size
()
==
(
size_t
)
n_outputs_all
);
auto
&
out_ids
=
sbatch
.
out_ids
;
GGML_ASSERT
(
out_ids
.
size
()
==
(
size_t
)
n_outputs_all
);
for
(
int64_t
i
=
0
;
i
<
n_outputs_all
;
++
i
)
{
int64_t
out_id
=
sbatch
.
out_ids
[
i
];
int64_t
out_id
=
out_ids
[
i
];
output_ids
[
out_id
]
=
i
;
if
(
out_id
!=
i
)
{
sorted_output
=
false
;
}
}
if
(
sorted_output
)
{
sbatch
.
out_ids
.
clear
();
// make the outputs have the same order they had in the user-provided batch
// note: this is mostly relevant for recurrent models atm
if
(
!
sorted_output
)
{
const
uint32_t
n_vocab
=
model
.
vocab
.
n_tokens
();
const
uint32_t
n_embd
=
model
.
hparams
.
n_embd
;
GGML_ASSERT
((
size_t
)
n_outputs
==
out_ids
.
size
());
// TODO: is there something more efficient which also minimizes swaps?
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
for
(
int32_t
i
=
0
;
i
<
n_outputs
-
1
;
++
i
)
{
int32_t
j_min
=
i
;
for
(
int32_t
j
=
i
+
1
;
j
<
n_outputs
;
++
j
)
{
if
(
out_ids
[
j
]
<
out_ids
[
j_min
])
{
j_min
=
j
;
}
}
if
(
j_min
==
i
)
{
continue
;
}
std
::
swap
(
out_ids
[
i
],
out_ids
[
j_min
]);
if
(
logits_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_vocab
;
k
++
)
{
std
::
swap
(
logits
[
i
*
n_vocab
+
k
],
logits
[
j_min
*
n_vocab
+
k
]);
}
}
if
(
embd_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_embd
;
k
++
)
{
std
::
swap
(
embd
[
i
*
n_embd
+
k
],
embd
[
j_min
*
n_embd
+
k
]);
}
}
}
std
::
fill
(
output_ids
.
begin
(),
output_ids
.
end
(),
-
1
);
for
(
int32_t
i
=
0
;
i
<
n_outputs
;
++
i
)
{
output_ids
[
out_ids
[
i
]]
=
i
;
}
}
}
// set to total number of outputs in the batch, for use in llama_get_logits_ith
n_outputs
=
n_outputs_all
;
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();
// decide if we need to defrag the kv cache
if
(
cparams
.
causal_attn
&&
cparams
.
defrag_thold
>
0.0
f
)
{
// - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
const
float
fragmentation
=
kv_self
->
n
>=
2048
?
std
::
max
(
0.0
f
,
1.0
f
-
float
(
kv_self
->
used
+
kv_self
->
get_padding
(
cparams
))
/
float
(
kv_self
->
n
))
:
0.0
f
;
// queue defragmentation for next llama_kv_cache_update
if
(
fragmentation
>
cparams
.
defrag_thold
)
{
LLAMA_LOG_DEBUG
(
"%s: fragmentation: %.2f - requesting defrag
\n
"
,
__func__
,
fragmentation
);
kv_self
->
defrag
();
}
if
(
cparams
.
defrag_thold
>
0.0
f
)
{
kv_self
->
defrag_sched
(
cparams
.
defrag_thold
);
}
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
...
...
@@ -1436,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t
llama_context
::
output_reserve
(
int32_t
n_outputs
)
{
const
auto
&
hparams
=
model
.
hparams
;
const
auto
&
vocab
=
model
.
vocab
;
const
int64_t
n_outputs_max
=
std
::
max
<
int64_t
>
(
n_outputs
,
n_seq_max
());
const
auto
n_batch
=
cparams
.
n_batch
;
const
auto
n_vocab
=
hparams
.
n_vocab
;
const
auto
n_vocab
=
vocab
.
n_tokens
()
;
const
auto
n_embd
=
hparams
.
n_embd
;
// TODO: use a per-batch flag for logits presence instead
...
...
@@ -1505,44 +1215,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
return
n_outputs_max
;
}
void
llama_context
::
output_reorder
()
{
auto
&
out_ids
=
sbatch
.
out_ids
;
if
(
!
out_ids
.
empty
())
{
const
uint32_t
n_vocab
=
model
.
hparams
.
n_vocab
;
const
uint32_t
n_embd
=
model
.
hparams
.
n_embd
;
GGML_ASSERT
((
size_t
)
n_outputs
==
out_ids
.
size
());
// TODO: is there something more efficient which also minimizes swaps?
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
for
(
int32_t
i
=
0
;
i
<
n_outputs
-
1
;
++
i
)
{
int32_t
j_min
=
i
;
for
(
int32_t
j
=
i
+
1
;
j
<
n_outputs
;
++
j
)
{
if
(
out_ids
[
j
]
<
out_ids
[
j_min
])
{
j_min
=
j
;
}
}
if
(
j_min
==
i
)
{
continue
;
}
std
::
swap
(
out_ids
[
i
],
out_ids
[
j_min
]);
if
(
logits_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_vocab
;
k
++
)
{
std
::
swap
(
logits
[
i
*
n_vocab
+
k
],
logits
[
j_min
*
n_vocab
+
k
]);
}
}
if
(
embd_size
>
0
)
{
for
(
uint32_t
k
=
0
;
k
<
n_embd
;
k
++
)
{
std
::
swap
(
embd
[
i
*
n_embd
+
k
],
embd
[
j_min
*
n_embd
+
k
]);
}
}
}
std
::
fill
(
output_ids
.
begin
(),
output_ids
.
end
(),
-
1
);
for
(
int32_t
i
=
0
;
i
<
n_outputs
;
++
i
)
{
output_ids
[
out_ids
[
i
]]
=
i
;
}
out_ids
.
clear
();
}
}
//
// graph
//
...
...
@@ -1579,7 +1251,7 @@ llm_graph_result_ptr llama_context::graph_build(
/*.backend_cpu =*/
backend_cpu
,
/*.cvec =*/
&
cvec
,
/*.loras =*/
&
loras
,
/*.memory =*/
kv_self
.
get
(),
/*.memory =*/
memory
.
get
(),
/*.cross =*/
&
cross
,
/*.n_outputs =*/
n_outputs
,
/*.cb =*/
graph_get_cb
(),
...
...
@@ -1983,8 +1655,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG
(
"%s: - writing output ids
\n
"
,
__func__
);
output_reorder
();
const
auto
n_outputs
=
this
->
n_outputs
;
const
auto
&
output_ids
=
this
->
output_ids
;
...
...
@@ -2015,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG
(
"%s: - writing logits
\n
"
,
__func__
);
const
uint64_t
logits_size
=
std
::
min
((
uint64_t
)
this
->
logits_size
,
(
uint64_t
)
n_outputs
*
model
.
hparams
.
n_vocab
);
const
uint64_t
logits_size
=
std
::
min
((
uint64_t
)
this
->
logits_size
,
(
uint64_t
)
n_outputs
*
model
.
vocab
.
n_tokens
()
);
io
.
write
(
&
logits_size
,
sizeof
(
logits_size
));
...
...
@@ -2038,6 +1708,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
LLAMA_LOG_DEBUG
(
"%s: - writing KV self
\n
"
,
__func__
);
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
state_write
(
io
);
return
io
.
n_bytes
();
...
...
@@ -2121,8 +1793,13 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
}
LLAMA_LOG_DEBUG
(
"%s: - reading KV self
\n
"
,
__func__
);
kv_self
->
state_read
(
io
);
if
(
memory
)
{
LLAMA_LOG_DEBUG
(
"%s: - reading KV self
\n
"
,
__func__
);
llama_kv_cache
*
kv_self
=
static_cast
<
llama_kv_cache
*>
(
memory
.
get
());
kv_self
->
state_read
(
io
);
}
return
io
.
n_bytes
();
}
...
...
@@ -2130,7 +1807,11 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
// Write the state of sequence `seq_id` to `io` and return io.n_bytes().
// The state is written exactly once, through the memory object, and only when one is present.
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
    if (memory) {
        llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

        kv_self->state_write(io, seq_id);
    }

    return io.n_bytes();
}
...
...
@@ -2138,7 +1819,11 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s
// Read the state of sequence `seq_id` from `io` and return io.n_bytes().
// The state is read exactly once, through the memory object, and only when one is present.
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
    if (memory) {
        llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

        kv_self->state_read(io, seq_id);
    }

    return io.n_bytes();
}
...
...
@@ -2166,6 +1851,218 @@ void llama_context::perf_reset() {
t_p_eval_us
=
n_p_eval
=
0
;
}
//
// training
//
// Mark `tensor` as a trainable parameter via ggml_set_param(), unless it is filtered out.
// A tensor is skipped when it is null, is not F32, is rejected by `param_filter(tensor, userdata)`,
// or is named "token_embd.weight" / "rope_freqs.weight".
static void llama_set_param(struct ggml_tensor * tensor, llama_opt_param_filter param_filter, void * userdata) {
    const bool is_f32_tensor = tensor && tensor->type == GGML_TYPE_F32;
    if (!is_f32_tensor) {
        return;
    }

    // the user filter runs before the name-based exclusions
    if (!param_filter(tensor, userdata)) {
        return;
    }

    // FIXME: these two tensors are excluded by name
    const bool excluded_by_name =
        strcmp(tensor->name, "token_embd.weight") == 0 ||
        strcmp(tensor->name, "rope_freqs.weight") == 0;
    if (excluded_by_name) {
        return;
    }

    ggml_set_param(tensor);
}
void
llama_context
::
opt_init
(
struct
llama_model
*
model
,
struct
llama_opt_params
lopt_params
)
{
GGML_ASSERT
(
!
opt_ctx
);
model
->
hparams
.
n_ctx_train
=
lopt_params
.
n_ctx_train
>
0
?
lopt_params
.
n_ctx_train
:
n_ctx
();
const
uint32_t
n_batch
=
std
::
min
(
this
->
n_batch
(),
model
->
hparams
.
n_ctx_train
);
const
uint32_t
n_ubatch
=
std
::
min
(
this
->
n_ubatch
(),
n_batch
);
GGML_ASSERT
(
model
->
hparams
.
n_ctx_train
%
n_batch
==
0
);
GGML_ASSERT
(
n_batch
%
n_ubatch
==
0
);
ggml_opt_params
opt_params
=
ggml_opt_default_params
(
sched
.
get
(),
GGML_OPT_LOSS_TYPE_CROSS_ENTROPY
);
opt_params
.
opt_period
=
n_batch
/
n_ubatch
;
opt_params
.
get_opt_pars
=
lopt_params
.
get_opt_pars
;
opt_params
.
get_opt_pars_ud
=
lopt_params
.
get_opt_pars_ud
;
opt_ctx
=
ggml_opt_init
(
opt_params
);
llama_opt_param_filter
param_filter
=
lopt_params
.
param_filter
;
void
*
param_filter_ud
=
lopt_params
.
param_filter_ud
;
//llama_set_param(model->tok_embd, param_filter, param_filter_ud); // FIXME
llama_set_param
(
model
->
type_embd
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
pos_embd
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
tok_norm
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
tok_norm_b
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
output_norm
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
output_norm_b
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
output
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
output_b
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
output_norm_enc
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
cls
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
cls_b
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
cls_out
,
param_filter
,
param_filter_ud
);
llama_set_param
(
model
->
cls_out_b
,
param_filter
,
param_filter_ud
);
for
(
struct
llama_layer
&
layer
:
model
->
layers
)
{
for
(
size_t
i
=
0
;
i
<
sizeof
(
layer
)
/
sizeof
(
struct
ggml_tensor
*
);
++
i
)
{
llama_set_param
(
reinterpret_cast
<
struct
ggml_tensor
**>
(
&
layer
)[
i
],
param_filter
,
param_filter_ud
);
}
}
}
// Run one pass of the optimizer over a single context worth of data.
//
// dataset, result  - forwarded to ggml_opt_eval() / the callback
// tokens           - input tokens, indexed by absolute position in the training context
// labels_sparse    - for each position, the index of the target token
// batch            - caller-owned scratch batch; it is filled with n_batch tokens per step
// callback         - optional; invoked after every micro-batch
// train            - forwarded to ggml_opt_alloc() and to the callback
// idata_in_loop, ndata_in_loop, t_loop_start - progress information forwarded to the callback
//
// The KV cache is cleared first. The training context (llama_model_n_ctx_train) is processed in
// steps of n_batch tokens, each of which is split into micro-batches of n_ubatch tokens. For every
// micro-batch a graph is built, the label tensor is zeroed and a 1.0 is stored at each position's
// target index, and ggml_opt_eval() is run.
//
// All reads of `tokens` and `labels_sparse` are bounds-checked, and labels must lie in
// [0, labels->ne[0]); violations abort via GGML_ASSERT instead of reading out of range.
void llama_context::opt_epoch_iter(
        ggml_opt_dataset_t               dataset,
        ggml_opt_result_t                result,
        const std::vector<llama_token> & tokens,
        const std::vector<llama_token> & labels_sparse,
        llama_batch                    & batch,
        ggml_opt_epoch_callback          callback,
        bool                             train,
        int64_t                          idata_in_loop,
        int64_t                          ndata_in_loop,
        int64_t                          t_loop_start) {
    GGML_ASSERT(opt_ctx);
    const uint32_t n_ctx    = llama_model_n_ctx_train(&model);
    const uint32_t n_batch  = std::min(this->n_batch(),  n_ctx);
    const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch);

    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

    kv_self->clear();
    llama_kv_cache_guard kv_guard(kv_self);

    for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) {
        // the whole [pos_ctx, pos_ctx + n_batch) window is read from `tokens` below
        GGML_ASSERT((size_t) pos_ctx + n_batch <= tokens.size());

        batch.n_tokens = n_batch;
        for (uint32_t pos_batch = 0; pos_batch < n_batch; ++pos_batch) {
            batch.token   [pos_batch]    = tokens[pos_ctx + pos_batch];
            batch.pos     [pos_batch]    = pos_ctx + pos_batch;
            batch.n_seq_id[pos_batch]    = 1;
            batch.seq_id  [pos_batch][0] = 0;
            batch.logits  [pos_batch]    = true;
        }

        const auto n_tokens_all = batch.n_tokens;

        n_queued_tokens += n_tokens_all;

        // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;

        embd_seq.clear();

        int64_t n_outputs_all = n_tokens_all;

        llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true);

        // reserve output buffer
        if (output_reserve(n_outputs_all) < n_outputs_all) {
            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
            GGML_ABORT("TODO: handle this error");
        }

        for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) {
            llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);

            n_outputs = ubatch.n_tokens;

            // TODO: not sure if this is needed
            if (!kv_self->find_slot(ubatch)) {
                // no slot available: force a defrag, apply it, and retry once
                kv_self->defrag_sched(-1.0f);
                kv_self->update(*this);
                if (!kv_self->find_slot(ubatch)) {
                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
                    GGML_ABORT("TODO: handle this error");
                }
            }

            auto * gf = graph_init();
            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);

            // metadata-only context (no_alloc) handed to the optimizer; freed at the end of this iteration
            struct ggml_context * ctx_compute_opt;
            {
                const size_t size_gf = ggml_graph_size(gf);
                const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
                struct ggml_init_params params = {
                    /*.mem_size   =*/ size_meta,
                    /*.mem_buffer =*/ nullptr,
                    /*.no_alloc   =*/ true,
                };
                ctx_compute_opt = ggml_init(params);
            }
            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
            ggml_opt_alloc(opt_ctx, train);
            res->set_inputs(&ubatch);
            {
                struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
                GGML_ASSERT(labels->ne[1] == n_ubatch);
                ggml_set_zero(labels);
                const float onef = 1.0f;
                for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) {
                    const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch;
                    GGML_ASSERT(ilabel < labels_sparse.size());

                    // the label is used as an element offset into row pos_ubatch, so it must be in [0, ne[0])
                    const llama_token label = labels_sparse[ilabel];
                    GGML_ASSERT(label >= 0);
                    GGML_ASSERT(label < labels->ne[0]);
                    ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + label)*sizeof(float), sizeof(float));
                }
            }
            ggml_opt_eval(opt_ctx, result);
            if (callback) {
                callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
            }
            ggml_free(ctx_compute_opt);
        }
    }

    kv_guard.commit();
}
// Run one epoch over `dataset`: datapoints [0, idata_split) are processed with train == true and
// reported to result_train / callback_train, datapoints [idata_split, ndata) with train == false
// and reported to result_eval / callback_eval. idata_split must lie in [0, ndata] (asserted).
// Each datapoint is fetched into host buffers of n_ctx tokens/labels and handed to opt_epoch_iter().
void llama_context::opt_epoch(
        ggml_opt_dataset_t      dataset,
        ggml_opt_result_t       result_train,
        ggml_opt_result_t       result_eval,
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
    const uint32_t n_ctx    = this->n_ctx();
    const uint32_t n_batch  = std::min(cparams.n_batch,  n_ctx);
    const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
    const int64_t  ndata    = ggml_opt_dataset_ndata(dataset);

    GGML_ASSERT(idata_split >= 0);
    GGML_ASSERT(idata_split <= ndata);

    const uint32_t ubatch_per_ctx = n_ctx / n_ubatch;

    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
    std::vector<llama_token> tokens(n_ctx);
    std::vector<llama_token> labels_sparse(n_ctx);

    // Process datapoints [first, last) as one timed loop; progress counters are relative to `first`.
    const auto run_split = [&](int64_t first, int64_t last, ggml_opt_result_t result,
                               ggml_opt_epoch_callback callback, bool train) {
        const int64_t t_loop_start  = ggml_time_us();
        const int64_t ndata_in_loop = (last - first)*ubatch_per_ctx;

        for (int64_t idata = first; idata < last; ++idata) {
            const int64_t idata_in_loop = (idata - first)*ubatch_per_ctx;

            ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
            opt_epoch_iter(dataset, result, tokens, labels_sparse, batch,
                callback, train, idata_in_loop, ndata_in_loop, t_loop_start);
        }
    };

    run_split(0,           idata_split, result_train, callback_train, /*train =*/ true);
    run_split(idata_split, ndata,       result_eval,  callback_eval,  /*train =*/ false);

    llama_batch_free(batch);
}
//
// interface implementation
//
...
...
@@ -2193,14 +2090,13 @@ llama_context_params llama_context_default_params() {
/*.cb_eval_user_data =*/
nullptr
,
/*.type_k =*/
GGML_TYPE_F16
,
/*.type_v =*/
GGML_TYPE_F16
,
/*.logits_all =*/
false
,
/*.abort_callback =*/
nullptr
,
/*.abort_callback_data =*/
nullptr
,
/*.embeddings =*/
false
,
/*.offload_kqv =*/
true
,
/*.flash_attn =*/
false
,
/*.no_perf =*/
true
,
/*.cross_attn =*/
false
,
/*.abort_callback =*/
nullptr
,
/*.abort_callback_data =*/
nullptr
,
/*.op_offload =*/
true
,
};
return
result
;
...
...
@@ -2326,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx
->
set_warmup
(
warmup
);
}
// Forward the cross-attention flag to the context via llama_context::set_cross_attn().
void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
    llama_context & context = *ctx;
    context.set_cross_attn(cross_attention);
}
// Forward to llama_context::synchronize() on the given context.
void llama_synchronize(llama_context * ctx) {
    llama_context & context = *ctx;
    context.synchronize();
}
...
...
@@ -2498,7 +2390,7 @@ void llama_kv_cache_seq_cp(
llama_seq_id
seq_id_dst
,
llama_pos
p0
,
llama_pos
p1
)
{
return
llama_kv_self_seq_cp
(
ctx
,
seq_id_src
,
seq_id_dst
,
p0
,
p1
);
llama_kv_self_seq_cp
(
ctx
,
seq_id_src
,
seq_id_dst
,
p0
,
p1
);
}
void
llama_kv_self_seq_cp
(
...
...
@@ -2512,14 +2404,14 @@ void llama_kv_self_seq_cp(
return
;
}
return
kv
->
seq_cp
(
seq_id_src
,
seq_id_dst
,
p0
,
p1
);
kv
->
seq_cp
(
seq_id_src
,
seq_id_dst
,
p0
,
p1
);
}
// deprecated
// Forwards to llama_kv_self_seq_keep() with the same arguments.
void llama_kv_cache_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
    llama_kv_self_seq_keep(ctx, seq_id);
}
void
llama_kv_self_seq_keep
(
llama_context
*
ctx
,
llama_seq_id
seq_id
)
{
...
...
@@ -2528,7 +2420,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
return
;
}
return
kv
->
seq_keep
(
seq_id
);
kv
->
seq_keep
(
seq_id
);
}
// deprecated
...
...
@@ -2538,7 +2430,7 @@ void llama_kv_cache_seq_add(
llama_pos
p0
,
llama_pos
p1
,
llama_pos
delta
)
{
return
llama_kv_self_seq_add
(
ctx
,
seq_id
,
p0
,
p1
,
delta
);
llama_kv_self_seq_add
(
ctx
,
seq_id
,
p0
,
p1
,
delta
);
}
void
llama_kv_self_seq_add
(
...
...
@@ -2552,7 +2444,7 @@ void llama_kv_self_seq_add(
return
;
}
return
kv
->
seq_add
(
seq_id
,
p0
,
p1
,
delta
);
kv
->
seq_add
(
seq_id
,
p0
,
p1
,
delta
);
}
// deprecated
...
...
@@ -2562,7 +2454,7 @@ void llama_kv_cache_seq_div(
llama_pos
p0
,
llama_pos
p1
,
int
d
)
{
return
llama_kv_self_seq_div
(
ctx
,
seq_id
,
p0
,
p1
,
d
);
llama_kv_self_seq_div
(
ctx
,
seq_id
,
p0
,
p1
,
d
);
}
void
llama_kv_self_seq_div
(
...
...
@@ -2576,7 +2468,7 @@ void llama_kv_self_seq_div(
return
;
}
return
kv
->
seq_div
(
seq_id
,
p0
,
p1
,
d
);
kv
->
seq_div
(
seq_id
,
p0
,
p1
,
d
);
}
// deprecated
...
...
@@ -2595,7 +2487,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
// deprecated
// Forwards to llama_kv_self_defrag() with the same argument.
void llama_kv_cache_defrag(llama_context * ctx) {
    llama_kv_self_defrag(ctx);
}
void
llama_kv_self_defrag
(
llama_context
*
ctx
)
{
...
...
@@ -2604,7 +2496,8 @@ void llama_kv_self_defrag(llama_context * ctx) {
return
;
}
return
kv
->
defrag
();
// force defrag
kv
->
defrag_sched
(
-
1.0
f
);
}
// deprecated
...
...
@@ -2788,3 +2681,34 @@ void llama_perf_context_print(const llama_context * ctx) {
// Forward to llama_context::perf_reset() on the given context.
void llama_perf_context_reset(llama_context * ctx) {
    llama_context & context = *ctx;
    context.perf_reset();
}
//
// training
//
// Parameter filter that accepts every tensor: both arguments are ignored and the result is always true.
bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata) {
    GGML_UNUSED(userdata);
    GGML_UNUSED(tensor);

    const bool accept = true;
    return accept;
}
// Forward to llama_context::opt_init() with the given model and optimizer parameters.
void llama_opt_init(struct llama_context * ctx, struct llama_model * model, struct llama_opt_params lopt_params) {
    llama_context & context = *ctx;
    context.opt_init(model, lopt_params);
}
// Forward to llama_context::opt_epoch(); all arguments are passed through unchanged.
void llama_opt_epoch(
        struct llama_context *  ctx,
        ggml_opt_dataset_t      dataset,
        ggml_opt_result_t       result_train,
        ggml_opt_result_t       result_eval,
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
    llama_context & context = *ctx;
    context.opt_epoch(
        dataset,
        result_train,
        result_eval,
        idata_split,
        callback_train,
        callback_eval);
}
llama/llama.cpp/src/llama-context.h
View file @
b2b270ad
...
...
@@ -8,6 +8,7 @@
#include "llama-kv-cache.h"
#include "ggml-cpp.h"
#include "ggml-opt.h"
#include <map>
#include <vector>
...
...
@@ -28,7 +29,12 @@ struct llama_context {
void
synchronize
();
const
llama_model
&
get_model
()
const
;
const
llama_model
&
get_model
()
const
;
const
llama_cparams
&
get_cparams
()
const
;
ggml_backend_sched_t
get_sched
()
const
;
ggml_context
*
get_ctx_compute
()
const
;
uint32_t
n_ctx
()
const
;
uint32_t
n_ctx_per_seq
()
const
;
...
...
@@ -66,7 +72,6 @@ struct llama_context {
void
set_embeddings
(
bool
value
);
void
set_causal_attn
(
bool
value
);
void
set_warmup
(
bool
value
);
void
set_cross_attn
(
bool
value
);
void
set_adapter_lora
(
llama_adapter_lora
*
adapter
,
...
...
@@ -130,6 +135,32 @@ struct llama_context {
llama_perf_context_data
perf_get_data
()
const
;
void
perf_reset
();
//
// training
//
void
opt_init
(
struct
llama_model
*
model
,
struct
llama_opt_params
lopt_params
);
void
opt_epoch
(
ggml_opt_dataset_t
dataset
,
ggml_opt_result_t
result_train
,
ggml_opt_result_t
result_eval
,
int64_t
idata_split
,
ggml_opt_epoch_callback
callback_train
,
ggml_opt_epoch_callback
callback_eval
);
void
opt_epoch_iter
(
ggml_opt_dataset_t
dataset
,
ggml_opt_result_t
result
,
const
std
::
vector
<
llama_token
>
&
tokens
,
const
std
::
vector
<
llama_token
>
&
labels_sparse
,
llama_batch
&
batch
,
ggml_opt_epoch_callback
callback
,
bool
train
,
int64_t
idata_in_loop
,
int64_t
ndata_in_loop
,
int64_t
t_loop_start
);
private:
//
// output
...
...
@@ -139,50 +170,30 @@ private:
// Returns max number of outputs for which space was reserved.
int32_t
output_reserve
(
int32_t
n_outputs
);
// make the outputs have the same order they had in the user-provided batch
// TODO: maybe remove this
void
output_reorder
();
//
// graph
//
public:
int32_t
graph_max_nodes
()
const
;
// zero-out inputs and create the ctx_compute for the compute graph
ggml_cgraph
*
graph_init
();
// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status
graph_compute
(
ggml_cgraph
*
gf
,
bool
batched
);
private:
llm_graph_result_ptr
graph_build
(
ggml_context
*
ctx
,
ggml_cgraph
*
gf
,
const
llama_ubatch
&
ubatch
,
llm_graph_type
gtype
);
// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status
graph_compute
(
ggml_cgraph
*
gf
,
bool
batched
);
llm_graph_cb
graph_get_cb
()
const
;
// used by kv_self_update()
ggml_tensor
*
build_rope_shift
(
ggml_context
*
ctx0
,
ggml_tensor
*
cur
,
ggml_tensor
*
shift
,
ggml_tensor
*
factors
,
float
freq_base
,
float
freq_scale
)
const
;
llm_graph_result_ptr
build_kv_self_shift
(
ggml_context
*
ctx0
,
ggml_cgraph
*
gf
)
const
;
llm_graph_result_ptr
build_kv_self_defrag
(
ggml_context
*
ctx0
,
ggml_cgraph
*
gf
,
const
std
::
vector
<
struct
llama_kv_defrag_move
>
&
moves
)
const
;
// TODO: read/write lora adapters and cvec
size_t
state_write_data
(
llama_io_write_i
&
io
);
size_t
state_read_data
(
llama_io_read_i
&
io
);
...
...
@@ -199,14 +210,10 @@ private:
llama_cparams
cparams
;
llama_adapter_cvec
cvec
;
llama_adapter_loras
loras
;
llama_sbatch
sbatch
;
llama_cross
cross
;
// TODO: tmp for handling cross-attention - need something better probably
std
::
unique_ptr
<
llama_kv_cache_unified
>
kv_self
;
// TODO: remove
bool
logits_all
=
false
;
std
::
unique_ptr
<
llama_memory_i
>
memory
;
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t
logits_size
=
0
;
// capacity (of floats) for logits
...
...
@@ -233,6 +240,9 @@ private:
ggml_context_ptr
ctx_compute
;
// training
ggml_opt_context_t
opt_ctx
=
nullptr
;
ggml_threadpool_t
threadpool
=
nullptr
;
ggml_threadpool_t
threadpool_batch
=
nullptr
;
...
...
llama/llama.cpp/src/llama-cparams.h
View file @
b2b270ad
...
...
@@ -29,8 +29,8 @@ struct llama_cparams {
bool
offload_kqv
;
bool
flash_attn
;
bool
no_perf
;
bool
cross_attn
;
bool
warmup
;
bool
op_offload
;
enum
llama_pooling_type
pooling_type
;
...
...
llama/llama.cpp/src/llama-graph.cpp
View file @
b2b270ad
...
...
@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for
(
uint32_t
i
=
0
;
i
<
n_kv
;
++
i
)
{
const
uint32_t
cell_id
=
i
+
kv_self
->
head
;
//////////////////////////////////////////////
// TODO: this should not mutate the KV cache !
llama_kv_cell
&
kv_cell
=
const_cast
<
class
llama_kv_cache_unified
*>
(
kv_self
)
->
cells
[
i
];
// prevent out-of-bound sources
if
(
kv_cell
.
src
<
0
||
(
uint32_t
)
kv_cell
.
src
>=
kv_self
->
size
)
{
kv_cell
.
src
=
cell_id
;
}
data
[
i
]
=
kv_cell
.
src
;
// TODO: do not mutate the KV cache
// ensure copy only happens once
if
(
kv_cell
.
src
!=
(
int32_t
)
cell_id
)
{
kv_cell
.
src
=
cell_id
;
}
data
[
i
]
=
kv_self
->
s_copy
(
i
);
}
}
}
...
...
@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
// clear unused states
for
(
int
i
=
0
;
i
<
n_kv
;
++
i
)
{
const
uint32_t
cell_id
=
i
+
kv_self
->
head
;
//////////////////////////////////////////////
// TODO: this should not mutate the KV cache !
llama_kv_cell
&
kv_cell
=
const_cast
<
class
llama_kv_cache_unified
*>
(
kv_self
)
->
cells
[
i
];
data
[
i
]
=
(
float
)
(
kv_cell
.
src
>=
0
);
// only clear once
if
(
kv_cell
.
src
<
0
)
{
kv_cell
.
src
=
cell_id
;
}
data
[
i
]
=
kv_self
->
s_mask
(
i
);
}
}
}
...
...
@@ -560,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
// Copy ubatch->embd into the cross_attn_state tensor (the tensor's full byte size, from offset 0).
// Nothing is copied when the ubatch carries no embd data.
void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
    if (!ubatch->embd) {
        return;
    }

    const size_t n_bytes = ggml_nbytes(cross_attn_state);
    ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, n_bytes);
}
//
// llm_graph_context
//
...
...
@@ -816,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
}
break
;
}
if
(
type_gate
==
LLM_FFN_PAR
)
{
if
(
gate
&&
type_gate
==
LLM_FFN_PAR
)
{
cur
=
ggml_mul
(
ctx0
,
cur
,
tmp
);
cb
(
cur
,
"ffn_gate_par"
,
il
);
}
...
...
@@ -1005,6 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
inp
->
tokens
=
ggml_new_tensor_1d
(
ctx0
,
GGML_TYPE_I32
,
ubatch
.
n_tokens
);
//cb(inp->tokens, "inp_tokens", -1);
ggml_set_input
(
inp
->
tokens
);
res
->
t_tokens
=
inp
->
tokens
;
cur
=
ggml_get_rows
(
ctx0
,
tok_embd
,
inp
->
tokens
);
...
...
@@ -1111,7 +1078,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
}
ggml_tensor
*
llm_graph_context
::
build_inp_s_copy
()
const
{
const
llama_kv_cache_
unified
*
kv_self
=
static_cast
<
const
llama_kv_cache_
unified
*>
(
memory
);
const
llama_kv_cache_
recurrent
*
kv_self
=
static_cast
<
const
llama_kv_cache_
recurrent
*>
(
memory
);
auto
inp
=
std
::
make_unique
<
llm_graph_input_s_copy
>
(
kv_self
);
...
...
@@ -1128,7 +1095,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
}
ggml_tensor
*
llm_graph_context
::
build_inp_s_mask
()
const
{
const
llama_kv_cache_
unified
*
kv_self
=
static_cast
<
const
llama_kv_cache_
unified
*>
(
memory
);
const
llama_kv_cache_
recurrent
*
kv_self
=
static_cast
<
const
llama_kv_cache_
recurrent
*>
(
memory
);
auto
inp
=
std
::
make_unique
<
llm_graph_input_s_mask
>
(
kv_self
);
...
...
@@ -1261,8 +1228,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_flash_attn_ext_set_prec
(
cur
,
GGML_PREC_F32
);
if
(
v_mla
)
{
#if 0
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
// However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
cur = ggml_mul_mat(ctx0, v_mla, cur);
#else
// It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
// The permutations are noops and only change how the tensor data is interpreted.
cur
=
ggml_permute
(
ctx0
,
cur
,
0
,
2
,
1
,
3
);
cur
=
ggml_mul_mat
(
ctx0
,
v_mla
,
cur
);
cur
=
ggml_permute
(
ctx0
,
cur
,
0
,
2
,
1
,
3
);
cur
=
ggml_cont
(
ctx0
,
cur
);
// Needed because ggml_reshape_2d expects contiguous inputs.
#endif
}
cur
=
ggml_reshape_2d
(
ctx0
,
cur
,
cur
->
ne
[
0
]
*
n_head
,
n_tokens
);
...
...
@@ -1442,8 +1420,6 @@ ggml_tensor * llm_graph_context::build_attn(
// store to KV cache
{
GGML_ASSERT
(
!
kv_self
->
recurrent
);
const
auto
kv_head
=
kv_self
->
head
;
GGML_ASSERT
(
kv_self
->
size
==
n_ctx
);
...
...
@@ -1538,25 +1514,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return
(
llm_graph_input_attn_cross
*
)
res
->
add_input
(
std
::
move
(
inp
));
}
// Create the cross-attention state input: an F32 3-D tensor with dimensions (n_embd, 1601, 4),
// flagged as a graph input, reported through cb() as "inp_cross_attn_state" and registered with
// the graph result. Returns the new tensor.
ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
    auto input = std::make_unique<llm_graph_input_cross_attn_state>();

    const int64_t n_embd = hparams.n_embd;

    input->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
    ggml_set_input(input->cross_attn_state);

    // keep the raw pointer: `input` is moved into the result below
    ggml_tensor * state = input->cross_attn_state;
    cb(state, "inp_cross_attn_state", -1);

    res->add_input(std::move(input));

    return state;
}
ggml_tensor
*
llm_graph_context
::
build_attn
(
llm_graph_input_attn_cross
*
inp
,
ggml_cgraph
*
gf
,
...
...
@@ -1612,7 +1569,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
ggml_tensor
*
state_mask
,
int32_t
n_state
,
int32_t
n_seqs
)
const
{
const
llama_kv_cache_
unified
*
kv_self
=
static_cast
<
const
llama_kv_cache_
unified
*>
(
memory
);
const
llama_kv_cache_
recurrent
*
kv_self
=
static_cast
<
const
llama_kv_cache_
recurrent
*>
(
memory
);
const
auto
n_kv
=
kv_self
->
n
;
const
auto
kv_head
=
kv_self
->
head
;
...
...
@@ -1644,7 +1601,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
ggml_tensor
*
state_mask
,
const
llama_ubatch
&
ubatch
,
int
il
)
const
{
const
llama_kv_cache_
unified
*
kv_self
=
static_cast
<
const
llama_kv_cache_
unified
*>
(
memory
);
const
llama_kv_cache_
recurrent
*
kv_self
=
static_cast
<
const
llama_kv_cache_
recurrent
*>
(
memory
);
const
auto
token_shift_count
=
hparams
.
token_shift_count
;
...
...
@@ -1665,7 +1622,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
ggml_tensor
*
token_shift
,
const
llama_ubatch
&
ubatch
,
int
il
)
const
{
const
llama_kv_cache_
unified
*
kv_self
=
static_cast
<
const
llama_kv_cache_
unified
*>
(
memory
);
const
llama_kv_cache_
recurrent
*
kv_self
=
static_cast
<
const
llama_kv_cache_
recurrent
*>
(
memory
);
const
auto
token_shift_count
=
hparams
.
token_shift_count
;
const
auto
n_embd
=
hparams
.
n_embd
;
...
...
llama/llama.cpp/src/llama-graph.h
View file @
b2b270ad
...
...
@@ -19,6 +19,7 @@ struct llama_cparams;
class
llama_memory_i
;
class
llama_kv_cache_unified
;
class
llama_kv_cache_recurrent
;
// certain models (typically multi-modal) can produce different types of graphs
enum
llm_graph_type
{
...
...
@@ -86,7 +87,6 @@ public:
ggml_tensor
*
tokens
=
nullptr
;
// I32 [n_batch]
ggml_tensor
*
embd
=
nullptr
;
// F32 [n_embd, n_batch]
ggml_tensor
*
cross_attn_state
;
// F32 [4, n_embd, 1061]
};
class
llm_graph_input_pos
:
public
llm_graph_input_i
{
...
...
@@ -187,26 +187,26 @@ public:
class
llm_graph_input_s_copy
:
public
llm_graph_input_i
{
public:
llm_graph_input_s_copy
(
const
llama_kv_cache_
unified
*
kv_self
)
:
kv_self
(
kv_self
)
{}
llm_graph_input_s_copy
(
const
llama_kv_cache_
recurrent
*
kv_self
)
:
kv_self
(
kv_self
)
{}
virtual
~
llm_graph_input_s_copy
()
=
default
;
void
set_input
(
const
llama_ubatch
*
ubatch
)
override
;
ggml_tensor
*
s_copy
;
// I32 [kv_size]
const
llama_kv_cache_
unified
*
kv_self
;
const
llama_kv_cache_
recurrent
*
kv_self
;
};
class
llm_graph_input_s_mask
:
public
llm_graph_input_i
{
public:
llm_graph_input_s_mask
(
const
llama_kv_cache_
unified
*
kv_self
)
:
kv_self
(
kv_self
)
{}
llm_graph_input_s_mask
(
const
llama_kv_cache_
recurrent
*
kv_self
)
:
kv_self
(
kv_self
)
{}
virtual
~
llm_graph_input_s_mask
()
=
default
;
void
set_input
(
const
llama_ubatch
*
ubatch
)
override
;
ggml_tensor
*
s_mask
;
// F32 [1, n_kv]
const
llama_kv_cache_
unified
*
kv_self
;
const
llama_kv_cache_
recurrent
*
kv_self
;
};
class
llm_graph_input_cross_embd
:
public
llm_graph_input_i
{
...
...
@@ -284,16 +284,6 @@ public:
const
llama_cross
*
cross
=
nullptr
;
};
// Graph input holding the cross-attention state tensor.
// set_input() fills the tensor from the embd data of the ubatch it is given.
class llm_graph_input_cross_attn_state : public llm_graph_input_i {
public:
    llm_graph_input_cross_attn_state()          = default;
    virtual ~llm_graph_input_cross_attn_state() = default;

    void set_input(const llama_ubatch * ubatch) override;

    // F32; created with ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4).
    // Null until the graph builder assigns it.
    ggml_tensor * cross_attn_state = nullptr;
};
//
// llm_graph_result
//
...
...
@@ -308,6 +298,7 @@ class llm_graph_result_i {
public:
virtual
~
llm_graph_result_i
()
=
default
;
virtual
ggml_tensor
*
get_tokens
()
=
0
;
virtual
ggml_tensor
*
get_logits
()
=
0
;
virtual
ggml_tensor
*
get_embd
()
=
0
;
virtual
ggml_tensor
*
get_embd_pooled
()
=
0
;
...
...
@@ -322,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i {
public:
virtual
~
llm_graph_result
()
=
default
;
ggml_tensor
*
get_tokens
()
override
{
return
t_tokens
;
}
ggml_tensor
*
get_logits
()
override
{
return
t_logits
;
}
ggml_tensor
*
get_embd
()
override
{
return
t_embd
;
}
ggml_tensor
*
get_embd_pooled
()
override
{
return
t_embd_pooled
;
}
...
...
@@ -338,6 +330,7 @@ public:
}
// important graph nodes
ggml_tensor
*
t_tokens
=
nullptr
;
ggml_tensor
*
t_logits
=
nullptr
;
ggml_tensor
*
t_embd
=
nullptr
;
ggml_tensor
*
t_embd_pooled
=
nullptr
;
...
...
@@ -361,8 +354,8 @@ struct llm_graph_params {
const
llama_cparams
&
cparams
;
const
llama_ubatch
&
ubatch
;
ggml_backend_sched
*
sched
;
ggml_backend
*
backend_cpu
;
ggml_backend_sched
_t
sched
;
ggml_backend
_t
backend_cpu
;
const
llama_adapter_cvec
*
cvec
;
const
llama_adapter_loras
*
loras
;
...
...
@@ -413,9 +406,9 @@ struct llm_graph_context {
ggml_context
*
ctx0
=
nullptr
;
ggml_backend_sched
*
sched
;
ggml_backend_sched
_t
sched
;
ggml_backend
*
backend_cpu
;
// TODO: needed by build_attn_mha, figure out a way to remove?
ggml_backend
_t
backend_cpu
;
// TODO: needed by build_attn_mha, figure out a way to remove?
const
llama_adapter_cvec
*
cvec
;
const
llama_adapter_loras
*
loras
;
...
...
@@ -502,7 +495,6 @@ struct llm_graph_context {
ggml_tensor
*
build_inp_cls
()
const
;
ggml_tensor
*
build_inp_s_copy
()
const
;
ggml_tensor
*
build_inp_s_mask
()
const
;
ggml_tensor
*
build_inp_cross_attn_state
()
const
;
ggml_tensor
*
build_inp_cross_embd
()
const
;
ggml_tensor
*
build_inp_pos_bucket_enc
()
const
;
...
...
llama/llama.cpp/src/llama-hparams.cpp
View file @
b2b270ad
...
...
@@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT
(
"fatal error"
);
}
// Returns true when layer index `il` is present in cross_attn_layers.
bool llama_hparams::cross_attention_layers(uint32_t il) const {
    const auto first = cross_attn_layers.begin();
    const auto last  = cross_attn_layers.end();

    return std::find(first, last, il) != last;
}
Prev
1
2
3
4
5
6
7
8
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment