"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "0bc6be69606f9ae7f82ad499baf494255c51c38d"
Unverified commit 4987f13d authored by Gabe Goodhart, committed by GitHub

Llama cpp bump (df1b612): granite docling / mamba2 optimizations / multimodal encoding fixes (#12552)

* feat: Bump llama.cpp to df1b612

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Correctly encode text chunks during mtmd tokenization

For some models, text chunks containing template delimiter tokens appear
interspersed with the image embeddings. These need to be correctly translated
to text tokens, as sketched below.

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
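
For context, a minimal sketch of the chunk handling described above, assuming the upstream mtmd C API (`mtmd_input_chunks_get`, `mtmd_input_chunk_get_type`, `mtmd_input_chunk_get_tokens_text`); this is illustrative only, not the exact code changed in this commit:

```cpp
// Sketch only: after mtmd_tokenize(), forward the text chunks that sit between
// image chunks as ordinary text tokens instead of dropping them.
#include "mtmd.h"
#include <vector>

static std::vector<llama_token> collect_text_tokens(const mtmd_input_chunks * chunks) {
    std::vector<llama_token> out;
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); ++i) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            // text chunks (e.g. template delimiters between image slices) must be
            // translated to text tokens rather than skipped
            size_t n_tokens = 0;
            const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
            out.insert(out.end(), tokens, tokens + n_tokens);
        }
        // image/audio chunks are encoded separately and injected as embeddings
    }
    return out;
}
```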

* tests: Use MtmdChunk in image_test

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Fix unnecessary conversion linting

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(ggml): Revert changes to ggml_hip.cpp

These changes were made largely by our code assistant and are likely wrong.

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Revert changes in mem_nvml.cpp

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update sync point to 1deee0

This brings in several more optimization commits and model support for
EmbeddingGemma

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches for 1deee0

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: sync for bump to 1deee0

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Bad patch updates with errant `+`

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp/ggml to 7049736

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: format-patches after latest bump

Branch: LlamaCPPBump-GraniteDocling
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
parent e638f2ac
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=364a7a6d4a786e98947c8a90430ea581213c0ba9
+FETCH_HEAD=7049736b2dd9011bf819e298b844ebbc4b5afdc9
 .PHONY: help
 help:
......
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "364a7a6d4a786e98947c8a90430ea581213c0ba9";
+char const *LLAMA_COMMIT = "7049736b2dd9011bf819e298b844ebbc4b5afdc9";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
@@ -1133,6 +1133,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host = params.no_host;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
......
@@ -378,7 +378,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
@@ -392,6 +392,7 @@ struct common_params {
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
+    bool no_host = false; // bypass host buffer allowing extra buffers to be used
     bool single_turn = false; // single turn chat conversation
@@ -424,7 +425,8 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
@@ -432,7 +434,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
......
@@ -296,6 +296,7 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
+        bool no_host; // bypass host buffer allowing extra buffers to be used
     };
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -543,6 +544,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
+    // Returns true if the model is hybrid (like Jamba, Granite, etc.)
+    LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
     // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
     LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
@@ -791,8 +795,12 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);
+    // for backwards-compat
     #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+    // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
+    #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
     typedef uint32_t llama_state_seq_flags;
     LLAMA_API size_t llama_state_seq_get_size_ext(
......
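The llama.h additions above can be exercised directly from the C API. A hedged usage sketch follows; `llama_backend_init`, `llama_model_load_from_file`, and `llama_model_free` come from the existing public header and are assumed unchanged by this bump:

```cpp
// Sketch only: probe the new/kept model predicates and the renamed state flag.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) return 1;
    llama_backend_init();

    llama_model * model = llama_model_load_from_file(argv[1], llama_model_default_params());
    if (model == nullptr) return 1;

    // new in this bump: distinguishes hybrid attention+recurrent models
    // (Jamba, Granite hybrid, LFM2, ...) from purely recurrent ones
    printf("recurrent=%d hybrid=%d\n",
           llama_model_is_recurrent(model),
           llama_model_is_hybrid(model));

    // the renamed flag keeps the old value, so existing SWA_ONLY callers still work
    static_assert(LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY == LLAMA_STATE_SEQ_FLAGS_SWA_ONLY, "compat");

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```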
@@ -94,12 +94,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_OPENAI_MOE, "gpt-oss" },
     { LLM_ARCH_LFM2, "lfm2" },
+    { LLM_ARCH_LFM2MOE, "lfm2moe" },
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA, "llada" },
     { LLM_ARCH_LLADA_MOE, "llada-moe" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_GROVEMOE, "grovemoe" },
+    { LLM_ARCH_APERTUS, "apertus" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -219,6 +221,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+    // sentence-transformers dense modules feature dims
+    { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
+    { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
+    { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
+    { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
@@ -258,6 +265,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
     { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
+    { LLM_KV_XIELU_ALPHA_N, "xielu.alpha_n" },
+    { LLM_KV_XIELU_ALPHA_P, "xielu.alpha_p" },
+    { LLM_KV_XIELU_BETA, "xielu.beta" },
+    { LLM_KV_XIELU_EPS, "xielu.eps" },
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
@@ -1066,6 +1078,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
     { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
     { LLM_TENSOR_OUTPUT, "output" },
+    { LLM_TENSOR_DENSE_2_OUT, "dense_2" },
+    { LLM_TENSOR_DENSE_3_OUT, "dense_3" },
     { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
     { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -2118,6 +2132,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_OUTPUT, "output" },
         }
     },
+    {
+        LLM_ARCH_LFM2MOE,
+        {
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+            { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+            { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        }
+    },
     {
         LLM_ARCH_SMALLTHINKER,
         {
@@ -2139,6 +2179,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
         },
     },
+    {
+        LLM_ARCH_APERTUS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_DREAM,
         {
@@ -2249,6 +2308,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
+    {LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
     {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
     {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -2489,6 +2550,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
         case LLM_ARCH_LFM2:
+        case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
             return true;
         default:
......
@@ -98,12 +98,14 @@ enum llm_arch {
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
+    LLM_ARCH_LFM2MOE,
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
     LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_GROVEMOE,
+    LLM_ARCH_APERTUS,
     LLM_ARCH_UNKNOWN,
 };
@@ -262,10 +264,21 @@ enum llm_kv {
     LLM_KV_SHORTCONV_L_CACHE,
+    LLM_KV_XIELU_ALPHA_N,
+    LLM_KV_XIELU_ALPHA_P,
+    LLM_KV_XIELU_BETA,
+    LLM_KV_XIELU_EPS,
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
+    // sentence-transformers dense layers in and out features
+    LLM_KV_DENSE_2_FEAT_IN,
+    LLM_KV_DENSE_2_FEAT_OUT,
+    LLM_KV_DENSE_3_FEAT_IN,
+    LLM_KV_DENSE_3_FEAT_OUT,
 };
 enum llm_tensor {
@@ -273,6 +286,8 @@ enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
+    LLM_TENSOR_DENSE_2_OUT,
+    LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
......
@@ -590,7 +590,7 @@ int32_t llm_chat_apply_template(
             ss << message->content << "<|end_of_text|>\n";
         }
         if (add_ass) {
-            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
+            ss << "<|start_of_role|>assistant<|end_of_role|>";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
         // GigaChat template
......
@@ -2345,6 +2345,12 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
+    if (params.pooling_type != model->hparams.pooling_type) {
+        //user-specified pooling-type is different from the model default
+        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+                       model->hparams.pooling_type, params.pooling_type);
+    }
     try {
         auto * ctx = new llama_context(*model, params);
         return ctx;
......
@@ -1853,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
+void llm_graph_context::build_dense_out(
+        ggml_tensor * dense_2,
+        ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
 void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,
......
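Functionally, `build_dense_out()` reduces to two bias-free projections applied to the pooled embedding, mirroring the sentence-transformers 2_Dense/3_Dense modules. A standalone sketch of just that math in plain ggml; the shapes and helper name are illustrative assumptions:

```cpp
// Sketch only: the math behind build_dense_out(), isolated from llama state.
// Shape comments use ggml's [ne0, ne1] convention.
#include "ggml.h"

ggml_tensor * apply_dense_out(ggml_context * ctx,
                              ggml_tensor * pooled,   // [n_embd, n_seqs]  pooled embeddings
                              ggml_tensor * dense_2,  // [n_embd, d2]      2_Dense weight
                              ggml_tensor * dense_3)  // [d2,     d_out]   3_Dense weight
{
    // each ggml_mul_mat contracts over ne0, projecting the embedding dimension
    ggml_tensor * cur = ggml_mul_mat(ctx, dense_2, pooled); // -> [d2,    n_seqs]
    cur               = ggml_mul_mat(ctx, dense_3, cur);    // -> [d_out, n_seqs]
    return cur; // becomes the new result_embd_pooled
}
```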
@@ -814,6 +814,14 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+    //
+    // dense (out)
+    //
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };
 // TODO: better name
......
@@ -140,7 +140,11 @@ uint32_t llama_hparams::n_embd_s() const {
 }
 bool llama_hparams::is_recurrent(uint32_t il) const {
-    return recurrent_layer_arr[il];
+    if (il < n_layer) {
+        return recurrent_layer_arr[il];
+    }
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
 }
 uint32_t llama_hparams::n_pos_per_embd() const {
......
@@ -42,7 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
     int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -171,6 +171,18 @@ struct llama_hparams {
     uint32_t laurel_rank = 64;
     uint32_t n_embd_altup = 256;
+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+    uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+    // xIELU
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
......
@@ -220,7 +220,7 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }
 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }
@@ -228,7 +228,7 @@ void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id
 }
 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }
......
@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }
-        ggml_tensor * k;
-        ggml_tensor * v;
-        k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-        v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
         ggml_format_name(k, "cache_k_l%d", il);
         ggml_format_name(v, "cache_v_l%d", il);
......
@@ -73,7 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-            ubatch = balloc.split_equal(n_ubatch, false);
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
         if (ubatch.n_tokens == 0) {
@@ -175,17 +177,17 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdo
 }
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    GGML_UNUSED(flags);
-    mem_attn->state_write(io, seq_id);
-    mem_recr->state_write(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_write(io, seq_id, flags);
+    }
+    mem_recr->state_write(io, seq_id, flags);
 }
 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    GGML_UNUSED(flags);
-    mem_attn->state_read(io, seq_id);
-    mem_recr->state_read(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_read(io, seq_id, flags);
+    }
+    mem_recr->state_read(io, seq_id, flags);
 }
 llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
......
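With hybrid memory now honoring the flag, a caller can snapshot only the "partial" part of a sequence's state (SWA KV or recurrent cache). A hedged sketch; `llama_state_seq_get_data_ext` is assumed to exist alongside the `llama_state_seq_get_size_ext` declaration shown earlier:

```cpp
// Sketch only: save just the partial state of one sequence with the renamed flag.
#include "llama.h"
#include <cstdint>
#include <vector>

std::vector<uint8_t> save_partial_seq_state(llama_context * ctx, llama_seq_id seq_id) {
    const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY;
    std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, flags));
    const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
    buf.resize(written); // the getter reports how many bytes it actually wrote
    return buf;
}
```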
@@ -136,6 +136,7 @@ void llama_memory_recurrent::clear(bool data) {
 }
 bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
     uint32_t new_head = size;
     if (p0 < 0) {
@@ -156,7 +157,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid
-            if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+            if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }
             // invalidate tails which will be cleared
@@ -167,6 +169,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     } else {
         // seq_id is negative, then the range should include everything or nothing
         if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+            //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
             return false;
         }
     }
@@ -379,7 +382,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-            ubatch = balloc.split_equal(n_ubatch, false);
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
         if (ubatch.n_tokens == 0) {
@@ -856,9 +861,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
         seq_rm(dest_seq_id, -1, -1);
+        if (cell_count == 0) {
+            return true;
+        }
         llama_batch_allocr balloc(hparams.n_pos_per_embd());
         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
......
@@ -465,6 +465,7 @@ namespace GGUFMeta {
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 llama_model_loader::llama_model_loader(
......
This diff is collapsed.
@@ -108,6 +108,7 @@ enum llm_type {
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_A13B,
+    LLM_TYPE_8B_A1B, // lfm2moe
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
@@ -381,6 +382,12 @@ struct llama_layer {
     // openai-moe
     struct ggml_tensor * attn_sinks = nullptr;
+    // xIELU activation parameters for Apertus
+    struct ggml_tensor * ffn_act_alpha_n = nullptr;
+    struct ggml_tensor * ffn_act_alpha_p = nullptr;
+    struct ggml_tensor * ffn_act_beta = nullptr;
+    struct ggml_tensor * ffn_act_eps = nullptr;
     struct ggml_tensor * bskcn_tv = nullptr;
     struct llama_layer_posnet posnet;
@@ -434,6 +441,12 @@ struct llama_model {
     std::vector<llama_layer> layers;
+    // Dense linear projections for SentenceTransformers models like embeddinggemma
+    // For Sentence Transformers models structure see
+    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
+    struct ggml_tensor * dense_2_out_layers = nullptr;
+    struct ggml_tensor * dense_3_out_layers = nullptr;
     llama_model_params params;
     // gguf metadata
......