Unverified Commit e9e5f61c authored by Jeffrey Morgan, committed by GitHub

llama: update to commit 2016f07b (#10352)

parent 11dde418
UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=71e90e8813f90097701e62f7fce137d96ddf41e2
FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac
.PHONY: help
help:
......
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "71e90e8813f90097701e62f7fce137d96ddf41e2";
char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
......@@ -50,7 +50,6 @@
// tensor name constants
//
#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix for backward compat
......@@ -66,8 +65,6 @@
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
......
This diff is collapsed.
......@@ -145,6 +145,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
......@@ -1142,6 +1144,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
{ LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
{ LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
......@@ -1636,23 +1640,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
......
......@@ -149,6 +149,8 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
......@@ -311,6 +313,8 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
......
......@@ -10,6 +10,7 @@
#include <cstring>
#include <stdexcept>
#include <cinttypes>
#include <cmath>
//
// llama_context
......@@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift(
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
......@@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift(
const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
ggml_tensor * tmp;
if (ggml_is_quantized(cur->type)) {
......
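For reference, here is a hedged restatement of the YaRN correction applied in build_rope_shift above, writing s for freq_scale (the context-extension scale). This is one reading of the comment and the linked discussion, not an authoritative derivation:

% LaTeX sketch, assuming s = freq_scale and m(s) is YaRN's attention magnitude scale
\[
  m(s) \;=\; 1 + 0.1\,\ln\!\frac{1}{s},
  \qquad
  \text{yarn\_attn\_factor}_{\text{DeepSeek2}} \;=\; \frac{1}{m(s)}
\]

The idea, under that reading, is that the DeepSeek2 graph build already folds m(s) into its own scaling, so the KV-shift path divides it back out to avoid applying the correction twice.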
......@@ -1194,6 +1194,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * v,
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * v_mla,
bool v_trans,
float kq_scale) const {
//const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
......@@ -1205,8 +1206,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
//const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
const auto n_tokens = q->ne[1];
const auto n_head = q->ne[2];
const auto n_kv = k->ne[1];
......@@ -1235,7 +1234,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
if (v_mla) {
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
cur = ggml_mul_mat(ctx0, v_mla, cur);
}
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
} else {
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
......@@ -1273,9 +1277,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
// for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
if (v_mla) {
kqv = ggml_mul_mat(ctx0, v_mla, kqv);
}
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
if (!cparams.offload_kqv) {
// all nodes between the KV store and the attention output are run on the CPU
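For orientation, a hedged sketch of the algebra behind the v_mla "decompression" comments above, using generic MLA notation (the symbols below are assumptions, not identifiers from this code):

% LaTeX sketch: c_t is the compressed (latent) value vector cached per position,
% alpha_{h,t} the softmax attention weight of head h on position t, and
% W^{UV}_h the per-head value up-projection carried in v_mla.
\[
  o_h \;=\; W^{UV}_h \Big(\sum_t \alpha_{h,t}\, c_t\Big),
  \qquad
  W^{UV}_h \in \mathbb{R}^{\,d_v \times d_c}
\]

Because the up-projection is linear, it commutes with the softmax-weighted sum, so attention can run entirely in the compressed (MQA-style) space and the single ggml_mul_mat with v_mla restores the per-head outputs afterwards.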
......@@ -1310,6 +1319,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
GGML_UNUSED(n_tokens);
......@@ -1331,7 +1341,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
cb(cur, "kqv_out", il);
......@@ -1385,6 +1395,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
......@@ -1470,7 +1481,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
0);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
cb(cur, "kqv_out", il);
if (wo) {
......@@ -1529,6 +1540,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
......@@ -1548,7 +1560,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
cb(cur, "kqv_out", il);
......@@ -1717,4 +1729,3 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur);
}
......@@ -517,11 +517,12 @@ struct llm_graph_context {
ggml_tensor * build_attn_mha(
ggml_cgraph * gf,
ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
bool v_trans,
float kq_scale) const;
......@@ -536,6 +537,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
......@@ -550,6 +552,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
......@@ -564,6 +567,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
......
......@@ -46,6 +46,10 @@ struct llama_hparams {
uint32_t n_rel_attn_bkts = 0;
uint32_t n_vocab = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
uint32_t n_embd_head_v_mla = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext;
......
......@@ -27,7 +27,7 @@ bool llama_kv_cache_unified::init(
recurrent = llama_model_is_recurrent(&model);
v_trans = !recurrent && !cparams.flash_attn;
can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
can_shift = !recurrent;
LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
__func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
......
This diff is collapsed.
......@@ -174,6 +174,8 @@ struct llama_layer {
struct ggml_tensor * wq_b = nullptr;
struct ggml_tensor * wkv_a_mqa = nullptr;
struct ggml_tensor * wkv_b = nullptr;
struct ggml_tensor * wk_b = nullptr;
struct ggml_tensor * wv_b = nullptr;
struct ggml_tensor * wq_cross = nullptr;
struct ggml_tensor * wk_cross = nullptr;
struct ggml_tensor * wv_cross = nullptr;
......
......@@ -1833,6 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_prefix|>" // Qwen
|| t.first == "<fim-prefix>"
|| t.first == "<fim_prefix>" // Granite
|| t.first == "<|fim▁begin|>" // DeepSeek
|| t.first == "<PRE>"
|| t.first == "▁<PRE>" // CodeLlama
......@@ -1851,6 +1852,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>"
|| t.first == "<fim_suffix>" // Granite
|| t.first == "<|fim▁hole|>" // DeepSeek
|| t.first == "<SUF>"
|| t.first == "▁<SUF>" // CodeLlama
......@@ -1869,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>"
|| t.first == "<fim_middle>" // Granite
|| t.first == "<|fim▁end|>" // DeepSeek
|| t.first == "<MID>"
|| t.first == "▁<MID>" // CodeLlama
......@@ -1887,6 +1890,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>"
|| t.first == "<fim_pad>" // Granite
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
......@@ -1905,6 +1909,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|repo_name|>"
|| t.first == "<fim-repo>"
|| t.first == "<REPO>"
|| t.first == "<reponame>" // Granite
) {
special_fim_rep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
......
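The block above only registers the Granite FIM tokens, so here is a small, hedged example of how such tokens are typically assembled into a fill-in-the-middle prompt. The prefix-suffix-middle ordering is an assumption; the exact template a given Granite model expects should come from its model card, not from this sketch.

#include <iostream>
#include <string>

int main() {
    const std::string prefix = "def add(a, b):\n    ";
    const std::string suffix = "\n    return result\n";

    // Assumed PSM-style layout: the model is asked to generate the text that
    // belongs between <fim_prefix>... and <fim_suffix>..., after <fim_middle>.
    const std::string prompt =
        "<fim_prefix>" + prefix +
        "<fim_suffix>" + suffix +
        "<fim_middle>";

    std::cout << prompt << std::endl;
    return 0;
}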
......@@ -65,10 +65,10 @@ index 273075f4..dd11f304 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cec36b36..4b057973 100644
index e2617b06..242e50a7 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
delete ctx;
......@@ -76,7 +76,7 @@ index cec36b36..4b057973 100644
}
/**
@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
......@@ -85,10 +85,10 @@ index cec36b36..4b057973 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fafe9633..59a49560 100644
index a7febef7..31750b6f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
......@@ -96,7 +96,7 @@ index fafe9633..59a49560 100644
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
......@@ -104,7 +104,7 @@ index fafe9633..59a49560 100644
}
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
......@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 9f1c6c6c..310afe8a 100644
index 266d8af4..12886cd3 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
free(ctx);
......@@ -137,10 +137,10 @@ index 9f1c6c6c..310afe8a 100644
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index b8b5cbd3..14d4561b 100644
index 05a2f4e6..392cc18d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
......@@ -149,10 +149,10 @@ index b8b5cbd3..14d4561b 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 862b9b66..34536681 100644
index a0667b7d..bd83adc5 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
delete ctx;
......@@ -161,7 +161,7 @@ index 862b9b66..34536681 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3e48a924..a3d182fc 100644
index 1de34c96..4600f61e 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
......@@ -189,10 +189,10 @@ index 3e48a924..a3d182fc 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 783a0ff8..8ac1e07e 100644
index 39f3cd34..c569a8a5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
......@@ -200,7 +200,7 @@ index 783a0ff8..8ac1e07e 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
......
......@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 464ff01e..0125ee53 100644
index 48060517..a35b498c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
......
......@@ -11,10 +11,10 @@ instead of forcing one or the other
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4735e98e..65135172 100644
index 983385f8..32f59819 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0;
// count outputs
......@@ -23,7 +23,7 @@ index 4735e98e..65135172 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
......@@ -32,7 +32,7 @@ index 4735e98e..65135172 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
if (t_embd && res->get_embd_pooled()) {
@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
......
......@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 49c90b75..4b72ea9f 100644
index 75970615..d57b4bd6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -28,6 +28,19 @@
#include <cinttypes>
@@ -29,6 +29,19 @@
#include <limits>
#include <array>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
......@@ -33,7 +33,7 @@ index 49c90b75..4b72ea9f 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
@@ -1429,7 +1442,29 @@ struct clip_model_loader {
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
......@@ -63,7 +63,7 @@ index 49c90b75..4b72ea9f 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1456,7 +1491,11 @@ struct clip_model_loader {
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
......
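The patch above is shown only in part; as a general illustration (not the vendored code), the usual technique for opening UTF-8 paths containing non-ASCII characters on Windows is to widen the path with MultiByteToWideChar and use MSVC's wide-path std::ifstream overload, falling back to the narrow constructor elsewhere. The helper name open_binary is made up for this sketch.

#include <fstream>
#include <string>
#include <vector>

#if defined(_WIN32)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#endif

static std::ifstream open_binary(const std::string & utf8_path) {
#if defined(_WIN32)
    // Convert UTF-8 to UTF-16 so wide-character paths round-trip correctly.
    const int n = MultiByteToWideChar(CP_UTF8, 0, utf8_path.c_str(), -1, nullptr, 0);
    std::vector<wchar_t> wpath(n > 0 ? n : 1, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, utf8_path.c_str(), -1, wpath.data(), n);
    return std::ifstream(wpath.data(), std::ios::binary); // MSVC accepts wchar_t* paths
#else
    return std::ifstream(utf8_path, std::ios::binary);
#endif
}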
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 16:03:51 -0700
Date: Sun, 20 Apr 2025 16:11:09 -0700
Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture
......@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
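As loud speculation only: judging from the LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION key and the bskcn_tv tensors added below, the architecture appears to carry a learned gated skip connection between blocks. One common form of such a gate, written purely as an assumption about what the tensors might parameterize, is

\[
  h'_l \;=\; t \odot h_{\text{skip}} \;+\; (1 - t) \odot h_l
\]

where t is a learned per-channel mixing vector (plausibly bskcn_tv) and h_skip is the hidden state taken from an earlier block; the diff itself should be treated as the source of truth.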
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a6fddc7f..0b0fedcd 100644
index 62e1480b..f754bc8f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
......@@ -31,10 +31,10 @@ index a6fddc7f..0b0fedcd 100644
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
......@@ -59,7 +59,7 @@ index a6fddc7f..0b0fedcd 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -68,7 +68,7 @@ index a6fddc7f..0b0fedcd 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2c2099b3..74aa3dd0 100644
index 98ca00a1..439aaeab 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -72,6 +72,7 @@ enum llm_arch {
......@@ -84,10 +84,10 @@ index 2c2099b3..74aa3dd0 100644
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -340,6 +342,7 @@ enum llm_tensor {
@@ -344,6 +346,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
......@@ -115,10 +115,10 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 4e0b5719..c3147cbc 100644
index 80fcd65d..6e278945 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -51,6 +51,8 @@ struct llama_hparams {
@@ -55,6 +55,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
......@@ -127,7 +127,7 @@ index 4e0b5719..c3147cbc 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -149,6 +151,9 @@ struct llama_hparams {
@@ -153,6 +155,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
......@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b74dd72c..5fbd0055 100644
index 6b7bfecf..aba42819 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
......@@ -175,7 +175,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
......@@ -210,7 +210,7 @@ index b74dd72c..5fbd0055 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};
......@@ -309,14 +309,14 @@ index b74dd72c..5fbd0055 100644
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
......@@ -376,7 +376,7 @@ index b74dd72c..5fbd0055 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
......@@ -387,7 +387,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
......@@ -396,7 +396,7 @@ index b74dd72c..5fbd0055 100644
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
index 0f18dac1..e08d4ae4 100644
index fd82d106..5865d5e9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
......@@ -407,7 +407,7 @@ index 0f18dac1..e08d4ae4 100644
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -305,6 +306,8 @@ struct llama_layer {
@@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:27:12 -0700
Date: Sun, 20 Apr 2025 16:12:36 -0700
Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
......@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture
20 files changed, 475 insertions(+), 22 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a..13127c7b 100644
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
......@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274b..a0e649ad 100644
index 3fd5bebc..f0cec596 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -213,7 +213,7 @@ struct decode_embd_batch {
@@ -233,7 +233,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
......@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -225,6 +225,7 @@ struct decode_embd_batch {
@@ -245,6 +245,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
......@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -291,7 +292,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t n_tokens = chunk.tokens_image->n_tokens();
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
......@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0b0fedcd..c1f78618 100644
index f754bc8f..0568565f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
......@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -269,6 +271,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
......@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644
{
LLM_ARCH_DECI,
{
@@ -1692,6 +1728,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
......@@ -234,7 +234,7 @@ index 0b0fedcd..c1f78618 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 74aa3dd0..f987844d 100644
index 439aaeab..6a989034 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
......@@ -250,10 +250,10 @@ index 74aa3dd0..f987844d 100644
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -343,6 +345,14 @@ enum llm_tensor {
@@ -347,6 +349,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
......@@ -297,10 +297,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 65135172..afe6f552 100644
index 32f59819..0343ba8a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
......@@ -309,7 +309,7 @@ index 65135172..afe6f552 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) {
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
......@@ -320,7 +320,7 @@ index 65135172..afe6f552 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
......@@ -329,7 +329,7 @@ index 65135172..afe6f552 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
......@@ -341,7 +341,7 @@ index 65135172..afe6f552 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
......@@ -350,7 +350,7 @@ index 65135172..afe6f552 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
......@@ -364,7 +364,7 @@ index 65135172..afe6f552 100644
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
......@@ -373,7 +373,7 @@ index 65135172..afe6f552 100644
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
......@@ -382,7 +382,7 @@ index 65135172..afe6f552 100644
io.write(&logits_size, sizeof(logits_size));
@@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() {
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
......@@ -390,7 +390,7 @@ index 65135172..afe6f552 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
......@@ -426,7 +426,7 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cd955d63..83f3c5a8 100644
index a85e9728..d740c120 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
......@@ -442,7 +442,7 @@ index cd955d63..83f3c5a8 100644
//
// llm_graph_context
//
@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
......@@ -469,7 +469,7 @@ index cd955d63..83f3c5a8 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5b6618f9..51993998 100644
index d192dc14..260a2af2 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
......@@ -518,7 +518,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c3147cbc..4567a0e9 100644
index 6e278945..c8a34d52 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
......@@ -536,9 +536,9 @@ index c3147cbc..4567a0e9 100644
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -52,6 +55,7 @@ struct llama_hparams {
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
@@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
......@@ -546,7 +546,7 @@ index c3147cbc..4567a0e9 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -154,6 +158,9 @@ struct llama_hparams {
@@ -158,6 +162,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
......@@ -557,7 +557,7 @@ index c3147cbc..4567a0e9 100644
};
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index dbf5f118..9310f262 100644
index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
......@@ -593,7 +593,7 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5fbd0055..d5ad466e 100644
index aba42819..d051696c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
......@@ -650,7 +650,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
......@@ -659,7 +659,7 @@ index 5fbd0055..d5ad466e 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
......@@ -712,7 +712,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
}
};
......@@ -893,14 +893,14 @@ index 5fbd0055..d5ad466e 100644
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
......@@ -959,7 +959,7 @@ index 5fbd0055..d5ad466e 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
......@@ -970,7 +970,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
......@@ -979,7 +979,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index e08d4ae4..21c4617b 100644
index 5865d5e9..72bab5be 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
......@@ -998,7 +998,7 @@ index e08d4ae4..21c4617b 100644
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -308,6 +310,16 @@ struct llama_layer {
@@ -310,6 +312,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
......