Unverified commit 7a81daf0, authored by Jeffrey Morgan, committed by GitHub

llama: update vendor code to commit ba1cb19c (#8101)

parent 60f75560
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -444,6 +444,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
case LLAMA_VOCAB_PRE_TYPE_EXAONE:
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -190,6 +190,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
......@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
......@@ -340,6 +342,7 @@ enum llm_kv {
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
 
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
......@@ -458,6 +461,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
 
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
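The new `%s.rope.dimension_sections` key carries the per-axis split that multi-axis RoPE (m-rope, used by the Qwen2-VL support added below) applies to the rotary dimensions. A standalone sketch of the partitioning idea follows; the section values and the role names are illustrative assumptions, and the authoritative interpretation is whatever ggml_rope_multi implements.

// Standalone sketch (not llama.cpp code): map a rotary dimension index to the m-rope
// section it falls into, given cumulative section boundaries. The values {16, 24, 24, 0}
// and the role names are illustrative assumptions only.
#include <cstdio>
#include <initializer_list>

static int section_of(int idx, const int sections[4]) {
    int end = 0;
    for (int s = 0; s < 4; ++s) {
        end += sections[s];
        if (idx < end) {
            return s;  // 0 = temporal, 1 = height, 2 = width, 3 = extra (assumed roles)
        }
    }
    return 3;
}

int main() {
    const int sections[4] = {16, 24, 24, 0};
    for (int idx : {0, 15, 16, 39, 40, 63}) {
        printf("rotary dim %2d -> section %d\n", idx, section_of(idx, sections));
    }
    return 0;
}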
......@@ -975,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2VL,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2MOE,
{
......@@ -1113,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
......@@ -1778,9 +1801,10 @@ struct LLM_TN {
//
 
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};
 
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
......@@ -1886,7 +1910,7 @@ private:
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
ret = format("Win32 error code: %s", error_code);
ret = format("Win32 error code: %lx", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
......@@ -2224,7 +2248,7 @@ struct llama_mmap {
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
// may fail on pre-Windows 8 systems
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
 
if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
......@@ -2571,11 +2595,12 @@ struct llama_hparams {
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
 
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
int rope_sections[4];
 
// for State Space Models
uint32_t ssm_d_conv = 0;
......@@ -2634,6 +2659,9 @@ struct llama_hparams {
 
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
if (!std::equal(std::begin(this->rope_sections),
std::end(this->rope_sections),
std::begin(other.rope_sections))) return true;
 
if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true;
......@@ -3504,6 +3532,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output
bool is_encoding = false;
 
// TODO: find a better way to accommodate multi-dimensional position encoding methods
// number of position ids each token gets, 1 per token in most cases.
// when using m-rope, each token gets 3 position ids representing a 3-dimensional coordinate.
int n_pos_per_token = 1;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
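The new n_pos_per_token field is what later scales the inp_pos tensor and its upload size in llama_set_inputs (further down in this diff). The sketch below shows the kind of buffer this implies; the planar layout (all first coordinates, then all second, and so on) and the idea that the fourth slot is unused padding are illustration-only assumptions, not claims about the vendored code.

// Standalone sketch (not llama.cpp code): a position buffer when each token carries
// n_pos_per_token position ids (4 here, matching the inp_pos allocation in build_qwen2vl below).
// Layout assumption: planar, i.e. pos[k*n_tokens + i] is coordinate k of token i.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n_tokens        = 6;
    const int     n_pos_per_token = 4;

    std::vector<int32_t> pos(n_tokens * n_pos_per_token, 0);
    for (int64_t i = 0; i < n_tokens; ++i) {
        pos[0*n_tokens + i] = (int32_t) i;  // temporal / sequence position
        pos[1*n_tokens + i] = (int32_t) i;  // height coordinate (placeholder values)
        pos[2*n_tokens + i] = (int32_t) i;  // width coordinate  (placeholder values)
        // pos[3*n_tokens + i] stays 0      // assumed unused padding
    }

    // bytes a caller would upload for this buffer (cf. the llama_set_inputs hunk below)
    const size_t nbytes = (size_t) n_tokens * n_pos_per_token * sizeof(int32_t);
    printf("position buffer: %zu bytes\n", nbytes);
    return 0;
}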
......@@ -4739,9 +4772,6 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
......@@ -5505,9 +5535,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
 
default: return "unknown, may not work";
}
......@@ -5756,8 +5783,12 @@ static void llm_load_hparams(
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
switch (hparams.n_layer) {
case 52: model.type = e_model::MODEL_1B; break;
case 40: model.type = e_model::MODEL_2B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
......@@ -5922,6 +5953,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2VL:
{
std::array<int, 4> section_dims;
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
}
// fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
......@@ -6654,6 +6692,9 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
......@@ -7248,7 +7289,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
 
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
......@@ -7882,7 +7923,13 @@ static bool llm_load_tensors(
 
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
else {
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
 
if (n_expert == 0) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
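With LLAMA_ROPE_SCALING_TYPE_LONGROPE the model provides two factor vectors of length n_rot/2 instead of a single rope_freqs tensor. Which one applies is typically decided by comparing the runtime context size with the model's original training context; the function below is a hedged standalone sketch of that rule, not the exact logic the graph-building code uses.

// Standalone sketch (not llama.cpp code): choose between long and short RoPE factors.
// Assumption: the long factors apply once the configured context exceeds the original
// training context, otherwise the short factors apply.
#include <cstdint>
#include <vector>

static const std::vector<float> & pick_rope_factors(
        uint32_t n_ctx,                            // runtime context size
        uint32_t n_ctx_orig,                       // original training context
        const std::vector<float> & factors_long,   // length n_rot/2
        const std::vector<float> & factors_short)  // length n_rot/2
{
    return n_ctx > n_ctx_orig ? factors_long : factors_short;
}

int main() {
    const std::vector<float> flong (64, 1.25f);
    const std::vector<float> fshort(64, 1.00f);
    const std::vector<float> & f = pick_rope_factors(131072, 4096, flong, fshort);
    return f.size() == 64 ? 0 : 1;
}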
......@@ -8396,6 +8443,7 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
......@@ -13064,6 +13112,124 @@ struct llm_build_context {
return gf;
}
 
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
cb(lctx.inp_pos, "inp_pos", -1);
ggml_set_input(lctx.inp_pos);
struct ggml_tensor * inp_pos = lctx.inp_pos;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
......@@ -14015,153 +14181,6 @@ struct llm_build_context {
return gf;
}
 
// ref: https://arxiv.org/abs/2203.03466
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
const int64_t n_embd = hparams.n_embd;
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth/sqrtf(float(n_layer));
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", -1);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// scale the hidden states for residual connection
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled_ffn", -1);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head scaling
const float scale_lmhead = float(n_embd_base)/float(n_embd);
cur = ggml_scale(ctx0, cur, scale_lmhead);
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_minicpm3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
......@@ -17412,6 +17431,7 @@ static struct ggml_cgraph * llama_build_graph(
 
switch (model.arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
{
......@@ -17467,6 +17487,11 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2();
} break;
case LLM_ARCH_QWEN2VL:
{
lctx.n_pos_per_token = 4;
result = llm.build_qwen2vl();
} break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
......@@ -17499,10 +17524,6 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_internlm2();
} break;
case LLM_ARCH_MINICPM:
{
result = llm.build_minicpm();
} break;
case LLM_ARCH_MINICPM3:
{
result = llm.build_minicpm3();
......@@ -17702,8 +17723,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
 
if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
auto n_pos = lctx.n_pos_per_token;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
}
 
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
......@@ -19191,10 +19212,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
......@@ -19517,9 +19534,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
 
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
......@@ -19860,14 +19874,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data();
}
 
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
 
......@@ -19880,8 +19886,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1];
 
static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
chunk_size_multiplier;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
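With the Q4_0_4_4/4_8/8_8 types removed, the quantization chunk size no longer carries a per-type multiplier; it is just the row length, rounded up so each chunk covers whole rows and at least 32*512 elements. A quick standalone check of the formula with illustrative row lengths:

// Standalone sketch (not llama.cpp code): the chunk-size formula from the hunk above.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
    static const int64_t min_chunk_size = 32 * 512;  // 16384 elements

    for (int64_t n_per_row : {4096, 11008, 32768}) {
        const int64_t chunk_size = n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
        // 4096  -> 16384 (4 rows per chunk)
        // 11008 -> 22016 (2 rows per chunk)
        // 32768 -> 32768 (a single row already meets the minimum)
        printf("n_per_row = %6lld -> chunk_size = %lld\n",
               (long long) n_per_row, (long long) chunk_size);
    }
    return 0;
}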
......@@ -20859,6 +20864,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX;
 
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
......@@ -22434,7 +22442,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
}
} else if ((size_t) i >= ctx->output_ids.size()) {
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
} else {
j = ctx->output_ids[i];
}
......
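Two of the hunks above are pure format-string corrections: the Win32 error path now prints the numeric error code with %lx instead of %s, and the out-of-range message uses %zu, the portable conversion for size_t (with %lu the call would be wrong on LLP64 Windows, where unsigned long is 32-bit). A minimal illustration of matching the conversions to the argument types:

// Standalone sketch: printf conversions matching the argument types used above.
#include <cstdio>
#include <vector>

int main() {
    unsigned long error_code = 0x57;   // stand-in for a Win32 DWORD error code
    std::vector<int> output_ids(128);

    printf("Win32 error code: %lx\n", error_code);          // unsigned long, hexadecimal
    printf("out of range [0, %zu)\n", output_ids.size());   // size_t needs %zu
    return 0;
}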
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -130,12 +130,15 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
};
enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
......@@ -197,9 +200,9 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
//LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
//LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
//LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
......@@ -211,7 +214,8 @@ extern "C" {
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
};
enum llama_pooling_type {
......@@ -485,6 +489,7 @@ extern "C" {
// Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
......
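The added comment documents that callers of the metadata accessors must reserve one extra byte for the terminating NUL. A hedged usage sketch, assuming the llama_model_meta_val_str signature from llama.h (model, key, output buffer, buffer size) and an illustrative key name:

// Hedged usage sketch: read one GGUF metadata string. The key name and buffer size are
// illustrative; the function returns the string length on success, or -1 on failure.
#include <cstdint>
#include <cstdio>
#include "llama.h"

static void print_general_name(const struct llama_model * model) {
    char buf[256];  // sized to hold the value plus the terminating NUL, per the comment above
    const int32_t n = llama_model_meta_val_str(model, "general.name", buf, sizeof(buf));
    if (n >= 0) {
        printf("general.name = %s (length %d)\n", buf, n);
    }
}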
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -285,25 +285,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) {
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
......@@ -316,8 +324,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
std::memcpy(
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
......@@ -413,7 +424,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl doesn't split the image into chunks, so `num_max_patches` is not needed.
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
} else {
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
}
if (!image_embd) {
LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
......
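For the Qwen2-VL path the image embedding buffer is sized from the actual image dimensions via clip_embd_nbytes_by_img rather than a fixed per-image size times num_max_patches. The sketch below only illustrates the kind of arithmetic involved; the patch size, the 2x2 spatial merge and the projector width are assumed values, not what clip_embd_nbytes_by_img necessarily computes.

// Standalone sketch (not clip.cpp code): estimate an image-embedding buffer size from the
// image dimensions. patch_size, the 2x2 merge and n_mmproj_embd are illustrative assumptions.
#include <cstddef>
#include <cstdio>

static size_t estimate_embd_nbytes(int nx, int ny) {
    const int patch_size    = 14;    // assumed ViT patch size
    const int merge         = 2;     // assumed 2x2 spatial merge before the projector
    const int n_mmproj_embd = 3584;  // assumed projector output width

    const int n_patches = (nx / (patch_size * merge)) * (ny / (patch_size * merge));
    return (size_t) n_patches * n_mmproj_embd * sizeof(float);
}

int main() {
    printf("approx. %zu bytes for a 1036x1036 image\n", estimate_embd_nbytes(1036, 1036));
    return 0;
}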
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -44,10 +44,6 @@
#include <unistd.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
......@@ -1408,13 +1404,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
template<typename TB, int BLOCK_K>
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
const int NB = N / TILE_N;
const int KB = K / BLOCK_K;
const int TILE_SIZE = get_tile_size<TB>();
// parallel on NB should be enough
parallel_for(n_threads, NB, [&](int begin, int end) {
parallel_for(NB, [&](int begin, int end) {
for (int n = begin; n < end; ++n) {
for (int k = 0; k < KB; ++k) {
int n0 = n * TILE_N;
......@@ -2360,15 +2356,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
const int K = tensor->ne[0]; // ne0: in_features
const int N = tensor->ne[1]; // ne1: out_features
#if defined(_OPENMP)
// the buffer ctx is not initialized when .set_tensor is called
int n_threads = omp_get_num_threads();
#else
int n_threads = 1;
#endif
GGML_DISPATCH_QTYPES(TYPE, [&] {
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
});
}
......
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
*
* MIT License
*
......@@ -27,16 +27,10 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif
......@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d6e4bfdd..52aec229 100644
index c180adc8..000f1777 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
......
......@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 6a6f4c2a..fa09f3b3 100644
index abc1252e..626c3e3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
@@ -6400,16 +6400,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
......@@ -29,9 +29,9 @@ index 6a6f4c2a..fa09f3b3 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
@@ -6514,7 +6505,8 @@ static void llm_load_vocab(
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
......
......@@ -8,10 +8,10 @@ Subject: [PATCH] embeddings
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index fa09f3b3..d1791af0 100644
index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
......@@ -20,7 +20,7 @@ index fa09f3b3..d1791af0 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
@@ -17714,7 +17714,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
......@@ -28,7 +28,7 @@ index fa09f3b3..d1791af0 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
@@ -17722,11 +17721,15 @@ static int llama_decode_internal(
break;
}
}
......
......@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index d7c94352..427d5e02 100644
index ba28c07c..46998e4c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@
......@@ -31,7 +31,7 @@ index d7c94352..427d5e02 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
......@@ -62,7 +62,7 @@ index d7c94352..427d5e02 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
......
......@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index d1791af0..b01770d0 100644
index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -195,6 +195,7 @@ enum llm_arch {
@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
......@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN,
};
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
......@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -306,6 +308,7 @@ enum llm_kv {
@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
......@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -603,6 +607,7 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
......@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
......@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{
LLM_ARCH_UNKNOWN,
{
@@ -2401,6 +2424,7 @@ enum e_model {
@@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
......@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -2451,6 +2475,8 @@ struct llama_hparams {
@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
......@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2521,6 +2547,7 @@ struct llama_hparams {
@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
......@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2630,6 +2657,14 @@ struct llama_hparams {
@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
......@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2816,6 +2851,8 @@ struct llama_layer {
@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
......@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
};
// very similar to llama_batch,
@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
......@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0;
}
@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
......@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16652,6 +16734,158 @@ struct llm_build_context {
@@ -16671,6 +16753,158 @@ struct llm_build_context {
return gf;
}
......@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
......@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default:
GGML_ABORT("fatal error");
}
@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
......
......@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 52aec229..cbf4fddf 100644
index 000f1777..8fd7c1a3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
......
......@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask
3 files changed, 467 insertions(+), 20 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ca53a0b..d56644a8 100644
index 16f30c56..0f0f3f62 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -412,7 +412,7 @@ struct llava_embd_batch {
@@ -429,7 +429,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
......@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@ struct llava_embd_batch {
@@ -441,6 +441,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
......@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
......@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/include/llama.h b/include/llama.h
index e85f459f..aba85f86 100644
index c67988a3..0f266283 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -245,6 +245,7 @@ extern "C" {
@@ -249,6 +249,7 @@ extern "C" {
llama_token * token;
float * embd;
......@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -419,6 +420,10 @@ extern "C" {
@@ -423,6 +424,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
......@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index b01770d0..46881642 100644
index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
......@@ -82,7 +82,7 @@ index b01770d0..46881642 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -201,6 +202,7 @@ enum llm_arch {
@@ -202,6 +203,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
......@@ -90,23 +90,23 @@ index b01770d0..46881642 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -309,6 +311,7 @@ enum llm_kv {
@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -608,6 +612,14 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
......@@ -121,7 +121,7 @@ index b01770d0..46881642 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
......@@ -162,7 +162,7 @@ index b01770d0..46881642 100644
{
LLM_ARCH_BAICHUAN,
{
@@ -2432,6 +2478,7 @@ enum e_model {
@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
......@@ -170,7 +170,7 @@ index b01770d0..46881642 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2476,6 +2523,7 @@ struct llama_hparams {
@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
......@@ -178,7 +178,7 @@ index b01770d0..46881642 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2544,10 +2592,11 @@ struct llama_hparams {
@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
......@@ -194,7 +194,7 @@ index b01770d0..46881642 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2665,6 +2714,10 @@ struct llama_hparams {
@@ -2693,6 +2742,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
}
......@@ -205,7 +205,7 @@ index b01770d0..46881642 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2694,6 +2747,9 @@ struct llama_cparams {
@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
......@@ -215,7 +215,7 @@ index b01770d0..46881642 100644
enum llama_pooling_type pooling_type;
@@ -2853,6 +2909,16 @@ struct llama_layer {
@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
......@@ -232,7 +232,7 @@ index b01770d0..46881642 100644
};
// very similar to llama_batch,
@@ -3439,6 +3505,8 @@ struct llama_context {
@@ -3472,6 +3538,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
......@@ -241,7 +241,7 @@ index b01770d0..46881642 100644
};
struct llama_lora_weight {
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
......@@ -281,7 +281,7 @@ index b01770d0..46881642 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
}
// zero-out the per-layer hparams
......@@ -301,7 +301,7 @@ index b01770d0..46881642 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
@@ -5601,7 +5704,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
......@@ -310,7 +310,7 @@ index b01770d0..46881642 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
}
}
} break;
......@@ -327,7 +327,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -344,7 +344,7 @@ index b01770d0..46881642 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
}
}
} break;
......@@ -398,7 +398,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
......@@ -407,7 +407,7 @@ index b01770d0..46881642 100644
}
if (params.vocab_only) {
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
......@@ -429,7 +429,7 @@ index b01770d0..46881642 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -10513,6 +10696,7 @@ struct llm_build_context {
@@ -10561,6 +10744,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
......@@ -437,7 +437,7 @@ index b01770d0..46881642 100644
}
void free() {
@@ -10992,6 +11176,240 @@ struct llm_build_context {
@@ -11040,6 +11224,240 @@ struct llm_build_context {
return gf;
}
......@@ -678,7 +678,7 @@ index b01770d0..46881642 100644
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
......@@ -689,7 +689,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}
if (ubatch.embd) {
......@@ -712,7 +712,7 @@ index b01770d0..46881642 100644
}
if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
......@@ -721,7 +721,7 @@ index b01770d0..46881642 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
@@ -18172,7 +18603,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
......@@ -730,7 +730,7 @@ index b01770d0..46881642 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
......@@ -741,7 +741,7 @@ index b01770d0..46881642 100644
}
size_t total_size_org = 0;
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
......@@ -749,7 +749,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
......@@ -760,7 +760,7 @@ index b01770d0..46881642 100644
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
......@@ -768,7 +768,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
......@@ -776,7 +776,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
......
......@@ -5,30 +5,30 @@ Subject: [PATCH] add unpad operator
---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)
8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c4..acbcccc6 100644
index b0c1ac9c..091e6e6b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);
@@ -1718,6 +1719,15 @@ extern "C" {
int p0,
int p1);
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad(
......@@ -43,10 +43,10 @@ index 65cb92c4..acbcccc6 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
index 67e67a08..bebff207 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}
......@@ -102,12 +102,13 @@ index 23ae2e10..111ff3b0 100644
+ }
+ }
+}
+
// ggml_compute_forward_arange
@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
......@@ -116,16 +117,16 @@ index 23ae2e10..111ff3b0 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
index 8fd7c1a3..7c351b89 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
......@@ -210,34 +211,34 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
index 28f590f9..787fc713 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
@@ -910,6 +911,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
......@@ -275,10 +276,10 @@ index 093ae900..cb9a1307 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
index 8ba43904..204c93e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
......@@ -331,44 +332,44 @@ index 5caa0846..47038c31 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efa..ea2b259b 100644
index 51cc8566..0e74e554 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+ "UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
@@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
......
......@@ -11,7 +11,7 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d1dc9627..05ef0e71 100644
index 8c9aaf5a..3e372dc3 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
......