Unverified commit 7a81daf0 authored by Jeffrey Morgan, committed by GitHub

llama: update vendor code to commit ba1cb19c (#8101)

parent 60f75560
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -444,6 +444,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { ...@@ -444,6 +444,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL: case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
case LLAMA_VOCAB_PRE_TYPE_EXAONE: case LLAMA_VOCAB_PRE_TYPE_EXAONE:
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
regex_exprs = { regex_exprs = {
"\\p{N}", "\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -190,6 +190,7 @@ enum llm_arch { ...@@ -190,6 +190,7 @@ enum llm_arch {
LLM_ARCH_QWEN, LLM_ARCH_QWEN,
LLM_ARCH_QWEN2, LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE, LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2, LLM_ARCH_PHI2,
LLM_ARCH_PHI3, LLM_ARCH_PHI3,
LLM_ARCH_PLAMO, LLM_ARCH_PLAMO,
...@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { ...@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" }, { LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO, "plamo" },
...@@ -340,6 +342,7 @@ enum llm_kv { ...@@ -340,6 +342,7 @@ enum llm_kv {
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
   
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_TYPE,
...@@ -458,6 +461,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { ...@@ -458,6 +461,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
   
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
...@@ -975,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -975,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_QWEN2VL,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{ {
LLM_ARCH_QWEN2MOE, LLM_ARCH_QWEN2MOE,
{ {
...@@ -1113,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1113,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
...@@ -1778,9 +1801,10 @@ struct LLM_TN { ...@@ -1778,9 +1801,10 @@ struct LLM_TN {
// //
   
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = { static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
}; };
   
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
...@@ -1886,7 +1910,7 @@ private: ...@@ -1886,7 +1910,7 @@ private:
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) { if (!bufLen) {
ret = format("Win32 error code: %s", error_code); ret = format("Win32 error code: %lx", error_code);
} else { } else {
ret = lpMsgBuf; ret = lpMsgBuf;
LocalFree(lpMsgBuf); LocalFree(lpMsgBuf);
...@@ -2224,7 +2248,7 @@ struct llama_mmap { ...@@ -2224,7 +2248,7 @@ struct llama_mmap {
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
   
// may fail on pre-Windows 8 systems // may fail on pre-Windows 8 systems
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
   
if (pPrefetchVirtualMemory) { if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory // advise the kernel to preload the mapped memory
...@@ -2571,11 +2595,12 @@ struct llama_hparams { ...@@ -2571,11 +2595,12 @@ struct llama_hparams {
uint32_t time_decay_extra_dim = 0; uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0; uint32_t wkv_head_size = 0;
   
float rope_attn_factor = 1.0f; float rope_attn_factor = 1.0f;
float rope_freq_base_train; float rope_freq_base_train;
float rope_freq_scale_train; float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn; uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul; float rope_yarn_log_mul;
int rope_sections[4];
   
// for State Space Models // for State Space Models
uint32_t ssm_d_conv = 0; uint32_t ssm_d_conv = 0;
...@@ -2634,6 +2659,9 @@ struct llama_hparams { ...@@ -2634,6 +2659,9 @@ struct llama_hparams {
   
if (this->rope_finetuned != other.rope_finetuned) return true; if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true; if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
if (!std::equal(std::begin(this->rope_sections),
                std::end(this->rope_sections),
                std::begin(other.rope_sections))) return true;
   
if (this->ssm_d_conv != other.ssm_d_conv) return true; if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true; if (this->ssm_d_inner != other.ssm_d_inner) return true;
...@@ -3504,6 +3532,11 @@ struct llama_context { ...@@ -3504,6 +3532,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output // whether we are computing encoder output or decoder output
bool is_encoding = false; bool is_encoding = false;
   
// TODO: find a better way to accommodate multi-dimensional position encoding methods
// number of position ids each token gets; 1 per token in most cases.
// when using m-rope, there are 3 position ids per token to represent its 3-dimensional coordinate.
int n_pos_per_token = 1;
// output of the encoder part of the encoder-decoder models // output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc; std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc; std::vector<std::set<llama_seq_id>> seq_ids_enc;
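The n_pos_per_token comment a few lines above can be made concrete with a small standalone sketch. It is an illustration only, not the vendored implementation: it assumes a hypothetical helper that lays out one position id per rope section for every patch of an image grid, writing four slots per token (temporal, height, width, plus an unused fourth) to match the n_pos_per_token = 4 used for m-rope later in this diff.

#include <cstdint>
#include <vector>

// Hypothetical layout: token-major, 4 position ids per token [t, y, x, 0].
// The real buffer layout used by the vendored code may differ; this only shows
// why a single scalar position per token is not enough for m-rope.
std::vector<int32_t> build_mrope_positions(int grid_h, int grid_w, int32_t t) {
    std::vector<int32_t> pos;
    pos.reserve(static_cast<size_t>(grid_h) * grid_w * 4);
    for (int32_t y = 0; y < grid_h; ++y) {
        for (int32_t x = 0; x < grid_w; ++x) {
            pos.push_back(t); // temporal index, constant for a single image
            pos.push_back(y); // vertical patch coordinate
            pos.push_back(x); // horizontal patch coordinate
            pos.push_back(0); // fourth section, unused in this sketch
        }
    }
    return pos;
}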
...@@ -4739,9 +4772,6 @@ struct llama_model_loader { ...@@ -4739,9 +4772,6 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default: default:
{ {
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
...@@ -5505,9 +5535,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { ...@@ -5505,9 +5535,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
   
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
...@@ -5756,8 +5783,12 @@ static void llm_load_hparams( ...@@ -5756,8 +5783,12 @@ static void llm_load_hparams(
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
   
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 52: model.type = e_model::MODEL_1B; break;
case 40: model.type = e_model::MODEL_2B; break; case 40: model.type = e_model::MODEL_2B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
...@@ -5922,6 +5953,13 @@ static void llm_load_hparams( ...@@ -5922,6 +5953,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_QWEN2VL:
{
std::array<int, 4> section_dims;
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
}
// fall through
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
...@@ -6654,6 +6692,9 @@ static void llm_load_vocab( ...@@ -6654,6 +6692,9 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
vocab.tokenizer_add_bos = true; vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false; vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
...@@ -7248,7 +7289,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { ...@@ -7248,7 +7289,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
} }
   
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
...@@ -7882,7 +7923,13 @@ static bool llm_load_tensors( ...@@ -7882,7 +7923,13 @@ static bool llm_load_tensors(
   
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
   
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
else {
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
   
if (n_expert == 0) { if (n_expert == 0) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
...@@ -8396,6 +8443,7 @@ static bool llm_load_tensors( ...@@ -8396,6 +8443,7 @@ static bool llm_load_tensors(
} }
} break; } break;
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
{ {
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
   
...@@ -13064,6 +13112,124 @@ struct llm_build_context { ...@@ -13064,6 +13112,124 @@ struct llm_build_context {
return gf; return gf;
} }
   
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
cb(lctx.inp_pos, "inp_pos", -1);
ggml_set_input(lctx.inp_pos);
struct ggml_tensor * inp_pos = lctx.inp_pos;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_qwen2moe() { struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
   
...@@ -14015,153 +14181,6 @@ struct llm_build_context { ...@@ -14015,153 +14181,6 @@ struct llm_build_context {
return gf; return gf;
} }
   
// ref: https://arxiv.org/abs/2203.03466
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
const int64_t n_embd = hparams.n_embd;
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth/sqrtf(float(n_layer));
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", -1);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// scale the hidden states for residual connection
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled_ffn", -1);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head scaling
const float scale_lmhead = float(n_embd_base)/float(n_embd);
cur = ggml_scale(ctx0, cur, scale_lmhead);
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_minicpm3() { struct ggml_cgraph * build_minicpm3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
   
...@@ -17412,6 +17431,7 @@ static struct ggml_cgraph * llama_build_graph( ...@@ -17412,6 +17431,7 @@ static struct ggml_cgraph * llama_build_graph(
   
switch (model.arch) { switch (model.arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
{ {
...@@ -17467,6 +17487,11 @@ static struct ggml_cgraph * llama_build_graph( ...@@ -17467,6 +17487,11 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_qwen2(); result = llm.build_qwen2();
} break; } break;
case LLM_ARCH_QWEN2VL:
{
lctx.n_pos_per_token = 4;
result = llm.build_qwen2vl();
} break;
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
{ {
result = llm.build_qwen2moe(); result = llm.build_qwen2moe();
...@@ -17499,10 +17524,6 @@ static struct ggml_cgraph * llama_build_graph( ...@@ -17499,10 +17524,6 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_internlm2(); result = llm.build_internlm2();
} break; } break;
case LLM_ARCH_MINICPM:
{
result = llm.build_minicpm();
} break;
case LLM_ARCH_MINICPM3: case LLM_ARCH_MINICPM3:
{ {
result = llm.build_minicpm3(); result = llm.build_minicpm3();
...@@ -17702,8 +17723,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) ...@@ -17702,8 +17723,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
   
if (ubatch.pos && lctx.inp_pos) { if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens; const int64_t n_tokens = ubatch.n_tokens;
auto n_pos = lctx.n_pos_per_token;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
} }
   
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
...@@ -19191,10 +19212,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ...@@ -19191,10 +19212,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S; new_type = GGML_TYPE_IQ3_S;
} }
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K; new_type = GGML_TYPE_Q4_K;
} }
...@@ -19517,9 +19534,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ...@@ -19517,9 +19534,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
   
default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
} }
...@@ -19860,14 +19874,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ...@@ -19860,14 +19874,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data(); f32_data = (float *) f32_conv_buf.data();
} }
   
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout); fflush(stdout);
   
...@@ -19880,8 +19886,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ...@@ -19880,8 +19886,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1]; const int64_t nrows = tensor->ne[1];
   
static const int64_t min_chunk_size = 32 * 512; static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
chunk_size_multiplier;
   
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
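A quick worked example of the chunk_size expression above (a sketch for illustration, not part of the diff): min_chunk_size is 32 * 512 = 16384 elements, and when a row is shorter than that, the chunk is rounded up to a whole number of rows that reaches the minimum.

#include <cassert>
#include <cstdint>

int64_t quant_chunk_size(int64_t n_per_row) {
    static const int64_t min_chunk_size = 32 * 512;
    return n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);
}

int main() {
    assert(quant_chunk_size(4096)  == 16384); // 4 rows of 4096 reach the minimum exactly
    assert(quant_chunk_size(5120)  == 20480); // ceil(16384 / 5120) = 4 rows
    assert(quant_chunk_size(32768) == 32768); // a single row already exceeds the minimum
    return 0;
}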
...@@ -20859,6 +20864,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { ...@@ -20859,6 +20864,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3: case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX; return LLAMA_ROPE_TYPE_NEOX;
   
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
// all model arches should be listed explicitly here // all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN: case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture"); GGML_ABORT("unknown architecture");
...@@ -22434,7 +22442,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { ...@@ -22434,7 +22442,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
} }
} else if ((size_t) i >= ctx->output_ids.size()) { } else if ((size_t) i >= ctx->output_ids.size()) {
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
} else { } else {
j = ctx->output_ids[i]; j = ctx->output_ids[i];
} }
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -130,12 +130,15 @@ extern "C" { ...@@ -130,12 +130,15 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
}; };
enum llama_rope_type { enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1, LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0, LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
}; };
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
...@@ -197,9 +200,9 @@ extern "C" { ...@@ -197,9 +200,9 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
...@@ -211,7 +214,8 @@ extern "C" { ...@@ -211,7 +214,8 @@ extern "C" {
LLAMA_ROPE_SCALING_TYPE_NONE = 0, LLAMA_ROPE_SCALING_TYPE_NONE = 0,
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
LLAMA_ROPE_SCALING_TYPE_YARN = 2, LLAMA_ROPE_SCALING_TYPE_YARN = 2,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
}; };
enum llama_pooling_type { enum llama_pooling_type {
...@@ -485,6 +489,7 @@ extern "C" { ...@@ -485,6 +489,7 @@ extern "C" {
// Functions to access the model's GGUF metadata scalar values // Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure // - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure // - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions // - GGUF array values are not supported by these functions
// Get metadata value as a string by key name // Get metadata value as a string by key name
......
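The metadata comments above state that the string accessors return the string length on success, null-terminate their output, and need one extra byte allocated for the terminator. A minimal usage sketch, assuming the llama_model_meta_val_str accessor that this header declares elsewhere (elided here):

#include <cstdint>
#include <cstdio>
#include <string>
// #include "llama.h"  // declares llama_model and llama_model_meta_val_str

// Fetch a scalar metadata value by key. Per the comments above, the return value is
// the string length (not counting the terminator) or -1 on failure, and the output
// buffer is always null-terminated, so it needs room for len + 1 bytes.
static std::string get_meta_str(const struct llama_model * model, const char * key) {
    char buf[512];
    const int32_t len = llama_model_meta_val_str(model, key, buf, sizeof(buf));
    if (len < 0) {
        return {};
    }
    if (static_cast<size_t>(len) + 1 > sizeof(buf)) {
        std::fprintf(stderr, "metadata value for key '%s' was truncated\n", key);
    }
    return std::string(buf);
}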
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -285,25 +285,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -285,25 +285,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) { if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v; std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size); image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init(); struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) { for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us(); const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14; int patch_size=14;
load_image_size->width = img_res_v.data[i].nx; load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny; load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false; bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); if (clip_is_qwen2vl(ctx_clip)) {
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
} }
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
}
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
...@@ -316,8 +324,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -316,8 +324,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
int n_img_pos_out = 0; int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); std::memcpy(
n_img_pos_out += clip_n_patches(ctx_clip); image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
} }
*n_img_pos = n_img_pos_out; *n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
...@@ -413,7 +424,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co ...@@ -413,7 +424,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
if (clip_is_minicpmv(ctx_clip)) { if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10; num_max_patches = 10;
} }
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl doesn't split the image into chunks, so `num_max_patches` is not needed.
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
} else {
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
}
if (!image_embd) { if (!image_embd) {
LOG_ERR("Unable to allocate memory for image embeddings\n"); LOG_ERR("Unable to allocate memory for image embeddings\n");
return false; return false;
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -44,10 +44,6 @@ ...@@ -44,10 +44,6 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#if (defined(_WIN32) || defined(_WIN64)) #if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict #define RESTRICT __restrict
#else #else
...@@ -1408,13 +1404,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> ...@@ -1408,13 +1404,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size #define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
template<typename TB, int BLOCK_K> template<typename TB, int BLOCK_K>
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) { void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
const int NB = N / TILE_N; const int NB = N / TILE_N;
const int KB = K / BLOCK_K; const int KB = K / BLOCK_K;
const int TILE_SIZE = get_tile_size<TB>(); const int TILE_SIZE = get_tile_size<TB>();
// parallel on NB should be enough // parallel on NB should be enough
parallel_for(n_threads, NB, [&](int begin, int end) { parallel_for(NB, [&](int begin, int end) {
for (int n = begin; n < end; ++n) { for (int n = begin; n < end; ++n) {
for (int k = 0; k < KB; ++k) { for (int k = 0; k < KB; ++k) {
int n0 = n * TILE_N; int n0 = n * TILE_N;
...@@ -2360,15 +2356,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d ...@@ -2360,15 +2356,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
const int K = tensor->ne[0]; // ne0: in_features const int K = tensor->ne[0]; // ne0: in_features
const int N = tensor->ne[1]; // ne1: out_features const int N = tensor->ne[1]; // ne1: out_features
#if defined(_OPENMP)
// the buffer ctx is not initialized when .set_tensor is called
int n_threads = omp_get_num_threads();
#else
int n_threads = 1;
#endif
GGML_DISPATCH_QTYPES(TYPE, [&] { GGML_DISPATCH_QTYPES(TYPE, [&] {
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads); convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
}); });
} }
......
/** /**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* *
* MIT License * MIT License
* *
...@@ -27,16 +27,10 @@ ...@@ -27,16 +27,10 @@
#pragma once #pragma once
#include "common.h" #include "common.h"
#ifdef __cplusplus size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
extern "C" {
#endif
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif
...@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644 ...@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d6e4bfdd..52aec229 100644 index c180adc8..000f1777 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context { @@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer ...@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp diff --git a/src/llama.cpp b/src/llama.cpp
index 6a6f4c2a..fa09f3b3 100644 index abc1252e..626c3e3f 100644
--- a/src/llama.cpp --- a/src/llama.cpp
+++ b/src/llama.cpp +++ b/src/llama.cpp
@@ -6362,16 +6362,7 @@ static void llm_load_vocab( @@ -6400,16 +6400,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true; vocab.tokenizer_clean_spaces = true;
...@@ -29,9 +29,9 @@ index 6a6f4c2a..fa09f3b3 100644 ...@@ -29,9 +29,9 @@ index 6a6f4c2a..fa09f3b3 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -6473,7 +6464,8 @@ static void llm_load_vocab( @@ -6514,7 +6505,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true; tokenizer_pre == "minerva-7b") {
vocab.tokenizer_clean_spaces = false; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] embeddings ...@@ -8,10 +8,10 @@ Subject: [PATCH] embeddings
1 file changed, 6 insertions(+), 3 deletions(-) 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp diff --git a/src/llama.cpp b/src/llama.cpp
index fa09f3b3..d1791af0 100644 index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp --- a/src/llama.cpp
+++ b/src/llama.cpp +++ b/src/llama.cpp
@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { @@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
...@@ -20,7 +20,7 @@ index fa09f3b3..d1791af0 100644 ...@@ -20,7 +20,7 @@ index fa09f3b3..d1791af0 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17693,7 +17693,6 @@ static int llama_decode_internal( @@ -17714,7 +17714,6 @@ static int llama_decode_internal(
res = nullptr; res = nullptr;
embd = nullptr; embd = nullptr;
} else if (cparams.embeddings) { } else if (cparams.embeddings) {
...@@ -28,7 +28,7 @@ index fa09f3b3..d1791af0 100644 ...@@ -28,7 +28,7 @@ index fa09f3b3..d1791af0 100644
embd = nullptr; embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -17701,11 +17700,15 @@ static int llama_decode_internal( @@ -17722,11 +17721,15 @@ static int llama_decode_internal(
break; break;
} }
} }
......
...@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode ...@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-) 1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index d7c94352..427d5e02 100644 index ba28c07c..46998e4c 100644
--- a/examples/llava/clip.cpp --- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@ @@ -56,6 +56,19 @@
...@@ -31,7 +31,7 @@ index d7c94352..427d5e02 100644 ...@@ -31,7 +31,7 @@ index d7c94352..427d5e02 100644
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image // RGB uint8 image
@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { @@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
} }
...@@ -62,7 +62,7 @@ index d7c94352..427d5e02 100644 ...@@ -62,7 +62,7 @@ index d7c94352..427d5e02 100644
if (!fin) { if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n"); LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip); clip_free(new_clip);
@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { @@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
......
...@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv) ...@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-) 1 file changed, 253 insertions(+), 14 deletions(-)
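The description above says the block-skip-connection mix uses the weights (bskcn_tv, 1 - bskcn_tv). A minimal ggml sketch of that convex combination follows; the tensor names skip_inp and cur are placeholders, and this illustrates the formula only, not the patch's exact graph code.

// #include "ggml.h"

// out = bskcn_tv * skip_inp + (1 - bskcn_tv) * cur
static struct ggml_tensor * blend_skip_connection(
        struct ggml_context * ctx,
        struct ggml_tensor  * skip_inp, // hidden state saved at the skip source layer
        struct ggml_tensor  * cur,      // hidden state at the layer receiving the skip
        float                 bskcn_tv) {
    struct ggml_tensor * a = ggml_scale(ctx, skip_inp, bskcn_tv);
    struct ggml_tensor * b = ggml_scale(ctx, cur, 1.0f - bskcn_tv);
    return ggml_add(ctx, a, b);
}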
diff --git a/src/llama.cpp b/src/llama.cpp diff --git a/src/llama.cpp b/src/llama.cpp
index d1791af0..b01770d0 100644 index 9e292c4f..26be6254 100644
--- a/src/llama.cpp --- a/src/llama.cpp
+++ b/src/llama.cpp +++ b/src/llama.cpp
@@ -195,6 +195,7 @@ enum llm_arch { @@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
...@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644 ...@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
...@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644 ...@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@@ -306,6 +308,7 @@ enum llm_kv { @@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, + LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
...@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644 ...@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" }, + { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -603,6 +607,7 @@ enum llm_tensor { @@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
...@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644 ...@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
}; };
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = { static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
}, },
}, },
...@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644 ...@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@@ -2401,6 +2424,7 @@ enum e_model { @@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B, MODEL_15B,
MODEL_16B, MODEL_16B,
MODEL_20B, MODEL_20B,
...@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644 ...@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B, MODEL_30B,
MODEL_32B, MODEL_32B,
MODEL_34B, MODEL_34B,
@@ -2451,6 +2475,8 @@ struct llama_hparams { @@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644 ...@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -2521,6 +2547,7 @@ struct llama_hparams { @@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true; if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true; if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true; if (this->n_ff_arr != other.n_ff_arr) return true;
...@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644 ...@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true; if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2630,6 +2657,14 @@ struct llama_hparams { @@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner; return ssm_d_state * ssm_d_inner;
} }
} }
...@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644 ...@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
}; };
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2816,6 +2851,8 @@ struct llama_layer { @@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale; struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale; struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale; struct ggml_tensor * ffn_down_scale;
...@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644 ...@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
}; };
// very similar to llama_batch, // very similar to llama_batch,
@@ -6209,6 +6246,21 @@ static void llm_load_hparams( @@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
...@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644 ...@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0; default: (void)0;
} }
@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = { @@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644 ...@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
}; };
// checks if the weight tensor can be used with the specified buffer type and device // checks if the weight tensor can be used with the specified buffer type and device
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors( @@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644 ...@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16652,6 +16734,158 @@ struct llm_build_context { @@ -16671,6 +16753,158 @@ struct llm_build_context {
return gf; return gf;
} }
...@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644 ...@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
}; };
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) { static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph( @@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_chameleon(); result = llm.build_chameleon();
} break; } break;
...@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644 ...@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default: default:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { @@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
......
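The qwen2vl half of this patch wires the new architecture through the usual touch points: the LLM_ARCH_QWEN2VL enum and "qwen2vl" name entry, the "%s.rope.dimension_sections" GGUF key, a rope_sections array in llama_hparams, a MODEL_2B size, a dedicated graph builder, and an extra case in llama_rope_type. Sectioned (multi-axis) RoPE splits the n_rot rotary dimensions of each head into per-axis blocks. As a rough illustration only, the sketch below checks that the configured sections add up to n_rot; the struct and helper are hypothetical, not the vendored code.

// Hypothetical sketch: per-axis RoPE sections (e.g. temporal/height/width/extra)
// must together cover the n_rot rotary dimensions of each attention head.
#include <array>
#include <cstdint>
#include <numeric>
#include <stdexcept>

struct rope_sections_sketch {
    uint32_t n_rot = 128;                 // rotary dims per head (assumed value)
    std::array<int32_t, 4> sections{};    // dims assigned to each position axis
};

static void check_rope_sections(const rope_sections_sketch & s) {
    const int64_t total = std::accumulate(s.sections.begin(), s.sections.end(), int64_t{0});
    if (total != (int64_t) s.n_rot) {
        throw std::runtime_error("rope dimension sections must sum to n_rot");
    }
}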
...@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn ...@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 52aec229..cbf4fddf 100644 index 000f1777..8fd7c1a3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
......
...@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask ...@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask
3 files changed, 467 insertions(+), 20 deletions(-) 3 files changed, 467 insertions(+), 20 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ca53a0b..d56644a8 100644 index 16f30c56..0f0f3f62 100644
--- a/examples/llava/llava.cpp --- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp
@@ -412,7 +412,7 @@ struct llava_embd_batch { @@ -429,7 +429,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids; std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits; std::vector<int8_t> logits;
llama_batch batch; llama_batch batch;
...@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644 ...@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644
pos .resize(n_tokens); pos .resize(n_tokens);
n_seq_id.resize(n_tokens); n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1); seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@ struct llava_embd_batch { @@ -441,6 +441,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ embd, /*embd =*/ embd,
...@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644 ...@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644
/*pos =*/ pos.data(), /*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(), /*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(), /*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ @@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch; n_eval = n_batch;
} }
float * embd = image_embed->embed+i*n_embd; float * embd = image_embed->embed+i*n_embd;
...@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644 ...@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644
LOG_ERR("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
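In the llava change above, llava_embd_batch now records an explicit embedding width and llava_eval_image_embed passes the model's n_embd through, so a batch of image embeddings declares its own dimensionality. A minimal sketch of submitting one chunk of embeddings under that assumption follows; llama_batch_init, llama_decode and llama_batch_free are the stock API, while the n_embd batch field is the one this patch introduces.

#include <cstring>
#include "llama.h"

// Minimal sketch: decode n_tokens image-embedding rows of width n_embd starting
// at position pos0 on sequence 0. batch.n_embd is assumed to be the field added
// by this patch; everything else is the regular llama_batch API.
static bool eval_embd_chunk(llama_context * ctx, const float * embd,
                            int32_t n_tokens, int32_t n_embd, llama_pos pos0) {
    llama_batch batch = llama_batch_init(n_tokens, n_embd, /*n_seq_max*/ 1);
    batch.n_tokens = n_tokens;
    batch.n_embd   = n_embd;  // assumption: the forked llama_batch carries this field
    std::memcpy(batch.embd, embd, sizeof(float) * n_tokens * n_embd);
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.pos[i]       = pos0 + i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = i == n_tokens - 1;
    }
    const bool ok = llama_decode(ctx, batch) == 0;
    llama_batch_free(batch);
    return ok;
}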
diff --git a/include/llama.h b/include/llama.h diff --git a/include/llama.h b/include/llama.h
index e85f459f..aba85f86 100644 index c67988a3..0f266283 100644
--- a/include/llama.h --- a/include/llama.h
+++ b/include/llama.h +++ b/include/llama.h
@@ -245,6 +245,7 @@ extern "C" { @@ -249,6 +249,7 @@ extern "C" {
llama_token * token; llama_token * token;
float * embd; float * embd;
...@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644 ...@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
@@ -419,6 +420,10 @@ extern "C" { @@ -423,6 +424,10 @@ extern "C" {
struct llama_model * model, struct llama_model * model,
struct llama_context_params params); struct llama_context_params params);
...@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644 ...@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp diff --git a/src/llama.cpp b/src/llama.cpp
index b01770d0..46881642 100644 index 26be6254..4778a9ed 100644
--- a/src/llama.cpp --- a/src/llama.cpp
+++ b/src/llama.cpp +++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) { @@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
...@@ -82,7 +82,7 @@ index b01770d0..46881642 100644 ...@@ -82,7 +82,7 @@ index b01770d0..46881642 100644
LLM_ARCH_FALCON, LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK, LLM_ARCH_GROK,
@@ -201,6 +202,7 @@ enum llm_arch { @@ -202,6 +203,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
...@@ -90,23 +90,23 @@ index b01770d0..46881642 100644 ...@@ -90,23 +90,23 @@ index b01770d0..46881642 100644
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPT2, "gpt2" },
@@ -309,6 +311,7 @@ enum llm_kv { @@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, + LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, + { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -608,6 +612,14 @@ enum llm_tensor { @@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV, LLM_TENSOR_BSKCN_TV,
...@@ -121,7 +121,7 @@ index b01770d0..46881642 100644 ...@@ -121,7 +121,7 @@ index b01770d0..46881642 100644
}; };
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = { static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
}, },
}, },
...@@ -162,7 +162,7 @@ index b01770d0..46881642 100644 ...@@ -162,7 +162,7 @@ index b01770d0..46881642 100644
{ {
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
{ {
@@ -2432,6 +2478,7 @@ enum e_model { @@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B, MODEL_40B,
MODEL_65B, MODEL_65B,
MODEL_70B, MODEL_70B,
...@@ -170,7 +170,7 @@ index b01770d0..46881642 100644 ...@@ -170,7 +170,7 @@ index b01770d0..46881642 100644
MODEL_236B, MODEL_236B,
MODEL_314B, MODEL_314B,
MODEL_SMALL, MODEL_SMALL,
@@ -2476,6 +2523,7 @@ struct llama_hparams { @@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr; std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
...@@ -178,7 +178,7 @@ index b01770d0..46881642 100644 ...@@ -178,7 +178,7 @@ index b01770d0..46881642 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
@@ -2544,10 +2592,11 @@ struct llama_hparams { @@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true; if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true; if (this->n_expert_used != other.n_expert_used) return true;
...@@ -194,7 +194,7 @@ index b01770d0..46881642 100644 ...@@ -194,7 +194,7 @@ index b01770d0..46881642 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true; if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2665,6 +2714,10 @@ struct llama_hparams { @@ -2693,6 +2742,10 @@ struct llama_hparams {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
...@@ -205,7 +205,7 @@ index b01770d0..46881642 100644 ...@@ -205,7 +205,7 @@ index b01770d0..46881642 100644
}; };
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2694,6 +2747,9 @@ struct llama_cparams { @@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv; bool offload_kqv;
bool flash_attn; bool flash_attn;
bool no_perf; bool no_perf;
...@@ -215,7 +215,7 @@ index b01770d0..46881642 100644 ...@@ -215,7 +215,7 @@ index b01770d0..46881642 100644
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
@@ -2853,6 +2909,16 @@ struct llama_layer { @@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale; struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv; struct ggml_tensor * bskcn_tv;
...@@ -232,7 +232,7 @@ index b01770d0..46881642 100644 ...@@ -232,7 +232,7 @@ index b01770d0..46881642 100644
}; };
// very similar to llama_batch, // very similar to llama_batch,
@@ -3439,6 +3505,8 @@ struct llama_context { @@ -3472,6 +3538,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
...@@ -241,7 +241,7 @@ index b01770d0..46881642 100644 ...@@ -241,7 +241,7 @@ index b01770d0..46881642 100644
}; };
struct llama_lora_weight { struct llama_lora_weight {
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init( @@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer); cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) { for (int i = 0; i < (int) n_layer; i++) {
...@@ -281,7 +281,7 @@ index b01770d0..46881642 100644 ...@@ -281,7 +281,7 @@ index b01770d0..46881642 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5520,12 +5621,14 @@ static void llm_load_hparams( @@ -5547,12 +5648,14 @@ static void llm_load_hparams(
} }
// zero-out the per-layer hparams // zero-out the per-layer hparams
...@@ -301,7 +301,7 @@ index b01770d0..46881642 100644 ...@@ -301,7 +301,7 @@ index b01770d0..46881642 100644
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr; hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5574,7 +5677,7 @@ static void llm_load_hparams( @@ -5601,7 +5704,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...@@ -310,7 +310,7 @@ index b01770d0..46881642 100644 ...@@ -310,7 +310,7 @@ index b01770d0..46881642 100644
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@@ -5614,6 +5717,16 @@ static void llm_load_hparams( @@ -5641,6 +5744,16 @@ static void llm_load_hparams(
} }
} }
} break; } break;
...@@ -327,7 +327,7 @@ index b01770d0..46881642 100644 ...@@ -327,7 +327,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = { @@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -344,7 +344,7 @@ index b01770d0..46881642 100644 ...@@ -344,7 +344,7 @@ index b01770d0..46881642 100644
}; };
// checks if the weight tensor can be used with the specified buffer type and device // checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors( @@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
} }
} }
} break; } break;
...@@ -398,7 +398,7 @@ index b01770d0..46881642 100644 ...@@ -398,7 +398,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM3: case LLM_ARCH_MINICPM3:
{ {
const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam @@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE && if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) { model.hparams.n_vocab != model.vocab.id_to_token.size()) {
...@@ -407,7 +407,7 @@ index b01770d0..46881642 100644 ...@@ -407,7 +407,7 @@ index b01770d0..46881642 100644
} }
if (params.vocab_only) { if (params.vocab_only) {
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd( @@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL; return inpL;
} }
...@@ -429,7 +429,7 @@ index b01770d0..46881642 100644 ...@@ -429,7 +429,7 @@ index b01770d0..46881642 100644
static void llm_build_kv_store( static void llm_build_kv_store(
struct ggml_context * ctx, struct ggml_context * ctx,
const llama_hparams & hparams, const llama_hparams & hparams,
@@ -10513,6 +10696,7 @@ struct llm_build_context { @@ -10561,6 +10744,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr; lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr; lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr; lctx.inp_KQ_mask_cross = nullptr;
...@@ -437,7 +437,7 @@ index b01770d0..46881642 100644 ...@@ -437,7 +437,7 @@ index b01770d0..46881642 100644
} }
void free() { void free() {
@@ -10992,6 +11176,240 @@ struct llm_build_context { @@ -11040,6 +11224,240 @@ struct llm_build_context {
return gf; return gf;
} }
...@@ -678,7 +678,7 @@ index b01770d0..46881642 100644 ...@@ -678,7 +678,7 @@ index b01770d0..46881642 100644
struct ggml_cgraph * build_baichuan() { struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph( @@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_llama(); result = llm.build_llama();
} break; } break;
...@@ -689,7 +689,7 @@ index b01770d0..46881642 100644 ...@@ -689,7 +689,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
{ {
result = llm.build_baichuan(); result = llm.build_baichuan();
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) @@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
} }
if (ubatch.embd) { if (ubatch.embd) {
...@@ -712,7 +712,7 @@ index b01770d0..46881642 100644 ...@@ -712,7 +712,7 @@ index b01770d0..46881642 100644
} }
if (ubatch.pos && lctx.inp_pos) { if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal( @@ -17862,7 +18293,7 @@ static int llama_decode_internal(
n_outputs = 1; n_outputs = 1;
} }
...@@ -721,7 +721,7 @@ index b01770d0..46881642 100644 ...@@ -721,7 +721,7 @@ index b01770d0..46881642 100644
/* simple_split */ !kv_self.recurrent, /* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all); /* logits_all */ n_outputs == n_tokens_all);
@@ -18151,7 +18582,7 @@ static int llama_encode_internal( @@ -18172,7 +18603,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
...@@ -730,7 +730,7 @@ index b01770d0..46881642 100644 ...@@ -730,7 +730,7 @@ index b01770d0..46881642 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s @@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) { if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3; n_attn_layer *= 3;
} }
...@@ -741,7 +741,7 @@ index b01770d0..46881642 100644 ...@@ -741,7 +741,7 @@ index b01770d0..46881642 100644
} }
size_t total_size_org = 0; size_t total_size_org = 0;
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { @@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values // use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
...@@ -749,7 +749,7 @@ index b01770d0..46881642 100644 ...@@ -749,7 +749,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN: case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER: case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO: case LLM_ARCH_PLAMO:
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { @@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn; ctx->cparams.causal_attn = causal_attn;
} }
...@@ -760,7 +760,7 @@ index b01770d0..46881642 100644 ...@@ -760,7 +760,7 @@ index b01770d0..46881642 100644
struct llama_batch llama_batch_get_one( struct llama_batch llama_batch_get_one(
llama_token * tokens, llama_token * tokens,
int32_t n_tokens) { int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one( @@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ tokens, /*tokens =*/ tokens,
/*embd =*/ nullptr, /*embd =*/ nullptr,
...@@ -768,7 +768,7 @@ index b01770d0..46881642 100644 ...@@ -768,7 +768,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr, /*pos =*/ nullptr,
/*n_seq_id =*/ nullptr, /*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr, /*seq_id =*/ nullptr,
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ @@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0, /*n_tokens =*/ 0,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ nullptr, /*embd =*/ nullptr,
...@@ -776,7 +776,7 @@ index b01770d0..46881642 100644 ...@@ -776,7 +776,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr, /*pos =*/ nullptr,
/*n_seq_id =*/ nullptr, /*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr, /*seq_id =*/ nullptr,
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ @@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) { if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
......
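The rest of the cross-attention patch (largely collapsed above) adds per-layer cross-attention tensors to llama_layer, cross-attention flags to llama_cparams and llama_context, and reads the layer indices from the new "%s.attention.cross_attention_layers" key into llama_hparams, together with a helper that tells whether a given block is a cross-attention block. The sketch below shows the shape such a per-layer test could take; the member and function names are assumptions, not the vendored code.

// Hypothetical sketch: test whether block `il` appears in the list of
// cross-attention layer indices loaded from the GGUF metadata.
#include <array>
#include <cstdint>

struct cross_attn_hparams_sketch {
    std::array<int32_t, 8> cross_attn_layers{-1, -1, -1, -1, -1, -1, -1, -1};  // -1 = empty slot
};

static bool is_cross_attention_layer(const cross_attn_hparams_sketch & hp, uint32_t il) {
    for (int32_t idx : hp.cross_attn_layers) {
        if (idx >= 0 && (uint32_t) idx == il) {
            return true;
        }
    }
    return false;
}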
...@@ -5,30 +5,30 @@ Subject: [PATCH] add unpad operator ...@@ -5,30 +5,30 @@ Subject: [PATCH] add unpad operator
--- ---
ggml/include/ggml.h | 10 +++++ ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++ ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 + ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++ ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++- ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-) 8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c4..acbcccc6 100644 index b0c1ac9c..091e6e6b 100644
--- a/ggml/include/ggml.h --- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h +++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" { @@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD, + GGML_OP_UNPAD,
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@@ -1695,6 +1696,15 @@ extern "C" { @@ -1718,6 +1719,15 @@ extern "C" {
int p2, int p0,
int p3); int p1);
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] + // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad( + GGML_API struct ggml_tensor * ggml_unpad(
...@@ -43,10 +43,10 @@ index 65cb92c4..acbcccc6 100644 ...@@ -43,10 +43,10 @@ index 65cb92c4..acbcccc6 100644
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644 index 67e67a08..bebff207 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad( @@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
} }
} }
...@@ -102,12 +102,13 @@ index 23ae2e10..111ff3b0 100644 ...@@ -102,12 +102,13 @@ index 23ae2e10..111ff3b0 100644
+ } + }
+ } + }
+} +}
+
// ggml_compute_forward_arange // ggml_compute_forward_arange
@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad(params, tensor); ggml_compute_forward_pad_reflect_1d(params, tensor);
} break; } break;
+ case GGML_OP_UNPAD: + case GGML_OP_UNPAD:
+ { + {
...@@ -116,16 +117,16 @@ index 23ae2e10..111ff3b0 100644 ...@@ -116,16 +117,16 @@ index 23ae2e10..111ff3b0 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { @@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD: + case GGML_OP_UNPAD:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
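On the CPU backend the new op gets a forward kernel, a dispatch case, and (like PAD and UPSCALE) a single-task entry in ggml_get_n_tasks. The body of ggml_compute_forward_unpad_f32 is collapsed above; as a rough sketch of the core idea, assuming contiguous f32 tensors and hypothetical variable names, the copy only has to visit the rows that survive into the smaller destination:

#include <string.h>
#include "ggml.h"

// Hedged sketch (not the vendored kernel): dst already has the trimmed shape,
// src the padded one; copy row by row the region that still fits in dst.
static void unpad_f32_sketch(struct ggml_tensor * dst, const struct ggml_tensor * src) {
    const size_t row_bytes = (size_t) dst->ne[0] * sizeof(float);
    for (int64_t i3 = 0; i3 < dst->ne[3]; ++i3) {
        for (int64_t i2 = 0; i2 < dst->ne[2]; ++i2) {
            for (int64_t i1 = 0; i1 < dst->ne[1]; ++i1) {
                const char * src_row = (const char *) src->data + i1*src->nb[1] + i2*src->nb[2] + i3*src->nb[3];
                char       * dst_row = (char       *) dst->data + i1*dst->nb[1] + i2*dst->nb[2] + i3*dst->nb[3];
                memcpy(dst_row, src_row, row_bytes);
            }
        }
    }
}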
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644 index 8fd7c1a3..7c351b89 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
...@@ -210,34 +211,34 @@ index 8fd386b0..e2ededc3 100644 ...@@ -210,34 +211,34 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644 index 28f590f9..787fc713 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32, + GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass @@ -910,6 +911,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex @@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD: + case GGML_OP_UNPAD:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node( @@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
...@@ -275,10 +276,10 @@ index 093ae900..cb9a1307 100644 ...@@ -275,10 +276,10 @@ index 093ae900..cb9a1307 100644
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644 index 8ba43904..204c93e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32( @@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
...@@ -331,44 +332,44 @@ index 5caa0846..47038c31 100644 ...@@ -331,44 +332,44 @@ index 5caa0846..47038c31 100644
device char * dst, device char * dst,
constant int64_t & ne0, constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efa..ea2b259b 100644 index 51cc8566..0e74e554 100644
--- a/ggml/src/ggml.c --- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c +++ b/ggml/src/ggml.c
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"PAD_REFLECT_1D",
+ "UNPAD", + "UNPAD",
"ARANGE", "ARANGE",
"TIMESTEP_EMBEDDING", "TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)", + "unpad(x)",
"arange(start, stop, step)", "arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)", "timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad( @@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
......
...@@ -11,7 +11,7 @@ the characters ...@@ -11,7 +11,7 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-) 2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d1dc9627..05ef0e71 100644 index 8c9aaf5a..3e372dc3 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
......