OpenDAS / ollama · Commits

Commit 7a81daf0 (unverified)
Authored Dec 14, 2024 by Jeffrey Morgan; committed by GitHub on Dec 14, 2024

llama: update vendor code to commit ba1cb19c (#8101)

Parent: 60f75560
Changes: 273
Showing 20 changed files with 369 additions and 354 deletions (+369 / -354)
llama/llama-sampling.h  +1 -1
llama/llama-vocab.cpp  +2 -1
llama/llama-vocab.h  +1 -1
llama/llama.cpp  +198 -190
llama/llama.h  +13 -8
llama/llava.cpp  +28 -11
llama/llava.h  +1 -1
llama/log.cpp  +1 -1
llama/log.h  +1 -1
llama/mmq.cpp  +4 -15
llama/mmq.h  +2 -8
llama/patches/0001-cuda.patch  +1 -1
llama/patches/0002-pretokenizer.patch  +5 -5
llama/patches/0003-embeddings.patch  +4 -4
llama/patches/0004-clip-unicode.patch  +3 -3
llama/patches/0005-solar-pro.patch  +20 -20
llama/patches/0006-conditional-fattn.patch  +1 -1
llama/patches/0008-add-mllama-support.patch  +42 -42
llama/patches/0009-add-unpad-operator.patch  +40 -39
llama/patches/0010-fix-deepseek-deseret-regex.patch  +1 -1
llama/llama-sampling.h

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
llama/llama-vocab.cpp

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
@@ -444,6 +444,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
...
...
llama/llama-vocab.h

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
llama/llama.cpp

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
@@ -190,6 +190,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
...
...
@@ -246,6 +247,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
...
...
@@ -340,6 +342,7 @@ enum llm_kv {
LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
...
...
@@ -458,6 +461,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
...
...
@@ -975,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2VL,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2MOE,
{
...
...
@@ -1113,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
...
...
@@ -1778,9 +1801,10 @@ struct LLM_TN {
//
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
...
...
@@ -1886,7 +1910,7 @@ private:
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
ret = format("Win32 error code: %
s
", error_code);
ret = format("Win32 error code: %
lx
", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
...
...
@@ -2224,7 +2248,7 @@ struct llama_mmap {
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
// may fail on pre-Windows 8 systems
-        pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+        pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
...
...
@@ -2571,11 +2595,12 @@ struct llama_hparams {
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
    float    rope_freq_scale_train;
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul;
    int      rope_sections[4];
// for State Space Models
uint32_t ssm_d_conv = 0;
...
...
@@ -2634,6 +2659,9 @@ struct llama_hparams {
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
if (std::equal(std::begin(this->rope_sections),
std::end(this->rope_sections),
std::begin(other.rope_sections))) return true;
if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true;
...
...
@@ -3504,6 +3532,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output
bool is_encoding = false;
// TODO: find a better way to accommodate mutli-dimension position encoding methods
// number of position id each token get, 1 for each token in most cases.
// when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
int n_pos_per_token = 1;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
...
...
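The new n_pos_per_token field works together with two other hunks in this commit: build_qwen2vl() creates lctx.inp_pos with n_tokens * 4 elements, and llama_set_inputs() multiplies the copy size by n_pos_per_token. A minimal sketch of the implied flat position buffer, assuming a dimension-major ordering (that ordering is an assumption; it is not shown in this diff):

    #include <cstdint>
    #include <vector>

    // Hypothetical illustration: with m-rope, each token carries one position id
    // per coordinate dimension, so the flat buffer holds n_tokens * n_pos_per_token
    // int32 values instead of n_tokens.
    std::vector<int32_t> make_pos_buffer(int n_tokens, int n_pos_per_token) {
        std::vector<int32_t> pos(static_cast<size_t>(n_tokens) * n_pos_per_token);
        for (int d = 0; d < n_pos_per_token; ++d) {
            for (int i = 0; i < n_tokens; ++i) {
                // plain text tokens: every dimension just advances with the sequence index
                pos[d * n_tokens + i] = i;
            }
        }
        return pos; // uploaded with n_tokens*n_pos*element_size bytes, as in llama_set_inputs
    }

For image tokens the spatial dimensions would diverge from the sequence index; that logic lives in the code that fills ubatch.pos and is not part of this commit.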
@@ -4739,9 +4772,6 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
...
...
@@ -5505,9 +5535,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
default: return "unknown, may not work";
}
...
...
@@ -5756,8 +5783,12 @@ static void llm_load_hparams(
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
switch (hparams.n_layer) {
case 52: model.type = e_model::MODEL_1B; break;
case 40: model.type = e_model::MODEL_2B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
...
...
@@ -5922,6 +5953,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2VL:
{
std::array<int, 4> section_dims;
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
}
// fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
...
...
@@ -6654,6 +6692,9 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
        } else if (tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
...
...
@@ -7248,7 +7289,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
-    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
...
...
@@ -7882,7 +7923,13 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+            if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+            }
+            else {
+                layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+            }
if (n_expert == 0) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
...
...
@@ -8396,6 +8443,7 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
...
...
@@ -13064,6 +13112,124 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
cb(lctx.inp_pos, "inp_pos", -1);
ggml_set_input(lctx.inp_pos);
struct ggml_tensor * inp_pos = lctx.inp_pos;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
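The distinctive part of the new builder is ggml_rope_multi, which takes the sections array loaded from "%s.rope.dimension_sections". Conceptually, the rotary dimension pairs are partitioned between position streams (temporal, height, width, plus a spare slot). A hedged illustration of that partitioning only, using made-up section values rather than anything read from a real model:

    #include <array>
    #include <cstdio>

    // Illustrative only: map a rotary dimension pair to the position stream
    // (0 = time, 1 = height, 2 = width, 3 = spare) that rotates it, given a
    // sections[4] split such as {16, 24, 24, 0}.
    int stream_for_pair(int pair, const std::array<int, 4> & sections) {
        int limit = 0;
        for (int s = 0; s < 4; ++s) {
            limit += sections[s];
            if (pair < limit) return s;
        }
        return 3;
    }

    int main() {
        const std::array<int, 4> sections = {16, 24, 24, 0}; // hypothetical values
        for (int pair : {0, 15, 16, 39, 40, 63}) {
            std::printf("rotary pair %2d -> position stream %d\n", pair, stream_for_pair(pair, sections));
        }
        return 0;
    }

The vendored implementation expresses the same idea inside ggml_rope_multi; this sketch only shows how a sections split of the rotary pairs could be interpreted.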
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
...
...
@@ -14015,153 +14181,6 @@ struct llm_build_context {
return gf;
}
// ref: https://arxiv.org/abs/2203.03466
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
const int64_t n_embd = hparams.n_embd;
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth/sqrtf(float(n_layer));
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", -1);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// scale the hidden states for residual connection
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled_ffn", -1);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head scaling
const float scale_lmhead = float(n_embd_base)/float(n_embd);
cur = ggml_scale(ctx0, cur, scale_lmhead);
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
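The removal of build_minicpm() pairs with the earlier hunks: LLM_ARCH_MINICPM is now routed through the generic build_llama() path, and the scales that used to be hard-coded here (scale_embd = 12.0f, scale_depth = 1.4f, n_embd_base = 256) are instead read from GGUF metadata (LLM_KV_EMBEDDING_SCALE, LLM_KV_RESIDUAL_SCALE, LLM_KV_LOGIT_SCALE). A small sketch of the arithmetic the old builder applied, with example layer/embedding sizes that are assumptions, kept only to make the equivalence easy to check:

    #include <cmath>
    #include <cstdio>

    int main() {
        // constants from the removed build_minicpm(); n_layer and n_embd are example inputs
        const float scale_embd  = 12.0f;
        const float scale_depth = 1.4f;
        const float n_embd_base = 256.0f;
        const int   n_layer     = 40;      // assumed, e.g. the MODEL_2B case above
        const float n_embd      = 2304.0f; // assumed embedding width

        const float scale_res    = scale_depth / std::sqrt((float) n_layer); // residual-branch scale
        const float scale_lmhead = n_embd_base / n_embd;                     // logit scale

        std::printf("embedding scale = %f\n", scale_embd);
        std::printf("residual scale  = %f\n", scale_res);    // ~0.2214 for 40 layers
        std::printf("lm-head scale   = %f\n", scale_lmhead);
        return 0;
    }

A converted MiniCPM GGUF would need to carry equivalent values under the new keys for the shared llama graph to reproduce the old behaviour.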
struct ggml_cgraph * build_minicpm3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
...
...
@@ -17412,6 +17431,7 @@ static struct ggml_cgraph * llama_build_graph(
switch (model.arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
{
...
...
@@ -17467,6 +17487,11 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2();
} break;
case LLM_ARCH_QWEN2VL:
{
lctx.n_pos_per_token = 4;
result = llm.build_qwen2vl();
} break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
...
...
@@ -17499,10 +17524,6 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_internlm2();
} break;
case LLM_ARCH_MINICPM:
{
result = llm.build_minicpm();
} break;
case LLM_ARCH_MINICPM3:
{
result = llm.build_minicpm3();
...
...
@@ -17702,8 +17723,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens;
-        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+        auto n_pos = lctx.n_pos_per_token;
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
}
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
...
...
@@ -19191,10 +19212,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
...
...
@@ -19517,9 +19534,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
...
...
@@ -19860,14 +19874,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data();
}
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
...
...
@@ -19880,8 +19886,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1];
static const int64_t min_chunk_size = 32 * 512;
-        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
+        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
...
...
@@ -20859,6 +20864,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
...
...
@@ -22434,7 +22442,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
}
} else if ((size_t) i >= ctx->output_ids.size()) {
throw std::runtime_error(format("out of range [0, %
l
u)", ctx->output_ids.size()));
throw std::runtime_error(format("out of range [0, %
z
u)", ctx->output_ids.size()));
} else {
j = ctx->output_ids[i];
}
...
...
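Both format-string tweaks in this file are portability fixes: DWORD is an unsigned long on Windows (hence %lx instead of %s), and std::vector::size() returns a size_t, whose portable printf specifier is %zu. A tiny self-contained reminder of the latter, independent of llama.cpp:

    #include <cstdio>
    #include <cstddef>

    int main() {
        std::size_t n = 42;
        // %zu is the printf length modifier for size_t; %lu is wrong on platforms
        // where size_t is not unsigned long (e.g. 64-bit Windows).
        std::printf("out of range [0, %zu)\n", n);
        return 0;
    }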
llama/llama.h

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
@@ -130,12 +130,15 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH  = 24,
        LLAMA_VOCAB_PRE_TYPE_EXAONE        = 25,
        LLAMA_VOCAB_PRE_TYPE_CHAMELEON     = 26,
        LLAMA_VOCAB_PRE_TYPE_MINERVA       = 27,
    };

    enum llama_rope_type {
-       LLAMA_ROPE_TYPE_NONE = -1,
-       LLAMA_ROPE_TYPE_NORM = 0,
-       LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+       LLAMA_ROPE_TYPE_NONE   = -1,
+       LLAMA_ROPE_TYPE_NORM   = 0,
+       LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
+       LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+       LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
    };

    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
...
...
@@ -197,9 +200,9 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ4_XS       = 30, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_M        = 31, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_BF16         = 32, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0_4_4     = 33, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0_4_8     = 34, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0_8_8     = 35, // except 1d tensors
+       //LLAMA_FTYPE_MOSTLY_Q4_0_4_4   = 33, // removed from gguf files, use Q4_0 and runtime repack
+       //LLAMA_FTYPE_MOSTLY_Q4_0_4_8   = 34, // removed from gguf files, use Q4_0 and runtime repack
+       //LLAMA_FTYPE_MOSTLY_Q4_0_8_8   = 35, // removed from gguf files, use Q4_0 and runtime repack
        LLAMA_FTYPE_MOSTLY_TQ1_0        = 36, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ2_0        = 37, // except 1d tensors
...
...
@@ -211,7 +214,8 @@ extern "C" {
        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-       LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
+       LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
+       LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
    };

    enum llama_pooling_type {
...
...
@@ -485,6 +489,7 @@ extern "C" {
// Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
...
...
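The clarified comment block sits above llama.h's metadata accessors. A minimal usage sketch of the documented contract (the return value is the string length, the output is always null-terminated, so the buffer needs one byte beyond the string); the key name below is just an example, not something this diff reads:

    #include <cstdio>
    #include "llama.h"

    // Assumes a loaded `llama_model * model`; "general.architecture" is a common
    // GGUF key, used here purely as an illustration.
    void print_architecture(const struct llama_model * model) {
        char buf[128]; // leave room for the terminating '\0' beyond the string itself
        const int32_t len = llama_model_meta_val_str(model, "general.architecture", buf, sizeof(buf));
        if (len < 0) {
            std::printf("key not found\n");
        } else {
            std::printf("architecture: %s (%d chars)\n", buf, len);
        }
    }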
llama/llava.cpp

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
@@ -285,25 +285,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

-   if (clip_is_minicpmv(ctx_clip)) {
+   if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        struct clip_image_size * load_image_size = clip_image_size_init();

        for (size_t i = 0; i < img_res_v.size; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
-           image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+           image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
            int patch_size = 14;
            load_image_size->width  = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);

            bool encoded = false;
-           int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-           if (has_minicpmv_projector == 2) {
-               encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-           }
-           else if (has_minicpmv_projector == 3) {
-               encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-           }
+           if (clip_is_qwen2vl(ctx_clip)) {
+               encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+           }
+           else {
+               int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
+               if (has_minicpmv_projector == 2) {
+                   encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+               }
+               else if (has_minicpmv_projector == 3) {
+                   encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+               }
+           }

            if (!encoded) {
                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
...
...
@@ -316,8 +324,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    int n_img_pos_out = 0;
    for (size_t i = 0; i < image_embd_v.size(); i++) {
-       std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
-       n_img_pos_out += clip_n_patches(ctx_clip);
+       std::memcpy(
+           image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
+           image_embd_v[i],
+           clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+       n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
    }
    *n_img_pos = n_img_pos_out;
    for (size_t i = 0; i < image_embd_v.size(); i++) {
...
...
@@ -413,7 +424,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    if (clip_is_minicpmv(ctx_clip)) {
        num_max_patches = 10;
    }
-   float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
+   float * image_embd;
+   if (clip_is_qwen2vl(ctx_clip)) {
+       // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
+       image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
+   } else {
+       image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
+   }
    if (!image_embd) {
        LOG_ERR("Unable to allocate memory for image embeddings\n");
        return false;
...
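Per-image allocation matters here because a resolution-flexible encoder produces a patch count that grows with the input size, instead of being bounded by a fixed num_max_patches. A rough, hedged sketch of the kind of sizing clip_embd_nbytes_by_img implies; the patch size and embedding width below are illustrative values, not figures taken from this code:

    #include <cstddef>

    // Illustration only: estimate the embedding buffer for one image, assuming a
    // simple patch grid. The real computation lives in clip.cpp and may differ
    // (e.g. spatial merging of neighbouring patches).
    std::size_t example_embd_nbytes(int nx, int ny, int patch_size, int n_mmproj_embd) {
        const int n_patches = (nx / patch_size) * (ny / patch_size);
        return static_cast<std::size_t>(n_patches) * n_mmproj_embd * sizeof(float);
    }

    // e.g. example_embd_nbytes(1344, 1008, 14, 3584) is sized per image, unlike the
    // old fixed clip_embd_nbytes(ctx_clip) * num_max_patches upper bound.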
...
llama/llava.h

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
llama/log.cpp

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
llama/log.h

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
llama/mmq.cpp

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
@@ -44,10 +44,6 @@
#include <unistd.h>
#endif
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
...
...
@@ -1408,13 +1404,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
template <typename TB, int BLOCK_K>
-void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
+void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
    const int NB = N / TILE_N;
    const int KB = K / BLOCK_K;
    const int TILE_SIZE = get_tile_size<TB>();

    // parallel on NB should be enough
-   parallel_for(n_threads, NB, [&](int begin, int end) {
+   parallel_for(NB, [&](int begin, int end) {
        for (int n = begin; n < end; ++n) {
            for (int k = 0; k < KB; ++k) {
                int n0 = n * TILE_N;
...
...
@@ -2360,15 +2356,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
    const int K = tensor->ne[0]; // ne0: in_features
    const int N = tensor->ne[1]; // ne1: out_features

-#if defined(_OPENMP)
-   // the buffer ctx is not initialized when .set_tensor is called
-   int n_threads = omp_get_num_threads();
-#else
-   int n_threads = 1;
-#endif
    GGML_DISPATCH_QTYPES(TYPE, [&] {
-       convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
+       convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
    });
}
...
...
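The two mmq.cpp hunks drop the OpenMP thread-count plumbing: convert_B_packed_format no longer takes n_threads, and parallel_for is called with only the block count. The vendored parallel_for helper itself is not part of this diff; purely as a hedged stand-in, a helper with that shape could look like the following (the real one presumably uses ggml's own threading rather than std::thread):

    #include <algorithm>
    #include <functional>
    #include <thread>
    #include <vector>

    // Hypothetical stand-in for a parallel_for(n, fn) helper: split [0, n) into
    // contiguous ranges and run fn(begin, end) on each from its own thread.
    static void parallel_for(int n, const std::function<void(int, int)> & fn) {
        const int n_threads = std::max(1u, std::thread::hardware_concurrency());
        std::vector<std::thread> workers;
        for (int t = 0; t < n_threads; ++t) {
            const int begin = n *  t      / n_threads;
            const int end   = n * (t + 1) / n_threads;
            if (begin < end) {
                workers.emplace_back(fn, begin, end);
            }
        }
        for (auto & w : workers) {
            w.join();
        }
    }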
llama/mmq.h

 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
...
...
@@ -27,16 +27,10 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif

size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);

size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);

void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);

void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif
llama/patches/0001-cuda.patch
...
...
@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index d6e4bfdd..52aec229 100644
+index c180adc8..000f1777 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
...
...
llama/patches/0002-pretokenizer.patch
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index 6a6f4c2a..fa09f3b3 100644
+index abc1252e..626c3e3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
+@@ -6400,16 +6400,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
...
...
@@ -29,9 +29,9 @@ index 6a6f4c2a..fa09f3b3 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
-@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
-         vocab.tokenizer_add_bos = true;
-         vocab.tokenizer_clean_spaces = false;
+@@ -6514,7 +6505,8 @@ static void llm_load_vocab(
+         tokenizer_pre == "minerva-7b") {
+         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
...
...
llama/patches/0003-embeddings.patch
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] embeddings
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index fa09f3b3..d1791af0 100644
+index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
...
...
@@ -20,7 +20,7 @@ index fa09f3b3..d1791af0 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
+@@ -17714,7 +17714,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
...
...
@@ -28,7 +28,7 @@ index fa09f3b3..d1791af0 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
+@@ -17722,11 +17721,15 @@ static int llama_decode_internal(
break;
}
}
...
...
llama/patches/0004-clip-unicode.patch
...
...
@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index d7c94352..427d5e02 100644
+index ba28c07c..46998e4c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@
...
...
@@ -31,7 +31,7 @@ index d7c94352..427d5e02 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
-@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
...
...
@@ -62,7 +62,7 @@ index d7c94352..427d5e02 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
-@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
...
...
llama/patches/0005-solar-pro.patch
...
...
@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index d1791af0..b01770d0 100644
+index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -195,6 +195,7 @@ enum llm_arch {
+@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
...
...
@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN,
};
-@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
...
...
@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
-@@ -306,6 +308,7 @@ enum llm_kv {
+@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
-    LLM_KV_ROPE_FREQ_BASE,
-@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
+@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
...
...
@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-@@ -603,6 +607,7 @@ enum llm_tensor {
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
...
...
@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
-@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
...
...
@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{
LLM_ARCH_UNKNOWN,
{
-@@ -2401,6 +2424,7 @@ enum e_model {
+@@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
...
...
@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
-@@ -2451,6 +2475,8 @@ struct llama_hparams {
+@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...
...
@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -2521,6 +2547,7 @@ struct llama_hparams {
+@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
...
...
@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2630,6 +2657,14 @@ struct llama_hparams {
+@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
...
...
@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2816,6 +2851,8 @@ struct llama_layer {
+@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
...
...
@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
};
// very similar to llama_batch,
-@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
+@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
...
...
@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0;
}
-@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...
...
@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
-@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
+@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...
...
@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -16652,6 +16734,158 @@ struct llm_build_context {
+@@ -16671,6 +16753,158 @@ struct llm_build_context {
return gf;
}
...
...
@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
...
...
@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default:
GGML_ABORT("fatal error");
}
-@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
...
...
llama/patches/0006-conditional-fattn.patch
...
...
@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 52aec229..cbf4fddf 100644
+index 000f1777..8fd7c1a3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
...
...
llama/patches/0008-add-mllama-support.patch
...
...
@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask
3 files changed, 467 insertions(+), 20 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index 4ca53a0b..d56644a8 100644
+index 16f30c56..0f0f3f62 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
-@@ -412,7 +412,7 @@ struct llava_embd_batch {
+@@ -429,7 +429,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
...
...
@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
-@@ -424,6 +424,7 @@ struct llava_embd_batch {
+@@ -441,6 +441,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
...
...
@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
-@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
+@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
...
...
@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/include/llama.h b/include/llama.h
-index e85f459f..aba85f86 100644
+index c67988a3..0f266283 100644
--- a/include/llama.h
+++ b/include/llama.h
-@@ -245,6 +245,7 @@ extern "C" {
+@@ -249,6 +249,7 @@ extern "C" {
llama_token * token;
float * embd;
...
...
@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
-@@ -419,6 +420,10 @@ extern "C" {
+@@ -423,6 +424,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
...
...
@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
-index b01770d0..46881642 100644
+index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@
static std::string format(const char * fmt, ...) {
...
...
@@ -82,7 +82,7 @@ index b01770d0..46881642 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
-@@ -201,6 +202,7 @@ enum llm_arch {
+@@ -202,6 +203,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
...
...
@@ -90,23 +90,23 @@ index b01770d0..46881642 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
-@@ -309,6 +311,7 @@ enum llm_kv {
+@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT,
-    LLM_KV_ROPE_FREQ_BASE,
-@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
+@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-@@ -608,6 +612,14 @@ enum llm_tensor {
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
...
...
@@ -121,7 +121,7 @@ index b01770d0..46881642 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
-@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
...
...
@@ -162,7 +162,7 @@ index b01770d0..46881642 100644
{
LLM_ARCH_BAICHUAN,
{
-@@ -2432,6 +2478,7 @@ enum e_model {
+@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
...
...
@@ -170,7 +170,7 @@ index b01770d0..46881642 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
-@@ -2476,6 +2523,7 @@ struct llama_hparams {
+@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
...
...
@@ -178,7 +178,7 @@ index b01770d0..46881642 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
-@@ -2544,10 +2592,11 @@ struct llama_hparams {
+@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
...
...
@@ -194,7 +194,7 @@ index b01770d0..46881642 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2665,6 +2714,10 @@ struct llama_hparams {
+@@ -2693,6 +2742,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
}
...
...
@@ -205,7 +205,7 @@ index b01770d0..46881642 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2694,6 +2747,9 @@ struct llama_cparams {
+@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
...
...
@@ -215,7 +215,7 @@ index b01770d0..46881642 100644
enum llama_pooling_type pooling_type;
-@@ -2853,6 +2909,16 @@ struct llama_layer {
+@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
...
...
@@ -232,7 +232,7 @@ index b01770d0..46881642 100644
};
// very similar to llama_batch,
-@@ -3439,6 +3505,8 @@ struct llama_context {
+@@ -3472,6 +3538,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
...
...
@@ -241,7 +241,7 @@ index b01770d0..46881642 100644
};
struct llama_lora_weight {
-@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
+@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
...
...
@@ -281,7 +281,7 @@ index b01770d0..46881642 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
-@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
+@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
}
// zero-out the per-layer hparams
...
...
@@ -301,7 +301,7 @@ index b01770d0..46881642 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
+@@ -5601,7 +5704,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...
...
@@ -310,7 +310,7 @@ index b01770d0..46881642 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
-@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
+@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
}
}
} break;
...
...
@@ -327,7 +327,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
+@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...
...
@@ -344,7 +344,7 @@ index b01770d0..46881642 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
-@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
+@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
}
}
} break;
...
...
@@ -398,7 +398,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
-@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
...
...
@@ -407,7 +407,7 @@ index b01770d0..46881642 100644
}
if (params.vocab_only) {
-@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
+@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
...
...
@@ -429,7 +429,7 @@ index b01770d0..46881642 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
-@@ -10513,6 +10696,7 @@ struct llm_build_context {
+@@ -10561,6 +10744,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
...
...
@@ -437,7 +437,7 @@ index b01770d0..46881642 100644
}
void free() {
-@@ -10992,6 +11176,240 @@ struct llm_build_context {
+@@ -11040,6 +11224,240 @@ struct llm_build_context {
return gf;
}
...
...
@@ -678,7 +678,7 @@ index b01770d0..46881642 100644
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
...
...
@@ -689,7 +689,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
-@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
+@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}
if (ubatch.embd) {
...
...
@@ -712,7 +712,7 @@ index b01770d0..46881642 100644
}
if (ubatch.pos && lctx.inp_pos) {
-@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
+@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
...
...
@@ -721,7 +721,7 @@ index b01770d0..46881642 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
-@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
+@@ -18172,7 +18603,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
...
...
@@ -730,7 +730,7 @@ index b01770d0..46881642 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
-@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
...
...
@@ -741,7 +741,7 @@ index b01770d0..46881642 100644
}
size_t total_size_org = 0;
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
...
...
@@ -749,7 +749,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
...
...
@@ -760,7 +760,7 @@ index b01770d0..46881642 100644
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
...
...
@@ -768,7 +768,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
...
...
@@ -776,7 +776,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
...
...
llama/patches/0009-add-unpad-operator.patch
View file @ 7a81daf0
...
...
@@ -5,30 +5,30 @@ Subject: [PATCH] add unpad operator
---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)
8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c4..acbcccc6 100644
index b0c1ac9c..091e6e6b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);
@@ -1718,6 +1719,15 @@ extern "C" {
int p0,
int p1);
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad(
...
...
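For orientation (not part of the patch itself): ggml_unpad is the inverse of ggml_pad, trimming trailing elements from each dimension. The full signature is elided in the excerpt above, so the following is only a minimal C sketch that assumes it mirrors ggml_pad's (context, tensor, p0..p3) parameters:
// hypothetical round-trip, assuming ggml_unpad(ctx, a, p0, p1, p2, p3) trims p_i elements
// from the end of dimension i (the counterpart of ggml_pad, which appends zeros there)
struct ggml_tensor * x      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 100, 64);
struct ggml_tensor * padded = ggml_pad(ctx, x, 28, 0, 0, 0);        // dim 0: 100 -> 128
// ... run kernels that want the padded shape ...
struct ggml_tensor * y      = ggml_unpad(ctx, padded, 28, 0, 0, 0); // dim 0: 128 -> 100 (assumed signature)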
@@ -43,10 +43,10 @@ index 65cb92c4..acbcccc6 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
index 67e67a08..bebff207 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}
...
...
@@ -102,12 +102,13 @@ index 23ae2e10..111ff3b0 100644
+ }
+ }
+}
+
// ggml_compute_forward_arange
@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
...
...
@@ -116,16 +117,16 @@ index 23ae2e10..111ff3b0 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
index 8fd7c1a3..7c351b89 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
...
...
@@ -210,34 +211,34 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
index 28f590f9..787fc713 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
@@ -910,6 +911,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
...
...
@@ -275,10 +276,10 @@ index 093ae900..cb9a1307 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
index 8ba43904..204c93e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
...
...
@@ -331,44 +332,44 @@ index 5caa0846..47038c31 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efa..ea2b259b 100644
index 51cc8566..0e74e554 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+ "UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
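Aside (not from the patch): the ggml.c hunks above all follow the same bookkeeping rule — a new ggml_op entry must be added to both the GGML_OP_NAME and GGML_OP_SYMBOL tables, with the hardcoded GGML_OP_COUNT static_asserts bumped in lockstep. A self-contained toy illustration of that invariant, with made-up names:
#include <assert.h>

// toy version of the invariant the hunks above maintain: the name table must cover
// every enum entry, and the hardcoded count check is bumped whenever an op is added
enum demo_op { DEMO_OP_PAD, DEMO_OP_PAD_REFLECT_1D, DEMO_OP_UNPAD, DEMO_OP_COUNT };
static const char * DEMO_OP_NAME[DEMO_OP_COUNT] = { "PAD", "PAD_REFLECT_1D", "UNPAD" };
static_assert(DEMO_OP_COUNT == 3, "DEMO_OP_COUNT != 3"); // adding an op without bumping this fails the build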
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
@@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
...
...
llama/patches/0010-fix-deepseek-deseret-regex.patch
View file @ 7a81daf0
...
...
@@ -11,7 +11,7 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d1dc9627..05ef0e71 100644
index 8c9aaf5a..3e372dc3 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
...