Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
527cc978
Unverified
Commit
527cc978
authored
Dec 10, 2024
by
Jeffrey Morgan
Committed by
GitHub
Dec 10, 2024
Browse files
llama: update vendored code to commit 40c6d79f (#7875)
parent
a37f4a86
Changes
288
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1372 additions
and
605 deletions
+1372
-605
llama/patches/0002-pretokenizer.patch
llama/patches/0002-pretokenizer.patch
+3
-3
llama/patches/0003-embeddings.patch
llama/patches/0003-embeddings.patch
+10
-14
llama/patches/0003-metal.patch
llama/patches/0003-metal.patch
+0
-54
llama/patches/0004-clip-unicode.patch
llama/patches/0004-clip-unicode.patch
+6
-6
llama/patches/0004-ggml-metal.patch
llama/patches/0004-ggml-metal.patch
+0
-24
llama/patches/0005-solar-pro.patch
llama/patches/0005-solar-pro.patch
+49
-44
llama/patches/0006-conditional-fattn.patch
llama/patches/0006-conditional-fattn.patch
+6
-6
llama/patches/0007-blas.patch
llama/patches/0007-blas.patch
+26
-0
llama/patches/0008-add-mllama-support.patch
llama/patches/0008-add-mllama-support.patch
+169
-115
llama/patches/0009-add-unpad-operator.patch
llama/patches/0009-add-unpad-operator.patch
+126
-139
llama/patches/0010-fix-deepseek-deseret-regex.patch
llama/patches/0010-fix-deepseek-deseret-regex.patch
+12
-6
llama/patches/0011-relative-include-paths.patch
llama/patches/0011-relative-include-paths.patch
+64
-0
llama/runner/runner.go
llama/runner/runner.go
+1
-2
llama/sampling.cpp
llama/sampling.cpp
+156
-109
llama/sampling.h
llama/sampling.h
+42
-21
llama/sampling_ext.cpp
llama/sampling_ext.cpp
+15
-29
llama/sampling_ext.h
llama/sampling_ext.h
+10
-23
llama/sgemm.cpp
llama/sgemm.cpp
+665
-0
llama/unicode-data.cpp
llama/unicode-data.cpp
+7
-5
llama/unicode-data.h
llama/unicode-data.h
+5
-5
No files found.
llama/patches/0002-pretokenizer.patch
View file @
527cc978
...
...
@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index
4c0a1bb6..800dfb95
100644
index
6a6f4c2a..fa09f3b3
100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6
287
,16 +6
287
,7 @@
static void llm_load_vocab(
@@ -6
362
,16 +6
362
,7 @@
static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
...
...
@@ -29,7 +29,7 @@ index 4c0a1bb6..800dfb95 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6
398
,7 +6
389
,8 @@
static void llm_load_vocab(
@@ -6
473
,7 +6
464
,8 @@
static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {
...
...
llama/patches/000
5
-embeddings.patch
→
llama/patches/000
3
-embeddings.patch
View file @
527cc978
...
...
@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] embeddings
---
src/llama.cpp |
15 ++
++++++
+---
---
1 file changed,
9
insertions(+),
6
deletions(-)
src/llama.cpp |
9
++++++---
1 file changed,
6
insertions(+),
3
deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index
800dfb95..a639522d
100644
index
fa09f3b3..d1791af0
100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1
6920
,7 +1
6920
,7 @@
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -1
7398
,7 +1
7398
,7 @@
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
...
...
@@ -20,20 +20,15 @@ index 800dfb95..a639522d 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17192,20 +17192,23 @@
static int llama_decode_internal(
// no output
@@ -17693,7 +17693,6 @@
static int llama_decode_internal(
res = nullptr;
embd = nullptr;
-
} else if (cparams.embeddings) {
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ embd = ggml_graph_node(gf, i);
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
- embd = ggml_graph_node(gf, i);
@@ -17701,11 +17700,15 @@
static int llama_decode_internal(
break;
}
}
...
...
@@ -46,6 +41,7 @@ index 800dfb95..a639522d 100644
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
ggml_backend_sched_alloc_graph(lctx.sched
.get()
, gf);
llama/patches/0003-metal.patch
deleted
100644 → 0
View file @
a37f4a86
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] metal
---
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9da08fe2..3a433703 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1720,27 +1720,23 @@
static void ggml_metal_encode_node(
// to the matrix-vector kernel
int ne11_mm_min = 1;
-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
llama/patches/000
6
-clip-unicode.patch
→
llama/patches/000
4
-clip-unicode.patch
View file @
527cc978
...
...
@@ -8,12 +8,12 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index
14e02c8d..6e849d8e
100644
index
d7c94352..427d5e02
100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -
44
,6 +
44
,19 @@
#define LOG_
ERR
(...) do { fprintf(std
err
, __VA_ARGS__); } while (0)
#
define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0
)
@@ -
56
,6 +
56
,19 @@
#
define LOG_
DBG
(...) do { fprintf(std
out
, __VA_ARGS__); } while (0)
#
endif // defined(LLAVA_LOG_OFF
)
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
...
...
@@ -31,7 +31,7 @@ index 14e02c8d..6e849d8e 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -122
5
,8 +12
38
,29 @@
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -12
4
2,8 +12
55
,29 @@
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
...
...
@@ -62,7 +62,7 @@ index 14e02c8d..6e849d8e 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -12
66
,7 +13
00
,11 @@
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -12
83
,7 +13
17
,11 @@
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
...
...
llama/patches/0004-ggml-metal.patch
deleted
100644 → 0
View file @
a37f4a86
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Wed, 12 Jun 2024 12:18:40 -0700
Subject: [PATCH] ggml-metal
---
ggml/src/ggml-metal.m | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 3a433703..829c5e39 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -392,8 +392,8 @@
static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
#if GGML_METAL_EMBED_LIBRARY
GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
- extern const char ggml_metallib_start[];
- extern const char ggml_metallib_end[];
+ extern const char *ggml_metallib_start;
+ extern const char *ggml_metallib_end;
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
llama/patches/000
7
-solar-pro.patch
→
llama/patches/000
5
-solar-pro.patch
View file @
527cc978
...
...
@@ -11,14 +11,14 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 26
9
+++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 25
5
insertions(+), 14 deletions(-)
src/llama.cpp | 26
7
+++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 25
3
insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index
a639522d..83b80b59
100644
index
d1791af0..b01770d0
100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -
217
,6 +
217
,7 @@
enum llm_arch {
@@ -
195
,6 +
195
,7 @@
enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
...
...
@@ -26,7 +26,7 @@ index a639522d..83b80b59 100644
LLM_ARCH_UNKNOWN,
};
@@ -2
70
,6 +2
71
,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -2
49
,6 +2
50
,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
...
...
@@ -34,7 +34,7 @@ index a639522d..83b80b59 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -3
27
,6 +3
29
,7 @@
enum llm_kv {
@@ -3
06
,6 +3
08
,7 @@
enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
...
...
@@ -42,7 +42,7 @@ index a639522d..83b80b59 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -4
21
,20 +4
24
,21 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -4
08
,20 +4
11
,21 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
...
...
@@ -78,15 +78,15 @@ index a639522d..83b80b59 100644
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -60
8
,6 +6
12
,7 @@
enum llm_tensor {
@@ -60
3
,6 +6
07
,7 @@
enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
};
static const std::map<llm_arch, std::map<llm_tensor,
std::string
>> LLM_TENSOR_NAMES = {
@@ -15
27
,6 +15
32
,24 @@
static const std::map<llm_arch, std::map<llm_tensor,
std::string
>> LLM_TENSOR_N
A
static const std::map<llm_arch, std::map<llm_tensor,
const char *
>> LLM_TENSOR_NAMES = {
@@ -15
41
,6 +15
46
,24 @@
static const std::map<llm_arch, std::map<llm_tensor,
const char *
>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
...
...
@@ -111,15 +111,15 @@ index a639522d..83b80b59 100644
{
LLM_ARCH_UNKNOWN,
{
@@ -2
360
,6 +2
383
,7 @@
enum e_model {
@@ -2
401
,6 +2
424
,7 @@
enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
MODEL_35B,
@@ -2409,6 +2433,8 @@
struct llama_hparams {
@@ -2451,6 +2475,8 @@
struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...
...
@@ -128,7 +128,7 @@ index a639522d..83b80b59 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2
479
,6 +25
05
,7 @@
struct llama_hparams {
@@ -2
521
,6 +25
47
,7 @@
struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
...
...
@@ -136,7 +136,7 @@ index a639522d..83b80b59 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2
588
,6 +26
1
5,14 @@
struct llama_hparams {
@@ -2
630
,6 +265
7
,14 @@
struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
...
...
@@ -151,7 +151,7 @@ index a639522d..83b80b59 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2
769
,6 +28
04
,8 @@
struct llama_layer {
@@ -2
816
,6 +28
51
,8 @@
struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
...
...
@@ -160,7 +160,7 @@ index a639522d..83b80b59 100644
};
// very similar to llama_batch,
@@ -6
134
,6 +6
171
,21 @@
static void llm_load_hparams(
@@ -6
209
,6 +6
246
,21 @@
static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
...
...
@@ -182,46 +182,51 @@ index a639522d..83b80b59 100644
default: (void)0;
}
@@ -8831,6 +8883,38 @@
static bool llm_load_tensors(
@@ -7198,6 +7250,7 @@
static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -9205,6 +9258,35 @@
static bool llm_load_tensors(
layer.ffn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}
, 0
);
+ layer.ffn_gate =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}
, 0
);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}
, 0
);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}
, 0
);
+ }
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd =
ml.
create_tensor(
ctx_input,
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}
, 0
);
+
+ // output
+ {
+ model.output_norm =
ml.
create_tensor(
ctx_output,
tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output =
ml.
create_tensor(
ctx_output_split,
tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}
, 0
);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}
, 0
);
+
+ layer.wq =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}
, 0
);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}
, 0
);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}
, 0
);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}
, 0
);
+
+ layer.ffn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}
, 0
);
+
+ layer.bskcn_tv =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight"
, i
), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -16
179
,6 +16
263
,158 @@
struct llm_build_context {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}
, 0
);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}
, 0
);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}
, 0
);
@@ -16
652
,6 +16
734
,158 @@
struct llm_build_context {
return gf;
}
...
...
@@ -239,7 +244,7 @@ index a639522d..83b80b59 100644
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams,
u
batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
...
...
@@ -380,7 +385,7 @@ index a639522d..83b80b59 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16
443
,6 +1
6679
,10 @@
static struct ggml_cgraph * llama_build_graph(
@@ -16
921
,6 +1
7155
,10 @@
static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
...
...
@@ -391,7 +396,7 @@ index a639522d..83b80b59 100644
default:
GGML_ABORT("fatal error");
}
@@ -
19589,6 +19829
,7 @@
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -
20132,6 +20370
,7 @@
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
...
...
llama/patches/000
8
-conditional-fattn.patch
→
llama/patches/000
6
-conditional-fattn.patch
View file @
527cc978
...
...
@@ -4,14 +4,14 @@ Date: Wed, 9 Oct 2024 17:26:23 -0700
Subject: [PATCH] conditional-fattn
---
ggml/src/ggml-cuda.cu | 2 ++
ggml/src/ggml-cuda
/ggml-cuda
.cu | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index
809d6ab1..fe77b81c
100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2
347
,9 +2
347
,11 @@
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
diff --git a/ggml/src/ggml-cuda
/ggml-cuda
.cu b/ggml/src/ggml-cuda
/ggml-cuda
.cu
index
52aec229..cbf4fddf
100644
--- a/ggml/src/ggml-cuda
/ggml-cuda
.cu
+++ b/ggml/src/ggml-cuda
/ggml-cuda
.cu
@@ -2
162
,9 +2
162
,11 @@
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
...
...
llama/patches/000
9
-blas.patch
→
llama/patches/000
7
-blas.patch
View file @
527cc978
...
...
@@ -4,22 +4,23 @@ Date: Mon, 30 Sep 2024 16:31:04 -0700
Subject: [PATCH] blas
---
ggml/src/ggml-blas.cpp | 4 ++++
ggml/src/ggml-blas
/ggml-blas
.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index
6d99c6be..8e1ab99d
100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
diff --git a/ggml/src/ggml-blas
/ggml-blas
.cpp b/ggml/src/ggml-blas
/ggml-blas
.cpp
index
ec158dfa..b3ac1fa4
100644
--- a/ggml/src/ggml-blas
/ggml-blas
.cpp
+++ b/ggml/src/ggml-blas
/ggml-blas
.cpp
@@ -1,3 +1,5 @@
+#ifdef GGML_USE_BLAS
+
#include "ggml-impl.h"
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
@@ -366,3 +368,5 @@
void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
@@ -515,3 +517,5 @@
ggml_backend_reg_t ggml_backend_blas_reg(void) {
}
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
+
+#endif
+#endif // GGML_USE_BLAS
\
No newline at end of file
llama/patches/00
1
0-add-mllama-support.patch
→
llama/patches/000
8
-add-mllama-support.patch
View file @
527cc978
...
...
@@ -12,29 +12,46 @@ kv cache once per run
remaining is to implement the cross attention mask
---
examples/llava/llava.cpp |
2
+-
examples/llava/llava.cpp |
5
+-
include/llama.h | 5 +
src/llama.cpp | 4
4
7 +++++++++++++++++++++++++++++++++++++--
3 files changed, 4
3
6 insertions(+),
18
deletions(-)
src/llama.cpp | 4
7
7 +++++++++++++++++++++++++++++++++++++--
3 files changed, 46
7
insertions(+),
20
deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index
8558c6bd..37b2f2e2
100644
index
4ca53a0b..d56644a8
100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -409,7 +409,7 @@
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
if (n_eval > n_batch) {
@@ -412,7 +412,7 @@
struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@
struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
- llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+ llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/include/llama.h b/include/llama.h
index
7cae1bbe..aca09310
100644
index
e85f459f..aba85f86
100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -24
0
,6 +24
0
,7 @@
extern "C" {
@@ -24
5
,6 +24
5
,7 @@
extern "C" {
llama_token * token;
float * embd;
...
...
@@ -42,7 +59,7 @@ index 7cae1bbe..aca09310 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -4
23
,6 +42
4
,10 @@
extern "C" {
@@ -4
19
,6 +42
0
,10 @@
extern "C" {
struct llama_model * model,
struct llama_context_params params);
...
...
@@ -54,10 +71,10 @@ index 7cae1bbe..aca09310 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index
83b80b59..35748488
100644
index
b01770d0..46881642
100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16
9
,6 +16
9
,7 @@
static std::string format(const char * fmt, ...) {
@@ -1
4
6,6 +1
4
6,7 @@
static std::string format(const char * fmt, ...) {
enum llm_arch {
LLM_ARCH_LLAMA,
...
...
@@ -65,7 +82,7 @@ index 83b80b59..35748488 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -2
23
,6 +22
4
,7 @@
enum llm_arch {
@@ -2
01
,6 +2
0
2,7 @@
enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
...
...
@@ -73,7 +90,7 @@ index 83b80b59..35748488 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -3
3
0,6 +3
32
,7 @@
enum llm_kv {
@@ -30
9
,6 +3
11
,7 @@
enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...
...
@@ -81,7 +98,7 @@ index 83b80b59..35748488 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -4
39
,6 +4
4
2,7 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -4
26
,6 +42
9
,7 @@
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
...
...
@@ -89,7 +106,7 @@ index 83b80b59..35748488 100644
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -6
13
,6 +61
7
,14 @@
enum llm_tensor {
@@ -6
08
,6 +61
2
,14 @@
enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
...
...
@@ -103,8 +120,8 @@ index 83b80b59..35748488 100644
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
};
static const std::map<llm_arch, std::map<llm_tensor,
std::string
>> LLM_TENSOR_NAMES = {
@@ -6
42
,6 +6
5
4,40 @@
static const std::map<llm_arch, std::map<llm_tensor,
std::string
>> LLM_TENSOR_N
A
static const std::map<llm_arch, std::map<llm_tensor,
const char *
>> LLM_TENSOR_NAMES = {
@@ -6
37
,6 +64
9
,40 @@
static const std::map<llm_arch, std::map<llm_tensor,
const char *
>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
...
...
@@ -145,7 +162,7 @@ index 83b80b59..35748488 100644
{
LLM_ARCH_BAICHUAN,
{
@@ -2
390
,6 +24
36
,7 @@
enum e_model {
@@ -2
432
,6 +24
78
,7 @@
enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
...
...
@@ -153,7 +170,7 @@ index 83b80b59..35748488 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -24
34
,6 +2
481
,7 @@
struct llama_hparams {
@@ -24
76
,6 +2
523
,7 @@
struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
...
...
@@ -161,7 +178,7 @@ index 83b80b59..35748488 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -25
02
,10 +25
50
,11 @@
struct llama_hparams {
@@ -25
44
,10 +25
92
,11 @@
struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
...
...
@@ -177,7 +194,7 @@ index 83b80b59..35748488 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -26
23
,6 +2
672
,10 @@
struct llama_hparams {
@@ -26
65
,6 +2
714
,10 @@
struct llama_hparams {
GGML_ABORT("fatal error");
}
...
...
@@ -188,7 +205,7 @@ index 83b80b59..35748488 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -26
52
,6 +27
05
,9 @@
struct llama_cparams {
@@ -26
94
,6 +27
47
,9 @@
struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
...
...
@@ -198,7 +215,7 @@ index 83b80b59..35748488 100644
enum llama_pooling_type pooling_type;
@@ -28
06
,6 +2
862
,16 @@
struct llama_layer {
@@ -28
53
,6 +2
909
,16 @@
struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
...
...
@@ -215,7 +232,7 @@ index 83b80b59..35748488 100644
};
// very similar to llama_batch,
@@ -34
52
,6 +35
18
,8 @@
struct llama_context {
@@ -34
39
,6 +35
05
,8 @@
struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
...
...
@@ -224,13 +241,34 @@ index 83b80b59..35748488 100644
};
struct llama_lora_weight {
@@ -3
686
,6 +3
754,18
@@
static bool llama_kv_cache_init(
@@ -3
577
,6 +3
645,39
@@
static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const llama_model::buft_list_t * buft_list;
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else {
+ buft_list = &model.cpu_buft_list;
+ }
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
...
...
@@ -243,7 +281,7 @@ index 83b80b59..35748488 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5
46
0,12 +5
540
,14 @@
static void llm_load_hparams(
@@ -5
52
0,12 +5
621
,14 @@
static void llm_load_hparams(
}
// zero-out the per-layer hparams
...
...
@@ -263,7 +301,7 @@ index 83b80b59..35748488 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -55
1
4,7 +5
596
,7 @@
static void llm_load_hparams(
@@ -55
7
4,7 +5
677
,7 @@
static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...
...
@@ -272,7 +310,7 @@ index 83b80b59..35748488 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5
55
4,6 +5
636
,16 @@
static void llm_load_hparams(
@@ -5
61
4,6 +5
717
,16 @@
static void llm_load_hparams(
}
}
} break;
...
...
@@ -289,63 +327,78 @@ index 83b80b59..35748488 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7249,6 +7341,55 @@
static bool llm_load_tensors(
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
@@ -7250,7 +7363,15 @@
static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
- {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@
static bool llm_load_tensors(
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ model.tok_embd =
ml.
create_tensor(
ctx_input,
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8});
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}
, 0
);
+
+ // output
+ {
+ model.output_norm =
ml.
create_tensor(
ctx_output,
tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output =
ml.
create_tensor(
ctx_output_split,
tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}
, 0
);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output =
ml.
create_tensor(
ctx_output,
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
+ layer.cross_attn_k_proj =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
+ layer.cross_attn_o_proj =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
+ layer.cross_attn_q_norm =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128});
+ layer.cross_attn_q_proj =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd});
+ layer.cross_attn_v_proj =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024});
+ layer.cross_attn_attn_gate =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1});
+ layer.cross_attn_mlp_gate =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1});
+ layer.attn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.ffn_down =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_gate =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}
, 0
);
+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}
, 0
);
+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}
, 0
);
+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}
, 0
);
+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}
, 0
);
+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}
, 0
);
+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}
, 0
);
+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}
, 0
);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}
, 0
);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}
, 0
);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}
, 0
);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}
, 0
);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}
, 0
);
+ } else {
+ layer.attn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.wq =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+ layer.ffn_norm =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.rope_freqs =
ml.
create_tensor(
ctx_layer,
tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up =
ml.
create_tensor(
ctx_split,
tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}
, 0
);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}
, 0
);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}
, 0
);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}
, 0
);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}
, 0
);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}
, 0
);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight"
, i
), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}
, 0
);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}
, 0
);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}
, 0
);
+ }
+ }
+ } break;
case LLM_ARCH_
GROK
:
case LLM_ARCH_
MINICPM3
:
{
if (n_expert == 0) {
@@ -9
09
3,7 +9
234
,7 @@
static int llama_model_load(const std::string & fname, llama_model & model, llam
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9
46
3,7 +9
631
,7 @@
static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
...
...
@@ -354,7 +407,7 @@ index 83b80b59..35748488 100644
}
if (params.vocab_only) {
@@ -9
193
,6 +9
33
4,21 @@
static struct ggml_tensor * llm_build_inp_embd(
@@ -9
546
,6 +9
71
4,21 @@
static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
...
...
@@ -376,7 +429,7 @@ index 83b80b59..35748488 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -10
167
,6 +10
323
,7 @@
struct llm_build_context {
@@ -10
513
,6 +10
696
,7 @@
struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
...
...
@@ -384,18 +437,10 @@ index 83b80b59..35748488 100644
}
void free() {
@@ -10
754
,6 +1
0911,239
@@
struct llm_build_context {
LLM_NORM_RMS, cb, -1)
;
cb(cur, "result_norm", -1);
@@ -10
992
,6 +1
1176,240
@@
struct llm_build_context {
return gf
;
}
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
...
...
@@ -410,7 +455,7 @@ index 83b80b59..35748488 100644
+ struct ggml_tensor * inpL;
+ struct ggml_tensor * inpCAS;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams,
u
batch, model.tok_embd, cb);
+ inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
+
+ // inp_pos - contains the positions
...
...
@@ -429,7 +474,7 @@ index 83b80b59..35748488 100644
+ cb(cur, "attn_norm", il);
+
+ if (hparams.cross_attention_layers(il)) {
+ if (!batch.embd && !cparams.cross_attn) {
+ if (!
u
batch.embd && !cparams.cross_attn) {
+ continue;
+ }
+
...
...
@@ -447,7 +492,7 @@ index 83b80b59..35748488 100644
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur, * Vcur;
+ if (batch.embd) {
+ if (
u
batch.embd) {
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+ cb(Kcur, "Kcur", il);
+
...
...
@@ -621,10 +666,19 @@ index 83b80b59..35748488 100644
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -16501,6 +16891,10 @@
static struct ggml_cgraph * llama_build_graph(
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16973,6 +17391,10 @@
static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
...
...
@@ -635,14 +689,14 @@ index 83b80b59..35748488 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -1
6761
,10 +17
155
,19 @@
static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch)
{
@@ -1
7237
,10 +17
659
,19 @@
static void llama_set_inputs(llama_context & lctx, const llama_ubatch &
u
batch)
}
if (batch.embd) {
if (
u
batch.embd) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_tokens =
u
batch.n_tokens;
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state,
u
batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ // zero out inp_embd since it's not used
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
...
...
@@ -650,24 +704,24 @@ index 83b80b59..35748488 100644
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_tokens =
u
batch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
- ggml_backend_tensor_set(lctx.inp_embd,
u
batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd,
u
batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}
if (batch.pos && lctx.inp_pos) {
@@ -17
345
,7 +1
7748
,7 @@
static int llama_decode_internal(
if (
u
batch.pos && lctx.inp_pos) {
@@ -17
841
,7 +1
8272
,7 @@
static int llama_decode_internal(
n_outputs = 1;
}
- lctx.sbatch.from_batch(batch
_all
, n_embd,
+ lctx.sbatch.from_batch(batch
_all
, batch
_all
.n_embd,
- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -1
7638
,7 +18
041
,7 @@
static int llama_encode_internal(
@@ -1
8151
,7 +18
582
,7 @@
static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
...
...
@@ -676,7 +730,7 @@ index 83b80b59..35748488 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -1
8648
,7 +19
051
,9 @@
static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -1
9189
,7 +19
620
,9 @@
static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
...
...
@@ -687,7 +741,7 @@ index 83b80b59..35748488 100644
}
size_t total_size_org = 0;
@@ -
19814
,6 +20
219
,7 @@
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -
20355
,6 +20
788
,7 @@
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
...
...
@@ -695,7 +749,7 @@ index 83b80b59..35748488 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21
230
,6 +2
163
6,10 @@
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21
782
,6 +2
221
6,10 @@
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
...
...
@@ -705,8 +759,8 @@ index 83b80b59..35748488 100644
+
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens
,
@@ -21
23
9,6 +2
1649
,7 @@
struct llama_batch llama_batch_get_one(
int32_t n_tokens
) {
@@ -21
78
9,6 +2
2227
,7 @@
struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
...
...
@@ -714,7 +768,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21
254
,6 +2
1665
,7 @@
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21
801
,6 +2
2240
,7 @@
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
...
...
@@ -722,7 +776,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21
265
,6 +2
1677
,7 @@
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21
809
,6 +2
2249
,7 @@
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
...
...
llama/patches/00
11
-add-unpad-operator.patch
→
llama/patches/00
09
-add-unpad-operator.patch
View file @
527cc978
...
...
@@ -4,20 +4,21 @@ Date: Thu, 17 Oct 2024 17:19:25 -0700
Subject: [PATCH] add unpad operator
---
ggml/include/ggml.h | 10 ++++
ggml/src/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal.m
| 33 ++++++++++++++
ggml/src/ggml-metal.metal
| 45 ++++++++++++++++++
ggml/src/ggml.c
| 93 ++++++++++++++++++++++++++
+++++++++++-
7
files changed, 2
30
insertions(+), 2 deletions(-)
ggml/src/ggml-metal
/ggml-metal
.m | 33 ++++++++++++++
++
ggml/src/ggml-metal
/ggml-metal
.metal | 45 ++++++++++++++++++
++++
ggml/src/ggml.c
| 25
+++++++++++-
8
files changed, 2
19
insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index
ce3d
92c
b
..
962cb5f7
100644
index
65cb
92c
4
..
acbcccc6
100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -
506
,6 +
506
,7 @@
extern "C" {
@@ -
499
,6 +
499
,7 @@
extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
...
...
@@ -25,7 +26,7 @@ index ce3d92cb..962cb5f7 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1
764
,6 +1
765
,15 @@
extern "C" {
@@ -1
695
,6 +1
696
,15 @@
extern "C" {
int p2,
int p3);
...
...
@@ -41,11 +42,93 @@ index ce3d92cb..962cb5f7 100644
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index fe77b81c..6e84af56 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2270,6 +2270,9 @@
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@
static void ggml_compute_forward_pad(
}
}
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ float * dst_ptr = (float *) dst->data;
+
+ // TODO: optimize
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ dst_ptr[dst_idx] = *src_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_unpad_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
// ggml_compute_forward_arange
@@ -12535,6 +12587,10 @@
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
...
...
@@ -55,7 +138,7 @@ index fe77b81c..6e84af56 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -
299
2,6 +
299
5,7 @@
GGML_CALL
static bool ggml_backend_cuda_supports_op(ggml_backend_
t backend
, cons
@@ -
301
2,6 +
301
5,7 @@
static bool ggml_backend_cuda_
device_
supports_op(ggml_backend_
dev_t dev
, cons
t g
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
...
...
@@ -126,35 +209,35 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index
829c5e39..25702d85
100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -
193
,6 +
193
,7 @@
GGML_METAL_KERNEL_TYPE_
IM2COL
_F32,
diff --git a/ggml/src/ggml-metal
/ggml-metal
.m b/ggml/src/ggml-metal
/ggml-metal
.m
index
093ae900..cb9a1307
100644
--- a/ggml/src/ggml-metal
/ggml-metal
.m
+++ b/ggml/src/ggml-metal
/ggml-metal
.m
@@ -
310
,6 +
310
,7 @@
static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_
CONV_TRANSPOSE_1D_F16
_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -
689
,6 +
690
,7 @@
static void ggml_
meta
l_log(enum ggml_log_level level, const char * format, ...){
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_
IM2COL_F32, im2col_f32,
true);
@@ -
877
,6 +
878
,7 @@
@imple
me
n
ta
tion GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_
CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32,
true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -
846,6 +848
,7 @@
static bool ggml_metal_supports_op(const struct ggml_backend_metal_
context * ct
x
return false;
@@ -
1099,6 +1101
,7 @@
static bool ggml_metal_supports_op(const struct ggml_backend_metal_
device_conte
x
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -
2655
,6 +26
58
,36 @@
static void ggml_metal_encode_node(
@@ -
3258
,6 +
3
26
1
,36 @@
static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
...
...
@@ -191,11 +274,11 @@ index 829c5e39..25702d85 100644
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index
2b200032..0988751
1 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2
029
,6 +2
029
,51 @@
kernel void kernel_pad_f32(
diff --git a/ggml/src/ggml-metal
/ggml-metal
.metal b/ggml/src/ggml-metal
/ggml-metal
.metal
index
5caa0846..47038c3
1 100644
--- a/ggml/src/ggml-metal
/ggml-metal
.metal
+++ b/ggml/src/ggml-metal
/ggml-metal
.metal
@@ -2
897
,6 +2
897
,51 @@
kernel void kernel_pad_f32(
}
}
...
...
@@ -248,10 +331,10 @@ index 2b200032..09887511 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index
bcbc32d9..f4864ac8
100644
index
1a9a7efa..ea2b259b
100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -
2997,6 +2997
,7 @@
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -
950,6 +950
,7 @@
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
"UPSCALE",
"PAD",
...
...
@@ -259,16 +342,16 @@ index bcbc32d9..f4864ac8 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -
3030,7 +3031
,7 @@
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -
983,7 +984
,7 @@
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 8
0
, "GGML_OP_COUNT != 8
0
");
+static_assert(GGML_OP_COUNT == 8
1
, "GGML_OP_COUNT != 8
1
");
-static_assert(GGML_OP_COUNT == 8
1
, "GGML_OP_COUNT != 8
1
");
+static_assert(GGML_OP_COUNT == 8
2
, "GGML_OP_COUNT != 8
2
");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -
3091,6 +3092
,7 @@
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -
1045,6 +1046
,7 @@
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
"upscale(x)",
"pad(x)",
...
...
@@ -276,16 +359,16 @@ index bcbc32d9..f4864ac8 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -
3124,7 +3126
,7 @@
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -
1078,7 +1080
,7 @@
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 8
0
, "GGML_OP_COUNT != 8
0
");
+static_assert(GGML_OP_COUNT == 8
1
, "GGML_OP_COUNT != 8
1
");
-static_assert(GGML_OP_COUNT == 8
1
, "GGML_OP_COUNT != 8
1
");
+static_assert(GGML_OP_COUNT == 8
2
, "GGML_OP_COUNT != 8
2
");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -
6955,6 +6957,32
@@
struct ggml_tensor * ggml_pad(
@@ -
4097,6 +4099,25
@@
struct ggml_tensor * ggml_pad(
return result;
}
...
...
@@ -295,12 +378,6 @@ index bcbc32d9..f4864ac8 100644
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0, int p1, int p2, int p3) {
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ABORT("fatal error"); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] - p0,
...
...
@@ -309,7 +386,6 @@ index bcbc32d9..f4864ac8 100644
+ a->ne[3] - p3);
+
+ result->op = GGML_OP_UNPAD;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
+ return result;
...
...
@@ -318,92 +394,3 @@ index bcbc32d9..f4864ac8 100644
// ggml_arange
struct ggml_tensor * ggml_arange(
@@ -15312,6 +15340,58 @@
static void ggml_compute_forward_pad(
}
}
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ float * dst_ptr = (float *) dst->data;
+
+ // TODO: optimize
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ dst_ptr[dst_idx] = *src_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_unpad_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
// ggml_compute_forward_arange
@@ -17294,6 +17374,10 @@
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -18369,6 +18453,10 @@
static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
+ case GGML_OP_UNPAD:
+ {
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ARANGE:
{
GGML_ABORT("fatal error"); // TODO: not implemented
@@ -19165,6 +19253,7 @@
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
llama/patches/001
2
-fix-deepseek-deseret-regex.patch
→
llama/patches/001
0
-fix-deepseek-deseret-regex.patch
View file @
527cc978
...
...
@@ -7,11 +7,11 @@ On windows compiled with gcc the c++ regex library failed to handle
the characters
---
src/llama-vocab.cpp | 2 +-
src/unicode.cpp | 2
1
+++++++++++++++++++++
2 files changed, 2
2
insertions(+), 1 deletion(-)
src/unicode.cpp | 2
2 +
+++++++++++++++++++++
2 files changed, 2
3
insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d
2f34ddd..3ef6af19
100644
index d
1dc9627..05ef0e71
100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@
struct llm_tokenizer_bpe : llm_tokenizer {
...
...
@@ -24,7 +24,7 @@ index d2f34ddd..3ef6af19 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index
f4e941cd..9d78ff16
100644
index
3d459263..51dd81fb
100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
...
...
@@ -39,7 +39,7 @@ index f4e941cd..9d78ff16 100644
#include "unicode.h"
#include "unicode-data.h"
@@ -201,
8
+206,2
4
@@
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -201,
6
+206,2
2
@@
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
...
...
@@ -58,7 +58,13 @@ index f4e941cd..9d78ff16 100644
+ free(wbuf);
+ return ret;
+#else
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif
return conv.from_bytes(s);
+#endif
}
...
...
llama/patches/0011-relative-include-paths.patch
0 → 100644
View file @
527cc978
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths
---
ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +-
ggml/src/ggml-quants.c | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 11152385..bbf8934e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -4,7 +4,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include <math.h>
#include <string.h>
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 111ff3b0..df0bd3c6 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
-#include "amx/amx.h"
+#include "amx.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 77e5d87a..91476ad0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -3,7 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6..49ab3daf 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include <math.h>
llama/runner/runner.go
View file @
527cc978
...
...
@@ -559,7 +559,6 @@ type Options struct {
TopK
int
`json:"top_k"`
TopP
float32
`json:"top_p"`
MinP
float32
`json:"min_p"`
TFSZ
float32
`json:"tfs_z"`
TypicalP
float32
`json:"typical_p"`
RepeatLastN
int
`json:"repeat_last_n"`
Temperature
float32
`json:"temperature"`
...
...
@@ -632,7 +631,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
samplingParams
.
TopK
=
req
.
TopK
samplingParams
.
TopP
=
req
.
TopP
samplingParams
.
MinP
=
req
.
MinP
samplingParams
.
TfsZ
=
req
.
TFSZ
samplingParams
.
TypicalP
=
req
.
TypicalP
samplingParams
.
Temp
=
req
.
Temperature
samplingParams
.
RepeatLastN
=
req
.
RepeatLastN
...
...
@@ -930,6 +928,7 @@ func Execute(args []string) error {
level
:=
slog
.
LevelInfo
if
*
verbose
{
level
=
slog
.
LevelDebug
llama
.
EnableDebug
()
}
handler
:=
slog
.
NewTextHandler
(
os
.
Stderr
,
&
slog
.
HandlerOptions
{
Level
:
level
,
...
...
llama/sampling.cpp
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
@@ -124,8 +124,8 @@ struct ring_buffer {
std
::
vector
<
T
>
data
;
};
struct
gpt
_sampler
{
gpt_sampler_params
params
;
struct
common
_sampler
{
common_params_sampling
params
;
struct
llama_sampler
*
grmr
;
struct
llama_sampler
*
chain
;
...
...
@@ -151,26 +151,28 @@ struct gpt_sampler {
}
};
std
::
string
gpt_sampler_params
::
print
()
const
{
std
::
string
common_params_sampling
::
print
()
const
{
char
result
[
1024
];
snprintf
(
result
,
sizeof
(
result
),
"
\t
repeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f
\n
"
"
\t
top_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f
\n
"
"
\t
dry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d
\n
"
"
\t
top_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f
\n
"
"
\t
mirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f"
,
penalty_last_n
,
penalty_repeat
,
penalty_freq
,
penalty_present
,
top_k
,
tfs_z
,
top_p
,
min_p
,
typ_p
,
temp
,
dry_multiplier
,
dry_base
,
dry_allowed_length
,
dry_penalty_last_n
,
top_k
,
top_p
,
min_p
,
xtc_probability
,
xtc_threshold
,
typ_p
,
temp
,
mirostat
,
mirostat_eta
,
mirostat_tau
);
return
std
::
string
(
result
);
}
struct
gpt
_sampler
*
gpt
_sampler_init
(
const
struct
llama_model
*
model
,
const
struct
gpt_sampler_params
&
params
)
{
struct
common
_sampler
*
common
_sampler_init
(
const
struct
llama_model
*
model
,
const
struct
common_params_sampling
&
params
)
{
llama_sampler_chain_params
lparams
=
llama_sampler_chain_default_params
();
lparams
.
no_perf
=
params
.
no_perf
;
auto
*
result
=
new
gpt
_sampler
{
auto
*
result
=
new
common
_sampler
{
/* .params = */
params
,
/* .grmr = */
llama_sampler_init_grammar
(
model
,
params
.
grammar
.
c_str
(),
"root"
),
/* .chain = */
llama_sampler_chain_init
(
lparams
),
...
...
@@ -197,33 +199,45 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
params
.
penalize_nl
,
params
.
ignore_eos
));
if
(
params
.
temp
>
0.0
f
)
{
if
(
params
.
mirostat
==
0
)
{
for
(
const
auto
&
cnstr
:
params
.
samplers
)
{
switch
(
cnstr
)
{
case
GPT_SAMPLER_TYPE_TOP_K
:
case
COMMON_SAMPLER_TYPE_DRY
:
{
std
::
vector
<
const
char
*>
c_breakers
;
c_breakers
.
reserve
(
params
.
dry_sequence_breakers
.
size
());
for
(
const
auto
&
str
:
params
.
dry_sequence_breakers
)
{
c_breakers
.
push_back
(
str
.
c_str
());
}
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_dry
(
model
,
params
.
dry_multiplier
,
params
.
dry_base
,
params
.
dry_allowed_length
,
params
.
dry_penalty_last_n
,
c_breakers
.
data
(),
c_breakers
.
size
()));
}
break
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_k
(
params
.
top_k
));
break
;
case
GPT
_SAMPLER_TYPE_TOP_P
:
case
COMMON
_SAMPLER_TYPE_TOP_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_p
(
params
.
top_p
,
params
.
min_keep
));
break
;
case
GPT
_SAMPLER_TYPE_MIN_P
:
case
COMMON
_SAMPLER_TYPE_MIN_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_min_p
(
params
.
min_p
,
params
.
min_keep
));
break
;
case
GPT
_SAMPLER_TYPE_
TFS_Z
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_
tail_free
(
params
.
tfs_z
,
params
.
min_keep
));
case
COMMON
_SAMPLER_TYPE_
XTC
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_
xtc
(
params
.
xtc_probability
,
params
.
xtc_threshold
,
params
.
min_keep
,
params
.
seed
));
break
;
case
GPT
_SAMPLER_TYPE_TYPICAL_P
:
case
COMMON
_SAMPLER_TYPE_TYPICAL_P
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_typical
(
params
.
typ_p
,
params
.
min_keep
));
break
;
case
GPT
_SAMPLER_TYPE_TEMPERATURE
:
case
COMMON
_SAMPLER_TYPE_TEMPERATURE
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_temp_ext
(
params
.
temp
,
params
.
dynatemp_range
,
params
.
dynatemp_exponent
));
break
;
case
COMMON_SAMPLER_TYPE_INFILL
:
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_infill
(
model
));
break
;
default:
GGML_ASSERT
(
false
&&
"unknown sampler type"
);
}
}
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_softmax
());
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_dist
(
params
.
seed
));
}
else
if
(
params
.
mirostat
==
1
)
{
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_temp
(
params
.
temp
));
...
...
@@ -234,23 +248,11 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
}
else
{
GGML_ASSERT
(
false
&&
"unknown mirostat version"
);
}
}
else
{
if
(
params
.
n_probs
>
0
)
{
// some use cases require to sample greedily, but still obtain the probabilities of the top tokens
// ref: https://github.com/ggerganov/llama.cpp/pull/9605
//
// the following will not produce exactly the same probs as applying softmax to the full vocabulary, but
// it is much faster, since we avoid sorting all tokens and should give a good approximation
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_top_k
(
params
.
n_probs
));
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_softmax
());
}
llama_sampler_chain_add
(
result
->
chain
,
llama_sampler_init_greedy
());
}
return
result
;
}
void
gpt
_sampler_free
(
struct
gpt
_sampler
*
gsmpl
)
{
void
common
_sampler_free
(
struct
common
_sampler
*
gsmpl
)
{
if
(
gsmpl
)
{
llama_sampler_free
(
gsmpl
->
grmr
);
...
...
@@ -260,7 +262,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
}
}
void
gpt
_sampler_accept
(
struct
gpt
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
)
{
void
common
_sampler_accept
(
struct
common
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
)
{
if
(
accept_grammar
)
{
llama_sampler_accept
(
gsmpl
->
grmr
,
token
);
}
...
...
@@ -270,14 +272,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
gsmpl
->
prev
.
push_back
(
token
);
}
void
gpt
_sampler_reset
(
struct
gpt
_sampler
*
gsmpl
)
{
void
common
_sampler_reset
(
struct
common
_sampler
*
gsmpl
)
{
llama_sampler_reset
(
gsmpl
->
grmr
);
llama_sampler_reset
(
gsmpl
->
chain
);
}
struct
gpt
_sampler
*
gpt
_sampler_clone
(
gpt
_sampler
*
gsmpl
)
{
return
new
gpt
_sampler
{
struct
common
_sampler
*
common
_sampler_clone
(
common
_sampler
*
gsmpl
)
{
return
new
common
_sampler
{
/* .params = */
gsmpl
->
params
,
/* .grmr = */
llama_sampler_clone
(
gsmpl
->
grmr
),
/* .chain = */
llama_sampler_clone
(
gsmpl
->
chain
),
...
...
@@ -287,7 +289,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
};
}
void
gpt
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
gpt
_sampler
*
gsmpl
)
{
void
common
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
common
_sampler
*
gsmpl
)
{
// TODO: measure grammar performance
if
(
gsmpl
)
{
...
...
@@ -298,7 +300,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
}
}
llama_token
gpt
_sampler_sample
(
struct
gpt
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
)
{
llama_token
common
_sampler_sample
(
struct
common
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
)
{
gsmpl
->
set_logits
(
ctx
,
idx
);
auto
&
grmr
=
gsmpl
->
grmr
;
...
...
@@ -344,21 +346,60 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
return
cur_p
.
data
[
cur_p
.
selected
].
id
;
}
uint32_t
gpt_sampler_get_seed
(
const
struct
gpt_sampler
*
gsmpl
)
{
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
std
::
vector
<
int
>
&
idxs
,
const
llama_tokens
&
draft
,
bool
grammar_first
)
{
GGML_ASSERT
(
idxs
.
size
()
==
draft
.
size
()
+
1
&&
"idxs.size() must be draft.size() + 1"
);
std
::
vector
<
llama_token
>
result
;
result
.
reserve
(
idxs
.
size
());
size_t
i
=
0
;
for
(;
i
<
draft
.
size
();
i
++
)
{
const
llama_token
id
=
common_sampler_sample
(
gsmpl
,
ctx
,
idxs
[
i
],
grammar_first
);
common_sampler_accept
(
gsmpl
,
id
,
true
);
result
.
push_back
(
id
);
if
(
draft
[
i
]
!=
id
)
{
break
;
}
}
if
(
i
==
draft
.
size
())
{
const
llama_token
id
=
common_sampler_sample
(
gsmpl
,
ctx
,
idxs
[
i
],
grammar_first
);
common_sampler_accept
(
gsmpl
,
id
,
true
);
result
.
push_back
(
id
);
}
return
result
;
}
// Convenience overload: builds idxs == [ 0, 1, 2, ..., draft.size() ] and forwards to the
// overload that takes explicit indices.
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
    std::vector<int> idxs(draft.size() + 1);

    // fill with consecutive positions starting at 0
    size_t pos = 0;
    for (int & slot : idxs) {
        slot = pos++;
    }

    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
}
// Returns whatever llama_sampler_get_seed() reports for this sampler's chain.
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
    auto * const chain = gsmpl->chain;

    return llama_sampler_get_seed(chain);
}
// helpers
llama_token_data_array
*
gpt
_sampler_get_candidates
(
struct
gpt
_sampler
*
gsmpl
)
{
// Gives access to the sampler's internal list of current candidate tokens (cur_p).
// The returned pointer refers to a member of *gsmpl; no copy is made.
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
    auto & candidates = gsmpl->cur_p;

    return &candidates;
}
llama_token
gpt
_sampler_last
(
const
struct
gpt
_sampler
*
gsmpl
)
{
// Returns the last accepted token, i.e. entry 0 counted from the back of the `prev` history.
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
    const auto & history = gsmpl->prev;

    return history.rat(0);
}
std
::
string
gpt
_sampler_print
(
const
struct
gpt
_sampler
*
gsmpl
)
{
std
::
string
common
_sampler_print
(
const
struct
common
_sampler
*
gsmpl
)
{
std
::
string
result
=
"logits "
;
for
(
int
i
=
0
;
i
<
llama_sampler_chain_n
(
gsmpl
->
chain
);
i
++
)
{
...
...
@@ -369,7 +410,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
return
result
;
}
std
::
string
gpt
_sampler_prev_str
(
gpt
_sampler
*
gsmpl
,
llama_context
*
ctx_main
,
int
n
)
{
std
::
string
common
_sampler_prev_str
(
common
_sampler
*
gsmpl
,
llama_context
*
ctx_main
,
int
n
)
{
n
=
std
::
min
(
n
,
(
int
)
gsmpl
->
prev
.
size
());
if
(
n
<=
0
)
{
...
...
@@ -384,63 +425,67 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
GGML_ASSERT
(
id
!=
LLAMA_TOKEN_NULL
&&
"null token in the sampling history - should not happen"
);
result
+=
llama
_token_to_piece
(
ctx_main
,
id
);
result
+=
common
_token_to_piece
(
ctx_main
,
id
);
}
return
result
;
}
char
gpt
_sampler_type_to_chr
(
enum
gpt
_sampler_type
cnstr
)
{
char
common
_sampler_type_to_chr
(
enum
common
_sampler_type
cnstr
)
{
switch
(
cnstr
)
{
case
GPT_SAMPLER_TYPE_TOP_K
:
return
'k'
;
case
GPT_SAMPLER_TYPE_TFS_Z
:
return
'f'
;
case
GPT_SAMPLER_TYPE_TYPICAL_P
:
return
'y'
;
case
GPT_SAMPLER_TYPE_TOP_P
:
return
'p'
;
case
GPT_SAMPLER_TYPE_MIN_P
:
return
'm'
;
case
GPT_SAMPLER_TYPE_TEMPERATURE
:
return
't'
;
case
COMMON_SAMPLER_TYPE_DRY
:
return
'd'
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
return
'k'
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
return
'y'
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
return
'p'
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
return
'm'
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
return
't'
;
case
COMMON_SAMPLER_TYPE_XTC
:
return
'x'
;
case
COMMON_SAMPLER_TYPE_INFILL
:
return
'i'
;
default
:
return
'?'
;
}
}
std
::
string
gpt
_sampler_type_to_str
(
enum
gpt
_sampler_type
cnstr
)
{
std
::
string
common
_sampler_type_to_str
(
enum
common
_sampler_type
cnstr
)
{
switch
(
cnstr
)
{
case
GPT_SAMPLER_TYPE_TOP_K
:
return
"top_k"
;
case
GPT_SAMPLER_TYPE_TFS_Z
:
return
"tfs_z"
;
case
GPT_SAMPLER_TYPE_TYPICAL_P
:
return
"typ_p"
;
case
GPT_SAMPLER_TYPE_TOP_P
:
return
"top_p"
;
case
GPT_SAMPLER_TYPE_MIN_P
:
return
"min_p"
;
case
GPT_SAMPLER_TYPE_TEMPERATURE
:
return
"temperature"
;
case
COMMON_SAMPLER_TYPE_DRY
:
return
"dry"
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
return
"top_k"
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
return
"typ_p"
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
return
"top_p"
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
return
"min_p"
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
return
"temperature"
;
case
COMMON_SAMPLER_TYPE_XTC
:
return
"xtc"
;
case
COMMON_SAMPLER_TYPE_INFILL
:
return
"infill"
;
default
:
return
""
;
}
}
std
::
vector
<
gpt_sampler_type
>
gpt_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
)
{
std
::
unordered_map
<
std
::
string
,
gpt_sampler_type
>
sampler_canonical_name_map
{
{
"top_k"
,
GPT_SAMPLER_TYPE_TOP_K
},
{
"top_p"
,
GPT_SAMPLER_TYPE_TOP_P
},
{
"typ_p"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"min_p"
,
GPT_SAMPLER_TYPE_MIN_P
},
{
"tfs_z"
,
GPT_SAMPLER_TYPE_TFS_Z
},
{
"temperature"
,
GPT_SAMPLER_TYPE_TEMPERATURE
},
std
::
vector
<
common_sampler_type
>
common_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
)
{
std
::
unordered_map
<
std
::
string
,
common_sampler_type
>
sampler_canonical_name_map
{
{
"dry"
,
COMMON_SAMPLER_TYPE_DRY
},
{
"top_k"
,
COMMON_SAMPLER_TYPE_TOP_K
},
{
"top_p"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"typ_p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"min_p"
,
COMMON_SAMPLER_TYPE_MIN_P
},
{
"temperature"
,
COMMON_SAMPLER_TYPE_TEMPERATURE
},
{
"xtc"
,
COMMON_SAMPLER_TYPE_XTC
},
{
"infill"
,
COMMON_SAMPLER_TYPE_INFILL
},
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std
::
unordered_map
<
std
::
string
,
gpt_sampler_type
>
sampler_alt_name_map
{
{
"top-k"
,
GPT_SAMPLER_TYPE_TOP_K
},
{
"top-p"
,
GPT_SAMPLER_TYPE_TOP_P
},
{
"nucleus"
,
GPT_SAMPLER_TYPE_TOP_P
},
{
"typical-p"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"typical"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"typ-p"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"typ"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"min-p"
,
GPT_SAMPLER_TYPE_MIN_P
},
{
"tfs-z"
,
GPT_SAMPLER_TYPE_TFS_Z
},
{
"tfs"
,
GPT_SAMPLER_TYPE_TFS_Z
},
{
"temp"
,
GPT_SAMPLER_TYPE_TEMPERATURE
},
std
::
unordered_map
<
std
::
string
,
common_sampler_type
>
sampler_alt_name_map
{
{
"top-k"
,
COMMON_SAMPLER_TYPE_TOP_K
},
{
"top-p"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"nucleus"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"typical-p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typical"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typ-p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typ"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"min-p"
,
COMMON_SAMPLER_TYPE_MIN_P
},
{
"temp"
,
COMMON_SAMPLER_TYPE_TEMPERATURE
},
};
std
::
vector
<
gpt
_sampler_type
>
samplers
;
std
::
vector
<
common
_sampler_type
>
samplers
;
samplers
.
reserve
(
names
.
size
());
for
(
const
auto
&
name
:
names
)
{
...
...
@@ -460,17 +505,19 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
return
samplers
;
}
std
::
vector
<
gpt_sampler_type
>
gpt_sampler_types_from_chars
(
const
std
::
string
&
chars
)
{
std
::
unordered_map
<
char
,
gpt_sampler_type
>
sampler_name_map
=
{
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TOP_K
),
GPT_SAMPLER_TYPE_TOP_K
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TFS_Z
),
GPT_SAMPLER_TYPE_TFS_Z
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TYPICAL_P
),
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TOP_P
),
GPT_SAMPLER_TYPE_TOP_P
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_MIN_P
),
GPT_SAMPLER_TYPE_MIN_P
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TEMPERATURE
),
GPT_SAMPLER_TYPE_TEMPERATURE
}
std
::
vector
<
common_sampler_type
>
common_sampler_types_from_chars
(
const
std
::
string
&
chars
)
{
std
::
unordered_map
<
char
,
common_sampler_type
>
sampler_name_map
=
{
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_DRY
),
COMMON_SAMPLER_TYPE_DRY
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_K
),
COMMON_SAMPLER_TYPE_TOP_K
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TYPICAL_P
),
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_P
),
COMMON_SAMPLER_TYPE_TOP_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_MIN_P
),
COMMON_SAMPLER_TYPE_MIN_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TEMPERATURE
),
COMMON_SAMPLER_TYPE_TEMPERATURE
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_XTC
),
COMMON_SAMPLER_TYPE_XTC
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_INFILL
),
COMMON_SAMPLER_TYPE_INFILL
},
};
std
::
vector
<
gpt
_sampler_type
>
samplers
;
std
::
vector
<
common
_sampler_type
>
samplers
;
samplers
.
reserve
(
chars
.
size
());
for
(
const
auto
&
c
:
chars
)
{
...
...
llama/sampling.h
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
@@ -33,7 +33,7 @@
#include <string>
#include <vector>
//
gpt
_sampler extends llama_sampler with additional functionality:
//
common
_sampler extends llama_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters
...
...
@@ -49,30 +49,30 @@
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The
gpt
_sampler also maintains a container with the last accepted tokens. In the future, this can
// The
common
_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the
gpt
_sampler also maintains a container with the current candidate tokens.
// For convenience, the
common
_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//
struct
gpt
_sampler
;
struct
common
_sampler
;
// llama_sampler API overloads
struct
gpt
_sampler
*
gpt
_sampler_init
(
const
struct
llama_model
*
model
,
const
struct
gpt_sampler_params
&
params
);
struct
common
_sampler
*
common
_sampler_init
(
const
struct
llama_model
*
model
,
const
struct
common_params_sampling
&
params
);
void
gpt
_sampler_free
(
struct
gpt
_sampler
*
gsmpl
);
void
common
_sampler_free
(
struct
common
_sampler
*
gsmpl
);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void
gpt
_sampler_accept
(
struct
gpt
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
);
void
gpt
_sampler_reset
(
struct
gpt
_sampler
*
gsmpl
);
struct
gpt
_sampler
*
gpt
_sampler_clone
(
struct
gpt
_sampler
*
gsmpl
);
void
common
_sampler_accept
(
struct
common
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
);
void
common
_sampler_reset
(
struct
common
_sampler
*
gsmpl
);
struct
common
_sampler
*
common
_sampler_clone
(
struct
common
_sampler
*
gsmpl
);
// arguments can be nullptr to skip printing
void
gpt
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
gpt
_sampler
*
gsmpl
);
void
common
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
common
_sampler
*
gsmpl
);
// extended sampling implementation:
//
...
...
@@ -84,26 +84,47 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token
gpt
_sampler_sample
(
struct
gpt
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
=
false
);
llama_token
common
_sampler_sample
(
struct
common
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
=
false
);
uint32_t
gpt_sampler_get_seed
(
const
struct
gpt_sampler
*
gsmpl
);
// generalized version of common_sampler_sample
//
// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
//
// common_sampler_sample_and_accept_n(gsmpl, ctx, { idx }, {});
//
// is equivalent to
//
// common_sampler_sample(gsmpl, ctx, idx);
// common_sampler_accept(gsmpl, token, true);
//
// requires: idxs.size() == draft.size() + 1
//
// returns at least 1 token, up to idxs.size()
//
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
std
::
vector
<
int
>
&
idxs
,
const
llama_tokens
&
draft
,
bool
grammar_first
=
false
);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
llama_tokens
&
draft
,
bool
grammar_first
=
false
);
uint32_t
common_sampler_get_seed
(
const
struct
common_sampler
*
gsmpl
);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array
*
gpt
_sampler_get_candidates
(
struct
gpt
_sampler
*
gsmpl
);
llama_token_data_array
*
common
_sampler_get_candidates
(
struct
common
_sampler
*
gsmpl
);
// get the last accepted token
llama_token
gpt
_sampler_last
(
const
struct
gpt
_sampler
*
gsmpl
);
llama_token
common
_sampler_last
(
const
struct
common
_sampler
*
gsmpl
);
// print the sampler chain into a string
std
::
string
gpt
_sampler_print
(
const
struct
gpt
_sampler
*
gsmpl
);
std
::
string
common
_sampler_print
(
const
struct
common
_sampler
*
gsmpl
);
// get a string representation of the last accepted tokens
std
::
string
gpt
_sampler_prev_str
(
gpt
_sampler
*
gsmpl
,
llama_context
*
ctx
,
int
n
);
std
::
string
common
_sampler_prev_str
(
common
_sampler
*
gsmpl
,
llama_context
*
ctx
,
int
n
);
char
gpt
_sampler_type_to_chr
(
enum
gpt
_sampler_type
cnstr
);
std
::
string
gpt
_sampler_type_to_str
(
enum
gpt
_sampler_type
cnstr
);
char
common
_sampler_type_to_chr
(
enum
common
_sampler_type
cnstr
);
std
::
string
common
_sampler_type_to_str
(
enum
common
_sampler_type
cnstr
);
std
::
vector
<
enum
gpt
_sampler_type
>
gpt
_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
);
std
::
vector
<
enum
gpt
_sampler_type
>
gpt
_sampler_types_from_chars
(
const
std
::
string
&
chars
);
std
::
vector
<
enum
common
_sampler_type
>
common
_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
);
std
::
vector
<
enum
common
_sampler_type
>
common
_sampler_types_from_chars
(
const
std
::
string
&
chars
);
llama/sampling_ext.cpp
View file @
527cc978
...
...
@@ -3,16 +3,12 @@
#include "sampling_ext.h"
#include "json-schema-to-grammar.h"
struct
gpt_sampler
*
gpt_sampler_cinit
(
const
struct
llama_model
*
model
,
struct
gpt_sampler_cparams
*
params
)
{
try
{
gpt_sampler_params
sparams
;
struct
common_sampler
*
common_sampler_cinit
(
const
struct
llama_model
*
model
,
struct
common_sampler_cparams
*
params
)
{
try
{
common_params_sampling
sparams
;
sparams
.
top_k
=
params
->
top_k
;
sparams
.
top_p
=
params
->
top_p
;
sparams
.
min_p
=
params
->
min_p
;
sparams
.
tfs_z
=
params
->
tfs_z
;
sparams
.
typ_p
=
params
->
typical_p
;
sparams
.
temp
=
params
->
temp
;
sparams
.
penalty_last_n
=
params
->
penalty_last_n
;
...
...
@@ -25,38 +21,28 @@ struct gpt_sampler *gpt_sampler_cinit(
sparams
.
penalize_nl
=
params
->
penalize_nl
;
sparams
.
seed
=
params
->
seed
;
sparams
.
grammar
=
params
->
grammar
;
return
gpt_sampler_init
(
model
,
sparams
)
;
}
catch
(
const
std
::
exception
&
err
)
{
sparams
.
xtc_probability
=
0.0
;
sparams
.
xtc_threshold
=
0.5
;
return
common_sampler_init
(
model
,
sparams
);
}
catch
(
const
std
::
exception
&
err
)
{
return
nullptr
;
}
}
void
gpt_sampler_cfree
(
struct
gpt_sampler
*
sampler
)
{
gpt_sampler_free
(
sampler
);
// Wrapper declared in sampling_ext.h: forwards to common_sampler_free().
void common_sampler_cfree(struct common_sampler *sampler)
{
    common_sampler_free(sampler);
}
void
gpt_sampler_creset
(
struct
gpt_sampler
*
sampler
)
{
gpt_sampler_reset
(
sampler
);
// Wrapper declared in sampling_ext.h: forwards to common_sampler_reset().
void common_sampler_creset(struct common_sampler *sampler)
{
    common_sampler_reset(sampler);
}
llama_token
gpt_sampler_csample
(
struct
gpt_sampler
*
sampler
,
struct
llama_context
*
ctx_main
,
int
idx
)
{
return
gpt_sampler_sample
(
sampler
,
ctx_main
,
idx
);
// Wrapper declared in sampling_ext.h: forwards to common_sampler_accept().
// `apply_grammar` is passed through as that function's accept_grammar argument.
void common_sampler_caccept(struct common_sampler *sampler, llama_token id, bool apply_grammar)
{
    common_sampler_accept(sampler, id, apply_grammar);
}
void
gpt_sampler_caccept
(
struct
gpt_sampler
*
sampler
,
llama_token
id
,
bool
apply_grammar
)
{
gpt_sampler_accept
(
sampler
,
id
,
apply_grammar
);
// Wrapper declared in sampling_ext.h: samples a token via common_sampler_sample() at `idx`,
// leaving grammar_first at its default value.
llama_token common_sampler_csample(struct common_sampler *sampler, struct llama_context *ctx, int idx)
{
    const llama_token sampled = common_sampler_sample(sampler, ctx, idx);

    return sampled;
}
int
schema_to_grammar
(
const
char
*
json_schema
,
char
*
grammar
,
size_t
max_len
)
...
...
llama/sampling_ext.h
View file @
527cc978
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#ifndef
GPT_
SAMPL
ER
_EXT_H
#define
GPT_
SAMPL
ER
_EXT_H
#ifndef SAMPL
ING
_EXT_H
#define SAMPL
ING
_EXT_H
#ifdef __cplusplus
extern
"C"
...
...
@@ -9,14 +9,11 @@ extern "C"
// Forward declaration to avoid include of "sampling.h" which has c++
// includes
struct
gpt_sampler
;
struct
gpt_sampler_cparams
{
struct
common_sampler
;
struct
common_sampler_cparams
{
int32_t
top_k
;
float
top_p
;
float
min_p
;
float
tfs_z
;
float
typical_p
;
float
temp
;
int32_t
penalty_last_n
;
...
...
@@ -31,21 +28,11 @@ extern "C"
char
*
grammar
;
};
struct
gpt_sampler
*
gpt_sampler_cinit
(
const
struct
llama_model
*
model
,
struct
gpt_sampler_cparams
*
params
);
void
gpt_sampler_cfree
(
struct
gpt_sampler
*
sampler
);
void
gpt_sampler_creset
(
struct
gpt_sampler
*
sampler
);
llama_token
gpt_sampler_csample
(
struct
gpt_sampler
*
sampler
,
struct
llama_context
*
ctx_main
,
int
idx
);
void
gpt_sampler_caccept
(
struct
gpt_sampler
*
sampler
,
llama_token
id
,
bool
apply_grammar
);
struct
common_sampler
*
common_sampler_cinit
(
const
struct
llama_model
*
model
,
struct
common_sampler_cparams
*
params
);
void
common_sampler_cfree
(
struct
common_sampler
*
sampler
);
void
common_sampler_creset
(
struct
common_sampler
*
sampler
);
void
common_sampler_caccept
(
struct
common_sampler
*
sampler
,
llama_token
id
,
bool
apply_grammar
);
llama_token
common_sampler_csample
(
struct
common_sampler
*
sampler
,
struct
llama_context
*
ctx
,
int
idx
);
int
schema_to_grammar
(
const
char
*
json_schema
,
char
*
grammar
,
size_t
max_len
);
...
...
@@ -53,4 +40,4 @@ extern "C"
}
#endif
#endif //
GPT_
SAMPL
ER
_EXT_H
#endif // SAMPL
ING
_EXT_H
llama/sgemm.cpp
View file @
527cc978
...
...
@@ -106,6 +106,10 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
inline
float16x8_t
mul
(
float16x8_t
x
,
float16x8_t
y
)
{
return
vmulq_f16
(
x
,
y
);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(__MMA__)
typedef
vector
unsigned
char
vec_t
;
typedef
__vector_quad
acc_t
;
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD
...
...
@@ -942,6 +946,36 @@ class tinyBLAS_Q0_AVX {
return
_mm_sub_epi8
(
_mm_and_si128
(
_mm_set1_epi8
(
15
),
_mm_srli_epi16
(
x
,
4
)),
_mm_set1_epi8
(
8
));
}
// Loads a q5_0 block as one 256-bit vector: denibble(b->qs) OR-ed with bittobyte(b->qh).
inline __m256i load(const block_q5_0 *b) {
    const __m256i nibbles   = denibble(b->qs);
    const __m256i high_fill = bittobyte(b->qh);

    return _mm256_or_si256(nibbles, high_fill);
}
// 128-bit variant of the q5_0 load, first half: takes the LOW nibble of each of the 16 bytes
// read from b->qs and ORs 0xF0 into every lane whose bit in the first two bytes of b->qh is clear.
inline __m128i load0(const block_q5_0 *b) {
    const __m128i packed = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b->qs));

    uint32_t high;
    memcpy(&high, b->qh, sizeof(uint32_t));

    const __m128i low_nibbles = _mm_and_si128(_mm_set1_epi8(15), packed);

    // lanes 0..7 receive byte 0 of `high`, lanes 8..15 receive byte 1
    const __m128i byte_select = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
    const __m128i spread      = _mm_shuffle_epi8(_mm_set1_epi32(high), byte_select);

    // every mask byte has all bits set except bit (lane % 8): the OR is 0xFF only if that bit is set
    const __m128i all_but_one = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m128i bit_is_set  = _mm_cmpeq_epi8(_mm_set1_epi64x(-1), _mm_or_si128(all_but_one, spread));

    // 0xF0 where the bit is clear, 0x00 where it is set
    const __m128i fill = _mm_andnot_si128(bit_is_set, _mm_set1_epi8(static_cast<char>(0xF0)));

    return _mm_or_si128(low_nibbles, fill);
}
// 128-bit variant of the q5_0 load, second half: takes the HIGH nibble of each of the 16 bytes
// read from b->qs and ORs 0xF0 into every lane whose bit in the last two bytes of b->qh is clear.
inline __m128i load1(const block_q5_0 *b) {
    const __m128i packed = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b->qs));

    uint32_t high;
    memcpy(&high, b->qh, sizeof(uint32_t));

    const __m128i high_nibbles = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(packed, 4));

    // lanes 0..7 receive byte 2 of `high`, lanes 8..15 receive byte 3
    const __m128i byte_select = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
    const __m128i spread      = _mm_shuffle_epi8(_mm_set1_epi32(high), byte_select);

    // every mask byte has all bits set except bit (lane % 8): the OR is 0xFF only if that bit is set
    const __m128i all_but_one = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m128i bit_is_set  = _mm_cmpeq_epi8(_mm_set1_epi64x(-1), _mm_or_si128(all_but_one, spread));

    // 0xF0 where the bit is clear, 0x00 where it is set
    const __m128i fill = _mm_andnot_si128(bit_is_set, _mm_set1_epi8(static_cast<char>(0xF0)));

    return _mm_or_si128(high_nibbles, fill);
}
// Loads an iq4_nl block as one 256-bit vector assembled from the two 128-bit halves:
// load1(b) is passed as the first argument of MM256_SET_M128I and load0(b) as the second.
inline __m256i load(const block_iq4_nl *b) {
    return MM256_SET_M128I(load1(b),
                           load0(b));
}
...
...
@@ -973,6 +1007,17 @@ class tinyBLAS_Q0_AVX {
_mm_srli_epi16
(
x
,
4
),
1
));
}
// Expands the 32 bits stored in the four bytes at `p` into 32 byte lanes.
// Lane i corresponds to bit (i % 8) of p[i / 8]: it is 0x00 when that bit is set
// and 0xF0 when it is clear.
static inline __m256i bittobyte(const uint8_t *p) {
    uint32_t bits;
    memcpy(&bits, p, sizeof(uint32_t));

    // within each 128-bit half, replicate one source byte across 8 lanes (bytes 0,1 | 2,3)
    const __m256i byte_select = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
                                                  0x0101010101010101, 0x0000000000000000);
    const __m256i spread = _mm256_shuffle_epi8(_mm256_set1_epi32(bits), byte_select);

    // every mask byte has all bits set except bit (lane % 8): the OR is 0xFF only if that bit is set
    const __m256i all_but_one = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m256i bit_is_set  = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
                                                  _mm256_or_si256(all_but_one, spread));

    return _mm256_andnot_si256(bit_is_set, _mm256_set1_epi8(static_cast<char>(0xF0)));
}
const
TA
*
const
A
;
const
TB
*
const
B
;
TC
*
const
C
;
...
...
@@ -985,6 +1030,600 @@ class tinyBLAS_Q0_AVX {
};
#endif // __AVX__
//PPC Implementation
#if defined(__MMA__)
// SAVE_ACC(ACC, ii, jj): disassembles the MMA accumulator ACC into `vec_C` and copies the
// resulting 4x4 floats into C, writing element (I, J) to C[ii + (jj + J) * ldc + I].
//
// `vec_C`, `C` and `ldc` are not parameters: they must be in scope where the macro is expanded.
// The expansion is several statements, so do not use it as the body of an unbraced if/else/loop.
//
// The final line deliberately has NO trailing backslash: a dangling line continuation would
// splice whatever line follows the macro into its definition. `ii` and `jj` are parenthesized
// so that expression arguments keep their intended precedence.
#define SAVE_ACC(ACC, ii, jj) \
   __builtin_mma_disassemble_acc(vec_C, ACC); \
   for (int I = 0; I < 4; I++) { \
      for (int J = 0; J < 4; J++) { \
         *((float*)(C+(ii)+(((jj)+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
      } \
   }
template
<
typename
TA
,
typename
TB
,
typename
TC
>
class
tinyBLAS_PPC
{
public:
// Stores the operand pointers, their leading dimensions, the shared inner
// dimension k and this thread's index/count. Nothing is computed here.
// `kernel`, `At` and `Bt` are intentionally left untouched, as in the
// original initializer list.
tinyBLAS_PPC(int64_t k,
             const TA *A, int64_t lda,
             const TB *B, int64_t ldb,
             TC *C, int64_t ldc,
             int ith, int nth)
    : A{A}, B{B}, C{C},
      k{k}, lda{lda}, ldb{ldb}, ldc{ldc},
      ith{ith}, nth{nth} {
}
// Entry point: processes the whole m x n output by handing the full index
// range [0, m) x [0, n) to mnpack().
void matmul(int64_t m, int64_t n) {
    const int64_t first_row = 0;
    const int64_t first_col = 0;
    mnpack(first_row, m, first_col, n);
}
private:
// Pointer to the micro-kernel member function, taking (ii, jj) tile origins.
// gemm<RM, RN>() assigns one of the KERNEL_* members to it and then invokes
// it once per tile.
void
(
tinyBLAS_PPC
::*
kernel
)(
int64_t
,
int64_t
);
void
READ_BLOCK
(
const
float
*
a
,
int64_t
lda
,
int
rows
,
int
cols
,
float
*
vec
)
{
int64_t
i
,
j
;
float
*
aoffset
=
NULL
,
*
boffset
=
NULL
;
float
*
aoffset1
=
NULL
,
*
aoffset2
=
NULL
,
*
aoffset3
=
NULL
,
*
aoffset4
=
NULL
;
float
*
aoffset5
=
NULL
,
*
aoffset6
=
NULL
,
*
aoffset7
=
NULL
,
*
aoffset8
=
NULL
;
aoffset
=
const_cast
<
float
*>
(
a
);
boffset
=
vec
;
j
=
(
rows
>>
3
);
if
(
j
>
0
)
{
do
{
aoffset1
=
aoffset
;
aoffset2
=
aoffset1
+
lda
;
aoffset3
=
aoffset2
+
lda
;
aoffset4
=
aoffset3
+
lda
;
aoffset5
=
aoffset4
+
lda
;
aoffset6
=
aoffset5
+
lda
;
aoffset7
=
aoffset6
+
lda
;
aoffset8
=
aoffset7
+
lda
;
aoffset
+=
8
*
lda
;
i
=
(
cols
>>
3
);
if
(
i
>
0
)
{
__vector_pair
C1
,
C2
,
C3
,
C4
,
C5
,
C6
,
C7
,
C8
;
vector
float
c1
[
2
],
c2
[
2
],
c3
[
2
],
c4
[
2
],
c5
[
2
],
c6
[
2
],
c7
[
2
],
c8
[
2
];
vector
float
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
,
t8
;
do
{
C1
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset1
);
C2
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset2
);
C3
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset3
);
C4
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset4
);
C5
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset5
);
C6
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset6
);
C7
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset7
);
C8
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset8
);
__builtin_vsx_disassemble_pair
(
c1
,
&
C1
);
__builtin_vsx_disassemble_pair
(
c2
,
&
C2
);
__builtin_vsx_disassemble_pair
(
c3
,
&
C3
);
__builtin_vsx_disassemble_pair
(
c4
,
&
C4
);
__builtin_vsx_disassemble_pair
(
c5
,
&
C5
);
__builtin_vsx_disassemble_pair
(
c6
,
&
C6
);
__builtin_vsx_disassemble_pair
(
c7
,
&
C7
);
__builtin_vsx_disassemble_pair
(
c8
,
&
C8
);
t1
=
vec_mergeh
(
c1
[
0
],
c2
[
0
]);
t2
=
vec_mergeh
(
c3
[
0
],
c4
[
0
]);
t3
=
vec_mergeh
(
c5
[
0
],
c6
[
0
]);
t4
=
vec_mergeh
(
c7
[
0
],
c8
[
0
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
);
vec_xst
(
t6
,
0
,
boffset
+
4
);
vec_xst
(
t7
,
0
,
boffset
+
8
);
vec_xst
(
t8
,
0
,
boffset
+
12
);
t1
=
vec_mergel
(
c1
[
0
],
c2
[
0
]);
t2
=
vec_mergel
(
c3
[
0
],
c4
[
0
]);
t3
=
vec_mergel
(
c5
[
0
],
c6
[
0
]);
t4
=
vec_mergel
(
c7
[
0
],
c8
[
0
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
16
);
vec_xst
(
t6
,
0
,
boffset
+
20
);
vec_xst
(
t7
,
0
,
boffset
+
24
);
vec_xst
(
t8
,
0
,
boffset
+
28
);
t1
=
vec_mergeh
(
c1
[
1
],
c2
[
1
]);
t2
=
vec_mergeh
(
c3
[
1
],
c4
[
1
]);
t3
=
vec_mergeh
(
c5
[
1
],
c6
[
1
]);
t4
=
vec_mergeh
(
c7
[
1
],
c8
[
1
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
32
);
vec_xst
(
t6
,
0
,
boffset
+
36
);
vec_xst
(
t7
,
0
,
boffset
+
40
);
vec_xst
(
t8
,
0
,
boffset
+
44
);
t1
=
vec_mergel
(
c1
[
1
],
c2
[
1
]);
t2
=
vec_mergel
(
c3
[
1
],
c4
[
1
]);
t3
=
vec_mergel
(
c5
[
1
],
c6
[
1
]);
t4
=
vec_mergel
(
c7
[
1
],
c8
[
1
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
48
);
vec_xst
(
t6
,
0
,
boffset
+
52
);
vec_xst
(
t7
,
0
,
boffset
+
56
);
vec_xst
(
t8
,
0
,
boffset
+
60
);
aoffset1
+=
8
*
lda
;
aoffset2
+=
8
*
lda
;
aoffset3
+=
8
*
lda
;
aoffset4
+=
8
*
lda
;
boffset
+=
64
;
i
--
;
}
while
(
i
>
0
);
}
if
(
cols
&
4
)
{
vector
float
c1
,
c2
,
c3
,
c4
,
c5
,
c6
,
c7
,
c8
;
vector
float
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
,
t8
;
c1
=
vec_xl
(
0
,
aoffset1
);
c2
=
vec_xl
(
0
,
aoffset2
);
c3
=
vec_xl
(
0
,
aoffset3
);
c4
=
vec_xl
(
0
,
aoffset4
);
c5
=
vec_xl
(
0
,
aoffset5
);
c6
=
vec_xl
(
0
,
aoffset6
);
c7
=
vec_xl
(
0
,
aoffset7
);
c8
=
vec_xl
(
0
,
aoffset8
);
t1
=
vec_mergeh
(
c1
,
c2
);
t2
=
vec_mergeh
(
c3
,
c4
);
t3
=
vec_mergeh
(
c5
,
c6
);
t4
=
vec_mergeh
(
c7
,
c8
);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
);
vec_xst
(
t6
,
0
,
boffset
+
4
);
vec_xst
(
t7
,
0
,
boffset
+
8
);
vec_xst
(
t8
,
0
,
boffset
+
12
);
t1
=
vec_mergel
(
c1
,
c2
);
t2
=
vec_mergel
(
c3
,
c4
);
t3
=
vec_mergel
(
c5
,
c6
);
t4
=
vec_mergel
(
c7
,
c8
);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
16
);
vec_xst
(
t6
,
0
,
boffset
+
20
);
vec_xst
(
t7
,
0
,
boffset
+
24
);
vec_xst
(
t8
,
0
,
boffset
+
28
);
}
j
--
;
}
while
(
j
>
0
);
}
if
(
rows
&
4
)
{
aoffset1
=
aoffset
;
aoffset2
=
aoffset1
+
lda
;
aoffset3
=
aoffset2
+
lda
;
aoffset4
=
aoffset3
+
lda
;
aoffset
+=
4
*
lda
;
i
=
(
cols
>>
3
);
if
(
i
>
0
)
{
__vector_pair
C1
,
C2
,
C3
,
C4
;
vector
float
c1
[
2
],
c2
[
2
],
c3
[
2
],
c4
[
2
];
vector
float
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
,
t8
;
do
{
C1
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset1
);
C2
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset2
);
C3
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset3
);
C4
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset4
);
__builtin_vsx_disassemble_pair
(
c1
,
&
C1
);
__builtin_vsx_disassemble_pair
(
c2
,
&
C2
);
__builtin_vsx_disassemble_pair
(
c3
,
&
C3
);
__builtin_vsx_disassemble_pair
(
c4
,
&
C4
);
t1
=
vec_mergeh
(
c1
[
0
],
c2
[
0
]);
t2
=
vec_mergeh
(
c3
[
0
],
c4
[
0
]);
t3
=
vec_mergel
(
c1
[
0
],
c2
[
0
]);
t4
=
vec_mergel
(
c3
[
0
],
c4
[
0
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t7
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
);
vec_xst
(
t6
,
0
,
boffset
+
4
);
vec_xst
(
t7
,
0
,
boffset
+
8
);
vec_xst
(
t8
,
0
,
boffset
+
12
);
t1
=
vec_mergeh
(
c1
[
1
],
c2
[
1
]);
t2
=
vec_mergeh
(
c3
[
1
],
c4
[
1
]);
t3
=
vec_mergel
(
c1
[
1
],
c2
[
1
]);
t4
=
vec_mergel
(
c3
[
1
],
c4
[
1
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t7
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
16
);
vec_xst
(
t6
,
0
,
boffset
+
20
);
vec_xst
(
t7
,
0
,
boffset
+
24
);
vec_xst
(
t8
,
0
,
boffset
+
28
);
aoffset1
+=
8
*
lda
;
aoffset2
+=
8
*
lda
;
aoffset3
+=
8
*
lda
;
aoffset4
+=
8
*
lda
;
boffset
+=
32
;
i
--
;
}
while
(
i
>
0
);
}
if
(
cols
&
4
)
{
vector
float
c1
,
c2
,
c3
,
c4
;
vector
float
t1
,
t2
,
t3
,
t4
;
c1
=
vec_xl
(
0
,
aoffset1
);
c2
=
vec_xl
(
0
,
aoffset2
);
c3
=
vec_xl
(
0
,
aoffset3
);
c4
=
vec_xl
(
0
,
aoffset4
);
t1
=
vec_mergeh
(
c1
,
c2
);
t2
=
vec_mergeh
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
);
vec_xst
(
t4
,
0
,
boffset
+
4
);
t1
=
vec_mergel
(
c1
,
c2
);
t2
=
vec_mergel
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
+
8
);
vec_xst
(
t4
,
0
,
boffset
+
12
);
}
}
if
(
rows
&
3
)
{
aoffset1
=
aoffset
;
aoffset2
=
aoffset1
+
lda
;
aoffset3
=
aoffset2
+
lda
;
if
(
cols
&
4
)
{
vector
float
c1
,
c2
,
c3
,
c4
=
{
0
};
vector
float
t1
,
t2
,
t3
,
t4
;
c1
=
vec_xl
(
0
,
aoffset1
);
c2
=
vec_xl
(
0
,
aoffset2
);
c3
=
vec_xl
(
0
,
aoffset3
);
t1
=
vec_mergeh
(
c1
,
c2
);
t2
=
vec_mergeh
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
);
vec_xst
(
t4
,
0
,
boffset
+
4
);
t1
=
vec_mergel
(
c1
,
c2
);
t2
=
vec_mergel
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
+
8
);
vec_xst
(
t4
,
0
,
boffset
+
12
);
}
}
}
// Computes one 4x4 output tile: rows ii..ii+3 of A against rows jj..jj+3 of B,
// accumulated over the whole inner dimension k in steps of 4, then written to
// C through SAVE_ACC. Each step reads 4 columns, so k is expected to be a
// multiple of 4.
void KERNEL_4x4(int64_t ii, int64_t jj) {
    vec_t vec_A[4], vec_B[4], vec_C[4];  // vec_C is the scratch array SAVE_ACC uses
    acc_t acc_0;
    __builtin_mma_xxsetaccz(&acc_0);
    // The counter is int64_t to match the type of k (it was int before).
    for (int64_t l = 0; l < k; l += 4) {
        READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
        READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
        // One rank-1 update per packed column.
        __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
        __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
        __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
        __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
    }
    SAVE_ACC(&acc_0, ii, jj);
}
// Computes a tile covering 4 rows of A and 8 rows of B using two accumulators:
// acc_0 is saved at (ii, jj) and acc_1 at (ii, jj + 4). The inner dimension is
// consumed 4 columns at a time.
void KERNEL_4x8(int64_t ii, int64_t jj) {
    vec_t vec_A[4], vec_B[8], vec_C[4];  // vec_C is the scratch array SAVE_ACC uses
    acc_t acc_0, acc_1;
    __builtin_mma_xxsetaccz(&acc_0);
    __builtin_mma_xxsetaccz(&acc_1);
    for (int64_t l = 0; l < k; l += 4) {
        READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
        READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
        // Packed column c of B occupies two vectors: 2*c and 2*c + 1.
        for (int c = 0; c < 4; ++c) {
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[c], (vec_t)vec_B[2*c]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[c], (vec_t)vec_B[2*c + 1]);
        }
    }
    SAVE_ACC(&acc_0, ii, jj);
    SAVE_ACC(&acc_1, ii, jj+4);
}
// Computes a tile covering 8 rows of A and 4 rows of B using two accumulators:
// acc_0 is saved at (ii, jj) and acc_1 at (ii + 4, jj). The inner dimension is
// consumed 4 columns at a time.
void KERNEL_8x4(int64_t ii, int64_t jj) {
    vec_t vec_A[8], vec_B[4], vec_C[4];  // vec_C is the scratch array SAVE_ACC uses
    acc_t acc_0, acc_1;
    __builtin_mma_xxsetaccz(&acc_0);
    __builtin_mma_xxsetaccz(&acc_1);
    for (int64_t l = 0; l < k; l += 4) {
        READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
        READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
        // Packed column c of A occupies two vectors: 2*c and 2*c + 1.
        for (int c = 0; c < 4; ++c) {
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2*c], vec_B[c]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[2*c + 1], vec_B[c]);
        }
    }
    SAVE_ACC(&acc_0, ii, jj);
    SAVE_ACC(&acc_1, ii+4, jj);
}
// Computes an 8x8 output tile with four 4x4 accumulators, saved at
// (ii, jj), (ii, jj + 4), (ii + 4, jj) and (ii + 4, jj + 4).
// The inner dimension is consumed 8 columns at a time, so k is expected to be
// a multiple of 8.
void KERNEL_8x8(int64_t ii, int64_t jj) {
    vec_t vec_A[16], vec_B[16], vec_C[4];  // vec_C is the scratch array SAVE_ACC uses
    acc_t acc_0, acc_1, acc_2, acc_3;
    __builtin_mma_xxsetaccz(&acc_0);
    __builtin_mma_xxsetaccz(&acc_1);
    __builtin_mma_xxsetaccz(&acc_2);
    __builtin_mma_xxsetaccz(&acc_3);
    // The counter is int64_t to match the type of k (it was int before).
    for (int64_t l = 0; l < k; l += 8) {
        READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
        READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
        // Each packed column occupies two vectors (x and x + 1) in both operands.
        for (int x = 0; x < 16; x += 2) {
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
            __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
            __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
        }
    }
    SAVE_ACC(&acc_0, ii, jj);
    SAVE_ACC(&acc_1, ii, jj+4);
    SAVE_ACC(&acc_2, ii+4, jj);
    SAVE_ACC(&acc_3, ii+4, jj+4);
}
// Recursively covers the output region [m0, m) x [n0, n).
// Picks the largest tile shape that fits the remaining extent (capped at 16
// per side), runs it over as many whole tiles as fit, then recurses on the
// strip of rows left below (mp..m) and the strip of columns left to the right
// (np..n). Returns without doing anything when either remaining extent is
// not positive.
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
    const int m_rem = MIN(m - m0, 16);
    const int n_rem = MIN(n - n0, 16);
    int64_t mc, nc;

    if (m_rem >= 8 && n_rem >= 8) {
        mc = 8;
        nc = 8;
        gemm<8,8>(m0, m, n0, n);
    } else if (m_rem >= 4 && n_rem >= 8) {
        mc = 4;
        nc = 8;
        gemm<4,8>(m0, m, n0, n);
    } else if (m_rem >= 8 && n_rem >= 4) {
        mc = 8;
        nc = 4;
        gemm<8,4>(m0, m, n0, n);
    } else if (m_rem >= 4 && n_rem >= 4) {
        mc = 4;
        nc = 4;
        gemm<4,4>(m0, m, n0, n);
    } else {
        // At least one side has fewer than 4 entries left. The tile is the
        // remaining extent on each side, clamped to 4.
        if (m_rem < 1 || n_rem < 1) {
            return;
        }
        mc = (m_rem < 4) ? m_rem : 4;
        nc = (n_rem < 4) ? n_rem : 4;
        gemm_small(m0, m, n0, n, mc, nc);
    }

    // First row / column not covered by whole tiles of the chosen shape.
    const int64_t mp = m0 + (m - m0) / mc * mc;
    const int64_t np = n0 + (n - n0) / nc * nc;
    mnpack(mp, m, n0, np);
    mnpack(m0, m, np, n);
}
// Handles edge tiles of RM x RN (each at most 4) with a single accumulator.
// The tiles inside [m0, m) x [n0, n) are numbered row-major and split into
// contiguous runs of `duty` tiles per thread; this thread processes the run
// selected by ith. Only the RM x RN corner of the 4x4 accumulator is written
// back to C.
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
    int64_t ytiles = (m - m0) / RM;
    int64_t xtiles = (n - n0) / RN;
    int64_t tiles = xtiles * ytiles;
    int64_t duty = (tiles + nth - 1) / nth;  // tiles per thread, rounded up
    int64_t start = duty * ith;
    int64_t end = start + duty;
    if (end > tiles)
        end = tiles;
    for (int64_t job = start; job < end; ++job) {
        int64_t ii = m0 + job / xtiles * RM;
        int64_t jj = n0 + job % xtiles * RN;
        vec_t vec_C[4];
        acc_t acc_0;
        __builtin_mma_xxsetaccz(&acc_0);
        vec_t vec_A[4], vec_B[4];
        // The counter is int64_t to match the type of k (it was int before).
        for (int64_t l = 0; l < k; l += 4) {
            if (RN >= 4 && RM == 1) {
                // Single row of A: load its four values directly, then fill
                // vec_A[1..3] with the 2nd, 3rd and 4th float of vec_A[0]
                // broadcast to every lane.
                float* a = const_cast<float*>(A+(ii)*lda+l);
                READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
                vec_A[0] = (vec_t)vec_xl(0,a);
                vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
                vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
                vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
            } else {
                READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
                READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
            }
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
        }
        __builtin_mma_disassemble_acc(vec_C, &acc_0);
        // Same addressing as SAVE_ACC, limited to the RM x RN part of the tile.
        for (int I = 0; I < RM; I++) {
            for (int J = 0; J < RN; J++) {
                *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
            }
        }
    }
}
// Runs the RM x RN micro-kernel over this thread's share of the whole tiles
// inside [m0, m) x [n0, n). Tiles are numbered row-major and divided into
// contiguous runs of `duty` tiles per thread. The `kernel` member is set to
// the KERNEL_* function matching <RM, RN>; for any other shape it is left
// unchanged.
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
    const int64_t ytiles = (m - m0) / RM;
    const int64_t xtiles = (n - n0) / RN;
    const int64_t tiles = xtiles * ytiles;
    const int64_t duty = (tiles + nth - 1) / nth;  // tiles per thread, rounded up
    const int64_t start = duty * ith;
    const int64_t limit = start + duty;
    const int64_t end = (limit > tiles) ? tiles : limit;

    if (RM == 4 && RN == 4) {
        kernel = &tinyBLAS_PPC::KERNEL_4x4;
    } else if (RM == 4 && RN == 8) {
        kernel = &tinyBLAS_PPC::KERNEL_4x8;
    } else if (RM == 8 && RN == 4) {
        kernel = &tinyBLAS_PPC::KERNEL_8x4;
    } else if (RM == 8 && RN == 8) {
        kernel = &tinyBLAS_PPC::KERNEL_8x8;
    }

    for (int64_t job = start; job < end; ++job) {
        const int64_t ii = m0 + job / xtiles * RM;
        const int64_t jj = n0 + job % xtiles * RN;
        (this->*kernel)(ii, jj);
    }
}
// Input operand A; rows are lda elements apart and are read by READ_BLOCK.
const
TA
*
const
A
;
// Input operand B; rows are ldb elements apart and are read by READ_BLOCK.
const
TB
*
const
B
;
// Output matrix; element (row, col) of a tile is written at C[col * ldc + row].
TC
*
C
;
// NOTE(review): At and Bt are not initialised by the constructor and are not
// referenced by any method visible in this class — presumably unused.
TA
*
At
;
TB
*
Bt
;
// Shared inner dimension iterated by the kernels.
const
int64_t
k
;
// Leading dimensions (row strides in elements) of A, B and C.
const
int64_t
lda
;
const
int64_t
ldb
;
const
int64_t
ldc
;
// Index of this thread and total thread count, used by gemm() and
// gemm_small() to pick this thread's range of tiles.
const
int
ith
;
const
int
nth
;
};
#endif
}
// namespace
/**
...
...
@@ -1073,6 +1712,16 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
ith
,
nth
};
tb
.
matmul
(
m
,
n
);
return
true
;
#elif defined(__MMA__)
if
(
k
%
8
)
return
false
;
tinyBLAS_PPC
<
float
,
float
,
float
>
tb
{
k
,
(
const
float
*
)
A
,
lda
,
(
const
float
*
)
B
,
ldb
,
(
float
*
)
C
,
ldc
,
ith
,
nth
};
tb
.
matmul
(
m
,
n
);
return
true
;
#else
return
false
;
#endif
...
...
@@ -1182,6 +1831,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif
}
case
GGML_TYPE_Q5_0
:
{
if
(
Btype
!=
GGML_TYPE_Q8_0
)
return
false
;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
tinyBLAS_Q0_AVX
<
block_q5_0
,
block_q8_0
,
float
>
tb
{
k
,
(
const
block_q5_0
*
)
A
,
lda
,
(
const
block_q8_0
*
)
B
,
ldb
,
(
float
*
)
C
,
ldc
,
ith
,
nth
};
tb
.
matmul
(
m
,
n
);
return
true
;
#else
return
false
;
#endif
}
case
GGML_TYPE_IQ4_NL
:
{
if
(
Btype
!=
GGML_TYPE_Q8_0
)
return
false
;
...
...
llama/unicode-data.cpp
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
@@ -33,7 +33,7 @@
#include <unordered_map>
#include <unordered_set>
const
std
::
vector
<
std
::
pair
<
uint32_t
,
uint16_t
>>
unicode_ranges_flags
=
{
// start, flags // last=next_start-1
const
std
::
initializer_list
<
std
::
pair
<
uint32_t
,
uint16_t
>>
unicode_ranges_flags
=
{
// start, flags // last=next_start-1
{
0x000000
,
0x0080
},
{
0x000020
,
0x0008
},
{
0x000021
,
0x0020
},
...
...
@@ -2337,7 +2337,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000
,
};
const
std
::
unordered_map
<
uint32_t
,
uint32_t
>
unicode_map_lowercase
=
{
// list is always in ascending order, to enable binary search
const
std
::
initializer_list
<
std
::
pair
<
uint32_t
,
uint32_t
>>
unicode_map_lowercase
=
{
{
0x000041
,
0x000061
},
{
0x000042
,
0x000062
},
{
0x000043
,
0x000063
},
...
...
@@ -3773,7 +3774,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
{
0x01E921
,
0x01E943
},
};
const
std
::
unordered_map
<
uint32_t
,
uint32_t
>
unicode_map_uppercase
=
{
// list is always in ascending order, to enable binary search
const
std
::
initializer_list
<
std
::
pair
<
uint32_t
,
uint32_t
>>
unicode_map_uppercase
=
{
{
0x000061
,
0x000041
},
{
0x000062
,
0x000042
},
{
0x000063
,
0x000043
},
...
...
@@ -5226,7 +5228,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
{
0x01E943
,
0x01E921
},
};
const
std
::
vector
<
range_nfd
>
unicode_ranges_nfd
=
{
// start, last, nfd
const
std
::
initializer_list
<
range_nfd
>
unicode_ranges_nfd
=
{
// start, last, nfd
{
0x000000
,
0x000000
,
0x000000
},
{
0x0000C0
,
0x0000C5
,
0x000041
},
{
0x0000C7
,
0x0000C7
,
0x000043
},
...
...
llama/unicode-data.h
View file @
527cc978
/**
* llama.cpp - commit
3f1ae2e32cde00c39b96be6d01c2997c29bae555
- do not edit this file
* llama.cpp - commit
40c6d79fb52f995f47507fedfeaae2ac05d9b35c
- do not edit this file
*
* MIT License
*
...
...
@@ -39,8 +39,8 @@ struct range_nfd {
static
const
uint32_t
MAX_CODEPOINTS
=
0x110000
;
extern
const
std
::
vector
<
std
::
pair
<
uint32_t
,
uint16_t
>>
unicode_ranges_flags
;
extern
const
std
::
initializer_list
<
std
::
pair
<
uint32_t
,
uint16_t
>>
unicode_ranges_flags
;
extern
const
std
::
unordered_set
<
uint32_t
>
unicode_set_whitespace
;
extern
const
std
::
unordered_map
<
uint32_t
,
uint32_t
>
unicode_map_lowercase
;
extern
const
std
::
unordered_map
<
uint32_t
,
uint32_t
>
unicode_map_uppercase
;
extern
const
std
::
vector
<
range_nfd
>
unicode_ranges_nfd
;
extern
const
std
::
initializer_list
<
std
::
pair
<
uint32_t
,
uint32_t
>
>
unicode_map_lowercase
;
extern
const
std
::
initializer_list
<
std
::
pair
<
uint32_t
,
uint32_t
>
>
unicode_map_uppercase
;
extern
const
std
::
initializer_list
<
range_nfd
>
unicode_ranges_nfd
;
Prev
1
…
10
11
12
13
14
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment