orangecat / ollama · Commits · 527cc978

Commit 527cc978 (unverified)
Authored Dec 10, 2024 by Jeffrey Morgan; committed by GitHub on Dec 10, 2024

llama: update vendored code to commit 40c6d79f (#7875)

Parent: a37f4a86
Changes: 288
Showing 20 changed files with 1372 additions and 605 deletions (+1372, -605)
llama/patches/0002-pretokenizer.patch                +3    -3
llama/patches/0003-embeddings.patch                  +10   -14
llama/patches/0003-metal.patch                       +0    -54
llama/patches/0004-clip-unicode.patch                +6    -6
llama/patches/0004-ggml-metal.patch                  +0    -24
llama/patches/0005-solar-pro.patch                   +49   -44
llama/patches/0006-conditional-fattn.patch           +6    -6
llama/patches/0007-blas.patch                        +26   -0
llama/patches/0008-add-mllama-support.patch          +169  -115
llama/patches/0009-add-unpad-operator.patch          +126  -139
llama/patches/0010-fix-deepseek-deseret-regex.patch  +12   -6
llama/patches/0011-relative-include-paths.patch      +64   -0
llama/runner/runner.go                               +1    -2
llama/sampling.cpp                                   +156  -109
llama/sampling.h                                     +42   -21
llama/sampling_ext.cpp                               +15   -29
llama/sampling_ext.h                                 +10   -23
llama/sgemm.cpp                                      +665  -0
llama/unicode-data.cpp                               +7    -5
llama/unicode-data.h                                 +5    -5
llama/patches/0002-pretokenizer.patch
...
@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index 4c0a1bb6..800dfb95 100644
+index 6a6f4c2a..fa09f3b3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
+@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
...
...
@@ -29,7 +29,7 @@ index 4c0a1bb6..800dfb95 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
-@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
+@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {
...
...
llama/patches/0005-embeddings.patch → llama/patches/0003-embeddings.patch
...
...
@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] embeddings
---
- src/llama.cpp | 15 +++++++++------
- 1 file changed, 9 insertions(+), 6 deletions(-)
+ src/llama.cpp | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index 800dfb95..a639522d 100644
+index fa09f3b3..d1791af0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -16920,7 +16920,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
...
...
@@ -20,20 +20,15 @@ index 800dfb95..a639522d 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17192,20 +17192,23 @@ static int llama_decode_internal(
// no output
@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
-
} else if (cparams.embeddings) {
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ embd = ggml_graph_node(gf, i);
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
- embd = ggml_graph_node(gf, i);
@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
break;
}
}
...
...
@@ -46,6 +41,7 @@ index 800dfb95..a639522d 100644
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
- ggml_backend_sched_alloc_graph(lctx.sched, gf);
+ ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
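Note on the embeddings patch above: with the reworked extraction, the pooled vector that llama_decode_internal copies from the "result_embd_pooled" graph node is read back through the normal llama.cpp API. A minimal sketch, assuming a context created with cparams.embeddings = true and a pooling type such as LLAMA_POOLING_TYPE_MEAN (this helper is illustrative, not part of the commit):

#include "llama.h"

// Sketch: decode one batch, then fetch the pooled embedding for sequence 0.
// The returned vector has llama_n_embd(model) elements.
static const float * pooled_embedding(llama_context * ctx, llama_batch batch) {
    if (llama_decode(ctx, batch) != 0) {
        return nullptr;   // decode failed
    }
    return llama_get_embeddings_seq(ctx, 0);
}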
llama/patches/0003-metal.patch
deleted 100644 → 0
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] metal
---
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9da08fe2..3a433703 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
// to the matrix-vector kernel
int ne11_mm_min = 1;
-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
llama/patches/0006-clip-unicode.patch → llama/patches/0004-clip-unicode.patch
...
...
@@ -8,12 +8,12 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 14e02c8d..6e849d8e 100644
+index d7c94352..427d5e02 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -44,6 +44,19 @@
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -56,6 +56,19 @@
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
...
...
@@ -31,7 +31,7 @@ index 14e02c8d..6e849d8e 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
-@@ -1225,8 +1238,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
...
...
@@ -62,7 +62,7 @@ index 14e02c8d..6e849d8e 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
-@@ -1266,7 +1300,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
...
...
llama/patches/0004-ggml-metal.patch
deleted 100644 → 0
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Wed, 12 Jun 2024 12:18:40 -0700
Subject: [PATCH] ggml-metal
---
ggml/src/ggml-metal.m | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 3a433703..829c5e39 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -392,8 +392,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
#if GGML_METAL_EMBED_LIBRARY
GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
- extern const char ggml_metallib_start[];
- extern const char ggml_metallib_end[];
+ extern const char *ggml_metallib_start;
+ extern const char *ggml_metallib_end;
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
llama/patches/0007-solar-pro.patch → llama/patches/0005-solar-pro.patch
...
...
@@ -11,14 +11,14 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
- src/llama.cpp | 269 +++++++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 255 insertions(+), 14 deletions(-)
+ src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 253 insertions(+), 14 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
-index a639522d..83b80b59 100644
+index d1791af0..b01770d0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -217,6 +217,7 @@ enum llm_arch {
+@@ -195,6 +195,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
...
...
@@ -26,7 +26,7 @@ index a639522d..83b80b59 100644
LLM_ARCH_UNKNOWN,
};
-@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
...
...
@@ -34,7 +34,7 @@ index a639522d..83b80b59 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
-@@ -327,6 +329,7 @@ enum llm_kv {
+@@ -306,6 +308,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
...
...
@@ -42,7 +42,7 @@ index a639522d..83b80b59 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
-@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
...
...
@@ -76,17 +76,17 @@ index a639522d..83b80b59 100644
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -608,6 +612,7 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_COUNT,    "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE,          "%s.rope.freq_base" },
@@ -603,6 +607,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_BSKCN_TV,
};
-static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -1527,6 +1532,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
+static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
...
...
@@ -111,15 +111,15 @@ index a639522d..83b80b59 100644
{
LLM_ARCH_UNKNOWN,
{
-@@ -2360,6 +2383,7 @@ enum e_model {
+@@ -2401,6 +2424,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
MODEL_35B,
-@@ -2409,6 +2433,8 @@ struct llama_hparams {
+@@ -2451,6 +2475,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...
...
@@ -128,7 +128,7 @@ index a639522d..83b80b59 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
-@@ -2479,6 +2505,7 @@ struct llama_hparams {
+@@ -2521,6 +2547,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
...
...
@@ -136,7 +136,7 @@ index a639522d..83b80b59 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2588,6 +2615,14 @@ struct llama_hparams {
+@@ -2630,6 +2657,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
...
...
@@ -151,7 +151,7 @@ index a639522d..83b80b59 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2769,6 +2804,8 @@ struct llama_layer {
+@@ -2816,6 +2851,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
...
...
@@ -160,7 +160,7 @@ index a639522d..83b80b59 100644
};
// very similar to llama_batch,
-@@ -6134,6 +6171,21 @@ static void llm_load_hparams(
+@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
...
...
@@ -182,46 +182,51 @@ index a639522d..83b80b59 100644
default: (void)0;
}
@@ -8831,6 +8883,38 @@ static bool llm_load_tensors(
@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -16179,6 +16263,158 @@ struct llm_build_context {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16652,6 +16734,158 @@ struct llm_build_context {
return gf;
}
...
...
@@ -239,7 +244,7 @@ index a639522d..83b80b59 100644
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
...
...
@@ -380,7 +385,7 @@ index a639522d..83b80b59 100644
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-@@ -16443,6 +16679,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
...
...
@@ -391,7 +396,7 @@ index a639522d..83b80b59 100644
default:
GGML_ABORT("fatal error");
}
-@@ -19589,6 +19829,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
...
...
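Note on the solar-pro patch: its description says each affected layer stores a two-element tensor holding (bskcn_tv, 1 - bskcn_tv). A hedged sketch of how such a pair is typically applied to blend a saved block-skip input with the current hidden state in a ggml graph; names like skip_inp and layer.bskcn_tv are assumptions, not verbatim from the patch:

// View the two scalars out of the [2] tensor, then blend:
// cur = bskcn_tv * skip_inp + (1 - bskcn_tv) * cur
struct ggml_tensor * tv0 = ggml_view_1d(ctx0, layer.bskcn_tv, 1, 0);
struct ggml_tensor * tv1 = ggml_view_1d(ctx0, layer.bskcn_tv, 1, ggml_element_size(layer.bskcn_tv));
cur = ggml_add(ctx0,
        ggml_mul(ctx0, skip_inp, tv0),   // 1-element scalars broadcast over the hidden dim
        ggml_mul(ctx0, cur,      tv1));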
llama/patches/0008-conditional-fattn.patch → llama/patches/0006-conditional-fattn.patch
...
...
@@ -4,14 +4,14 @@ Date: Wed, 9 Oct 2024 17:26:23 -0700
Subject: [PATCH] conditional-fattn
---
- ggml/src/ggml-cuda.cu | 2 ++
+ ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 809d6ab1..fe77b81c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2347,9 +2347,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 52aec229..cbf4fddf 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
...
...
llama/patches/0009-blas.patch → llama/patches/0007-blas.patch
...
...
@@ -4,22 +4,23 @@ Date: Mon, 30 Sep 2024 16:31:04 -0700
Subject: [PATCH] blas
---
- ggml/src/ggml-blas.cpp | 4 ++++
+ ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index 6d99c6be..8e1ab99d 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index ec158dfa..b3ac1fa4 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -1,3 +1,5 @@
+#ifdef GGML_USE_BLAS
+
#include "ggml-impl.h"
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
@@ -366,3 +368,5 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
}
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
+
+#endif
+#endif // GGML_USE_BLAS
\ No newline at end of file
llama/patches/0010-add-mllama-support.patch → llama/patches/0008-add-mllama-support.patch
...
...
@@ -12,29 +12,46 @@ kv cache once per run
remaining is to implement the cross attention mask
---
- examples/llava/llava.cpp | 2 +-
+ examples/llava/llava.cpp | 5 +-
include/llama.h | 5 +
- src/llama.cpp | 447 +++++++++++++++++++++++++++++++++++++--
- 3 files changed, 436 insertions(+), 18 deletions(-)
+ src/llama.cpp | 477 +++++++++++++++++++++++++++++++++++++--
+ 3 files changed, 467 insertions(+), 20 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
-index 8558c6bd..37b2f2e2 100644
+index 4ca53a0b..d56644a8 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
if (n_eval > n_batch) {
@@ -412,7 +412,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
- llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+ llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/include/llama.h b/include/llama.h
-index 7cae1bbe..aca09310 100644
+index e85f459f..aba85f86 100644
--- a/include/llama.h
+++ b/include/llama.h
-@@ -240,6 +240,7 @@ extern "C" {
+@@ -245,6 +245,7 @@ extern "C" {
llama_token * token;
float * embd;
...
...
@@ -42,7 +59,7 @@ index 7cae1bbe..aca09310 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
-@@ -423,6 +424,10 @@ extern "C" {
+@@ -419,6 +420,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
...
...
@@ -54,10 +71,10 @@ index 7cae1bbe..aca09310 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
-index 83b80b59..35748488 100644
+index b01770d0..46881642 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
-@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
+@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
enum llm_arch {
LLM_ARCH_LLAMA,
...
...
@@ -65,7 +82,7 @@ index 83b80b59..35748488 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
-@@ -223,6 +224,7 @@ enum llm_arch {
+@@ -201,6 +202,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
...
...
@@ -73,7 +90,7 @@ index 83b80b59..35748488 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
-@@ -330,6 +332,7 @@ enum llm_kv {
+@@ -309,6 +311,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...
...
@@ -81,15 +98,15 @@ index 83b80b59..35748488 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
-@@ -439,6 +442,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -613,6 +617,14 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base" },
@@ -608,6 +612,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
...
...
@@ -103,8 +120,8 @@ index 83b80b59..35748488 100644
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
};
-static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
-@@ -642,6 +654,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
+static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
...
...
@@ -145,7 +162,7 @@ index 83b80b59..35748488 100644
{
LLM_ARCH_BAICHUAN,
{
-@@ -2390,6 +2436,7 @@ enum e_model {
+@@ -2432,6 +2478,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
...
...
@@ -153,7 +170,7 @@ index 83b80b59..35748488 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
-@@ -2434,6 +2481,7 @@ struct llama_hparams {
+@@ -2476,6 +2523,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
...
...
@@ -161,7 +178,7 @@ index 83b80b59..35748488 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
-@@ -2502,10 +2550,11 @@ struct llama_hparams {
+@@ -2544,10 +2592,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
...
...
@@ -169,15 +186,15 @@ index 83b80b59..35748488 100644
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
- if (this->n_ff_arr != other.n_ff_arr) return true;
- if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->n_head_arr != other.n_head_arr)       return true;
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
+ if (this->n_ff_arr != other.n_ff_arr)           return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr)     return true;
+ if (this->n_head_arr != other.n_head_arr) return true;
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
+ if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
+ if (this->cross_attn_layers != other.cross_attn_layers) return true;
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-@@ -2623,6 +2672,10 @@ struct llama_hparams {
+@@ -2665,6 +2714,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
}
...
...
@@ -188,7 +205,7 @@ index 83b80b59..35748488 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-@@ -2652,6 +2705,9 @@ struct llama_cparams {
+@@ -2694,6 +2747,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
...
...
@@ -198,7 +215,7 @@ index 83b80b59..35748488 100644
enum llama_pooling_type pooling_type;
-@@ -2806,6 +2862,16 @@ struct llama_layer {
+@@ -2853,6 +2909,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
...
...
@@ -215,7 +232,7 @@ index 83b80b59..35748488 100644
};
// very similar to llama_batch,
-@@ -3452,6 +3518,8 @@ struct llama_context {
+@@ -3439,6 +3505,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
...
...
@@ -224,13 +241,34 @@ index 83b80b59..35748488 100644
};
struct llama_lora_weight {
-@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
+@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const llama_model::buft_list_t * buft_list;
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else {
+ buft_list = &model.cpu_buft_list;
+ }
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
...
...
@@ -243,17 +281,17 @@ index 83b80b59..35748488 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
-@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
+@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
}
// zero-out the per-layer hparams
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.n_head_arr.begin(),        hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(),     hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(),          hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
...
...
@@ -263,7 +301,7 @@ index 83b80b59..35748488 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
-@@ -5514,7 +5596,7 @@ static void llm_load_hparams(
+@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...
...
@@ -272,7 +310,7 @@ index 83b80b59..35748488 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
-@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
+@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
}
}
} break;
...
...
@@ -289,63 +327,78 @@ index 83b80b59..35748488 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
- {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8});
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
+ layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
+ layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
+ layer.cross_attn_q_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128});
+ layer.cross_attn_q_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd});
+ layer.cross_attn_v_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024});
+ layer.cross_attn_attn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1});
+ layer.cross_attn_mlp_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1});
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ } else {
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
- case LLM_ARCH_GROK:
+ case LLM_ARCH_MINICPM3:
{
if (n_expert == 0) {
@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
...
...
@@ -354,7 +407,7 @@ index 83b80b59..35748488 100644
}
if (params.vocab_only) {
-@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
+@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
...
...
@@ -376,7 +429,7 @@ index 83b80b59..35748488 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
-@@ -10167,6 +10323,7 @@ struct llm_build_context {
+@@ -10513,6 +10696,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
...
...
@@ -384,18 +437,10 @@ index 83b80b59..35748488 100644
}
void free() {
@@ -10754,6 +10911,239 @@ struct llm_build_context {
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
@@ -10992,6 +11176,240 @@ struct llm_build_context {
return gf;
}
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
...
...
@@ -410,7 +455,7 @@ index 83b80b59..35748488 100644
+ struct ggml_tensor * inpL;
+ struct ggml_tensor * inpCAS;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+ inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
+
+ // inp_pos - contains the positions
...
...
@@ -429,7 +474,7 @@ index 83b80b59..35748488 100644
+ cb(cur, "attn_norm", il);
+
+ if (hparams.cross_attention_layers(il)) {
+ if (!batch.embd && !cparams.cross_attn) {
+ if (!ubatch.embd && !cparams.cross_attn) {
+ continue;
+ }
+
...
...
@@ -447,7 +492,7 @@ index 83b80b59..35748488 100644
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur, * Vcur;
+ if (batch.embd) {
+ if (ubatch.embd) {
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+ cb(Kcur, "Kcur", il);
+
...
...
@@ -621,10 +666,19 @@ index 83b80b59..35748488 100644
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
...
...
@@ -635,14 +689,14 @@ index 83b80b59..35748488 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch)
{
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}
if (batch.embd) {
if (ubatch.embd) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_tokens = ubatch.n_tokens;
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
+ // zero out inp_embd since it's not used
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
...
...
@@ -650,24 +704,24 @@ index 83b80b59..35748488 100644
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}
if (batch.pos && lctx.inp_pos) {
@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
- lctx.sbatch.from_batch(batch_all, n_embd,
+ lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
-@@ -17638,7 +18041,7 @@ static int llama_encode_internal(
+@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
...
...
@@ -676,7 +730,7 @@ index 83b80b59..35748488 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
-@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
...
...
@@ -687,7 +741,7 @@ index 83b80b59..35748488 100644
}
size_t total_size_org = 0;
-@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
...
...
@@ -695,7 +749,7 @@ index 83b80b59..35748488 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
-@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
+@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
...
...
@@ -705,8 +759,8 @@ index 83b80b59..35748488 100644
+
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
...
...
@@ -714,7 +768,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
-@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
...
...
@@ -722,7 +776,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
-@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
+@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
...
...
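Note on the mllama patch: the llava diff above shows llama_batch gaining an explicit n_embd field so image embeddings can be submitted along with their width. A hedged sketch of filling such a batch; helper names like image_embd and n_img_tokens are illustrative, and only n_embd is the field actually added by this patch:

#include <cstring>
#include "llama.h"

// Sketch: submit precomputed image embeddings as one batch on sequence 0.
static bool eval_image_embd(llama_context * ctx, const float * image_embd,
                            int32_t n_img_tokens, int32_t n_embd, llama_pos n_past) {
    llama_batch batch = llama_batch_init(n_img_tokens, /*embd =*/ n_embd, /*n_seq_max =*/ 1);
    batch.n_tokens = n_img_tokens;
    batch.n_embd   = n_embd;   // field introduced by this patch
    std::memcpy(batch.embd, image_embd, (size_t) n_img_tokens * n_embd * sizeof(float));
    for (int32_t i = 0; i < n_img_tokens; ++i) {
        batch.pos[i]       = n_past + i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = false;
    }
    const bool ok = llama_decode(ctx, batch) == 0;
    llama_batch_free(batch);
    return ok;
}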
llama/patches/0011-add-unpad-operator.patch → llama/patches/0009-add-unpad-operator.patch
...
...
@@ -4,20 +4,21 @@ Date: Thu, 17 Oct 2024 17:19:25 -0700
Subject: [PATCH] add unpad operator
---
ggml/include/ggml.h | 10 ++++
ggml/src/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal.m | 33 ++++++++++++++
ggml/src/ggml-metal.metal | 45 ++++++++++++++++++
ggml/src/ggml.c | 93 +++++++++++++++++++++++++++++++++++++-
7 files changed, 230 insertions(+), 2 deletions(-)
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index ce3d92cb..962cb5f7 100644
+index 65cb92c4..acbcccc6 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
-@@ -506,6 +506,7 @@ extern "C" {
+@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
...
...
@@ -25,7 +26,7 @@ index ce3d92cb..962cb5f7 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
-@@ -1764,6 +1765,15 @@ extern "C" {
+@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);
...
...
@@ -41,11 +42,93 @@ index ce3d92cb..962cb5f7 100644
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index fe77b81c..6e84af56 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2270,6 +2270,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
}
}
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ float * dst_ptr = (float *) dst->data;
+
+ // TODO: optimize
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ dst_ptr[dst_idx] = *src_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_unpad_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
// ggml_compute_forward_arange
@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
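Note on the unpad operator: the CPU kernel added above copies the overlapping region of the source into a smaller destination, i.e. GGML_OP_UNPAD is the inverse of GGML_OP_PAD. A hedged usage sketch based on the ggml_unpad constructor shown further down in this diff; ctx is an existing ggml_context and the tensor t is illustrative:

// Trim p0..p3 elements from the end of each dimension.
struct ggml_tensor * t       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 8);
struct ggml_tensor * trimmed = ggml_unpad(ctx, t, 4, 2, 0, 0);
// trimmed->ne is {12, 6, 1, 1}; each element is copied from the same
// coordinates in t, as in ggml_compute_forward_unpad_f32 above.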
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
...
...
@@ -55,7 +138,7 @@ index fe77b81c..6e84af56 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
-@@ -2992,6 +2995,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
+@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
...
...
@@ -126,35 +209,35 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 829c5e39..25702d85 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -193,6 +193,7 @@
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -689,6 +690,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -846,6 +848,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return false;
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
-@@ -2655,6 +2658,36 @@ static void ggml_metal_encode_node(
+@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
...
...
@@ -191,11 +274,11 @@ index 829c5e39..25702d85 100644
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 2b200032..09887511 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2029,6 +2029,51 @@ kernel void kernel_pad_f32(
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
}
}
...
...
@@ -248,10 +331,10 @@ index 2b200032..09887511 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bcbc32d9..f4864ac8 100644
index 1a9a7efa..ea2b259b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2997,6 +2997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
"UPSCALE",
"PAD",
...
...
@@ -259,16 +342,16 @@ index bcbc32d9..f4864ac8 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -3030,7 +3031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3091,6 +3092,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
"upscale(x)",
"pad(x)",
...
...
@@ -276,16 +359,16 @@ index bcbc32d9..f4864ac8 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -3124,7 +3126,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6955,6 +6957,32 @@ struct ggml_tensor * ggml_pad(
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
return result;
}
...
...
@@ -295,12 +378,6 @@ index bcbc32d9..f4864ac8 100644
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0, int p1, int p2, int p3) {
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ABORT("fatal error"); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] - p0,
...
...
@@ -309,7 +386,6 @@ index bcbc32d9..f4864ac8 100644
+ a->ne[3] - p3);
+
+ result->op = GGML_OP_UNPAD;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+
+ return result;
...
...
@@ -318,92 +394,3 @@ index bcbc32d9..f4864ac8 100644
// ggml_arange
struct ggml_tensor * ggml_arange(
@@ -15312,6 +15340,58 @@ static void ggml_compute_forward_pad(
}
}
+static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ float * dst_ptr = (float *) dst->data;
+
+ // TODO: optimize
+
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ dst_ptr[dst_idx] = *src_ptr;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_unpad_f32(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
// ggml_compute_forward_arange
@@ -17294,6 +17374,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -18369,6 +18453,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
+ case GGML_OP_UNPAD:
+ {
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ARANGE:
{
GGML_ABORT("fatal error"); // TODO: not implemented
@@ -19165,6 +19253,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
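Not part of the patch itself — a minimal sketch of how the new operator pairs with ggml_pad once the patched headers are in place. The context size, tensor shape and padding amounts below are illustrative placeholders, not values taken from the patch.

#include "ggml.h"

// Sketch only: pad a small f32 tensor, then crop it back with the ggml_unpad
// added by this patch. Assumes ggml_unpad(ctx, a, p0, p1, p2, p3) as declared above.
static void unpad_sketch(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
    struct ggml_tensor * padded  = ggml_pad  (ctx, a,      1, 1, 0, 0);   // 3 x 4
    struct ggml_tensor * cropped = ggml_unpad(ctx, padded, 1, 1, 0, 0);   // back to 2 x 3

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, cropped);
    // fill a->data here, then run the graph with the CPU backend of your ggml build,
    // e.g. ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    ggml_free(ctx);
}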
llama/patches/0012-fix-deepseek-deseret-regex.patch → llama/patches/0010-fix-deepseek-deseret-regex.patch
...
...
@@ -7,11 +7,11 @@ On windows compiled with gcc the c++ regex library failed to handle
the characters
---
src/llama-vocab.cpp | 2 +-
 src/unicode.cpp     | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)
 src/unicode.cpp     | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d2f34ddd..3ef6af19 100644
index d1dc9627..05ef0e71 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
...
...
@@ -24,7 +24,7 @@ index d2f34ddd..3ef6af19 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index f4e941cd..9d78ff16 100644
index 3d459263..51dd81fb 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
...
...
@@ -39,7 +39,7 @@ index f4e941cd..9d78ff16 100644
#include "unicode.h"
#include "unicode-data.h"
@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
...
...
@@ -58,7 +58,13 @@ index f4e941cd..9d78ff16 100644
+ free(wbuf);
+ return ret;
+#else
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif
return conv.from_bytes(s);
+#endif
}
...
...
llama/patches/0011-relative-include-paths.patch
0 → 100644
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths
---
ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +-
ggml/src/ggml-quants.c | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 11152385..bbf8934e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -4,7 +4,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include <math.h>
#include <string.h>
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 111ff3b0..df0bd3c6 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
-#include "amx/amx.h"
+#include "amx.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 77e5d87a..91476ad0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -3,7 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6..49ab3daf 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include <math.h>
llama/runner/runner.go
...
...
@@ -559,7 +559,6 @@ type Options struct {
	TopK             int     `json:"top_k"`
	TopP             float32 `json:"top_p"`
	MinP             float32 `json:"min_p"`
	TFSZ             float32 `json:"tfs_z"`
	TypicalP         float32 `json:"typical_p"`
	RepeatLastN      int     `json:"repeat_last_n"`
	Temperature      float32 `json:"temperature"`
...
...
@@ -632,7 +631,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
	samplingParams.TopK = req.TopK
	samplingParams.TopP = req.TopP
	samplingParams.MinP = req.MinP
	samplingParams.TfsZ = req.TFSZ
	samplingParams.TypicalP = req.TypicalP
	samplingParams.Temp = req.Temperature
	samplingParams.RepeatLastN = req.RepeatLastN
...
...
@@ -930,6 +928,7 @@ func Execute(args []string) error {
	level := slog.LevelInfo
	if *verbose {
		level = slog.LevelDebug
		llama.EnableDebug()
	}
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level: level,
...
...
llama/sampling.cpp
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
...
...
@@ -124,8 +124,8 @@ struct ring_buffer {
    std::vector<T> data;
};
struct gpt_sampler {
    gpt_sampler_params params;
struct common_sampler {
    common_params_sampling params;
    struct llama_sampler * grmr;
    struct llama_sampler * chain;
...
...
@@ -151,26 +151,28 @@ struct gpt_sampler {
}
};
std::string gpt_sampler_params::print() const {
std::string common_params_sampling::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            top_k, tfs_z, top_p, min_p, typ_p, temp,
            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
}

struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

    auto * result = new gpt_sampler {
    auto * result = new common_sampler {
        /* .params = */ params,
        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
        /* .chain  = */ llama_sampler_chain_init(lparams),
...
...
@@ -197,60 +199,60 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
            params.penalize_nl,
            params.ignore_eos));

    if (params.temp > 0.0f) {
        if (params.mirostat == 0) {
            for (const auto & cnstr : params.samplers) {
                switch (cnstr) {
                    case GPT_SAMPLER_TYPE_TOP_K:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.top_k));
    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
            switch (cnstr) {
                case COMMON_SAMPLER_TYPE_DRY:
                    {
                        std::vector<const char *> c_breakers;
                        c_breakers.reserve(params.dry_sequence_breakers.size());
                        for (const auto & str : params.dry_sequence_breakers) {
                            c_breakers.push_back(str.c_str());
                        }
                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry(model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    }
                    break;
                case GPT_SAMPLER_TYPE_TOP_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p(params.top_p, params.min_keep));
                    break;
                case GPT_SAMPLER_TYPE_MIN_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p(params.min_p, params.min_keep));
                    break;
                case GPT_SAMPLER_TYPE_TFS_Z:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                    break;
                case GPT_SAMPLER_TYPE_TYPICAL_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical(params.typ_p, params.min_keep));
                    break;
                case GPT_SAMPLER_TYPE_TEMPERATURE:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
                case COMMON_SAMPLER_TYPE_TOP_K:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.top_k));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p(params.top_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_MIN_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p(params.min_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_XTC:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                    break;
                case COMMON_SAMPLER_TYPE_TYPICAL_P:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical(params.typ_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TEMPERATURE:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
                case COMMON_SAMPLER_TYPE_INFILL:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill(model));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        if (params.n_probs > 0) {
            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
            //
            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
            // it is much faster, since we avoid sorting all tokens and should give a good approximation
            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
        GGML_ASSERT(false && "unknown mirostat version");
    }

    return result;
}

void gpt_sampler_free(struct gpt_sampler * gsmpl) {
void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);
...
...
@@ -260,7 +262,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
}
}
void
gpt
_sampler_accept
(
struct
gpt
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
)
{
void
common
_sampler_accept
(
struct
common
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
)
{
if
(
accept_grammar
)
{
llama_sampler_accept
(
gsmpl
->
grmr
,
token
);
}
...
...
@@ -270,14 +272,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
gsmpl
->
prev
.
push_back
(
token
);
}
void
gpt
_sampler_reset
(
struct
gpt
_sampler
*
gsmpl
)
{
void
common
_sampler_reset
(
struct
common
_sampler
*
gsmpl
)
{
llama_sampler_reset
(
gsmpl
->
grmr
);
llama_sampler_reset
(
gsmpl
->
chain
);
}
struct
gpt
_sampler
*
gpt
_sampler_clone
(
gpt
_sampler
*
gsmpl
)
{
return
new
gpt
_sampler
{
struct
common
_sampler
*
common
_sampler_clone
(
common
_sampler
*
gsmpl
)
{
return
new
common
_sampler
{
/* .params = */
gsmpl
->
params
,
/* .grmr = */
llama_sampler_clone
(
gsmpl
->
grmr
),
/* .chain = */
llama_sampler_clone
(
gsmpl
->
chain
),
...
...
@@ -287,7 +289,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
};
}
void
gpt
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
gpt
_sampler
*
gsmpl
)
{
void
common
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
common
_sampler
*
gsmpl
)
{
// TODO: measure grammar performance
if
(
gsmpl
)
{
...
...
@@ -298,7 +300,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
}
}
llama_token
gpt
_sampler_sample
(
struct
gpt
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
)
{
llama_token
common
_sampler_sample
(
struct
common
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
)
{
gsmpl
->
set_logits
(
ctx
,
idx
);
auto
&
grmr
=
gsmpl
->
grmr
;
...
...
@@ -344,21 +346,60 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
return
cur_p
.
data
[
cur_p
.
selected
].
id
;
}
uint32_t
gpt_sampler_get_seed
(
const
struct
gpt_sampler
*
gsmpl
)
{
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
std
::
vector
<
int
>
&
idxs
,
const
llama_tokens
&
draft
,
bool
grammar_first
)
{
GGML_ASSERT
(
idxs
.
size
()
==
draft
.
size
()
+
1
&&
"idxs.size() must be draft.size() + 1"
);
std
::
vector
<
llama_token
>
result
;
result
.
reserve
(
idxs
.
size
());
size_t
i
=
0
;
for
(;
i
<
draft
.
size
();
i
++
)
{
const
llama_token
id
=
common_sampler_sample
(
gsmpl
,
ctx
,
idxs
[
i
],
grammar_first
);
common_sampler_accept
(
gsmpl
,
id
,
true
);
result
.
push_back
(
id
);
if
(
draft
[
i
]
!=
id
)
{
break
;
}
}
if
(
i
==
draft
.
size
())
{
const
llama_token
id
=
common_sampler_sample
(
gsmpl
,
ctx
,
idxs
[
i
],
grammar_first
);
common_sampler_accept
(
gsmpl
,
id
,
true
);
result
.
push_back
(
id
);
}
return
result
;
}
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
llama_tokens
&
draft
,
bool
grammar_first
)
{
std
::
vector
<
int
>
idxs
(
draft
.
size
()
+
1
);
for
(
size_t
i
=
0
;
i
<
idxs
.
size
();
++
i
)
{
idxs
[
i
]
=
i
;
}
return
common_sampler_sample_and_accept_n
(
gsmpl
,
ctx
,
idxs
,
draft
,
grammar_first
);
}
uint32_t
common_sampler_get_seed
(
const
struct
common_sampler
*
gsmpl
)
{
return
llama_sampler_get_seed
(
gsmpl
->
chain
);
}
// helpers
llama_token_data_array
*
gpt
_sampler_get_candidates
(
struct
gpt
_sampler
*
gsmpl
)
{
llama_token_data_array
*
common
_sampler_get_candidates
(
struct
common
_sampler
*
gsmpl
)
{
return
&
gsmpl
->
cur_p
;
}
llama_token
gpt
_sampler_last
(
const
struct
gpt
_sampler
*
gsmpl
)
{
llama_token
common
_sampler_last
(
const
struct
common
_sampler
*
gsmpl
)
{
return
gsmpl
->
prev
.
rat
(
0
);
}
std
::
string
gpt
_sampler_print
(
const
struct
gpt
_sampler
*
gsmpl
)
{
std
::
string
common
_sampler_print
(
const
struct
common
_sampler
*
gsmpl
)
{
std
::
string
result
=
"logits "
;
for
(
int
i
=
0
;
i
<
llama_sampler_chain_n
(
gsmpl
->
chain
);
i
++
)
{
...
...
@@ -369,7 +410,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
return
result
;
}
std
::
string
gpt
_sampler_prev_str
(
gpt
_sampler
*
gsmpl
,
llama_context
*
ctx_main
,
int
n
)
{
std
::
string
common
_sampler_prev_str
(
common
_sampler
*
gsmpl
,
llama_context
*
ctx_main
,
int
n
)
{
n
=
std
::
min
(
n
,
(
int
)
gsmpl
->
prev
.
size
());
if
(
n
<=
0
)
{
...
...
@@ -384,63 +425,67 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
GGML_ASSERT
(
id
!=
LLAMA_TOKEN_NULL
&&
"null token in the sampling history - should not happen"
);
result
+=
llama
_token_to_piece
(
ctx_main
,
id
);
result
+=
common
_token_to_piece
(
ctx_main
,
id
);
}
return
result
;
}
char
gpt
_sampler_type_to_chr
(
enum
gpt
_sampler_type
cnstr
)
{
char
common
_sampler_type_to_chr
(
enum
common
_sampler_type
cnstr
)
{
switch
(
cnstr
)
{
case
GPT_SAMPLER_TYPE_TOP_K
:
return
'k'
;
case
GPT_SAMPLER_TYPE_TFS_Z
:
return
'f'
;
case
GPT_SAMPLER_TYPE_TYPICAL_P
:
return
'y'
;
case
GPT_SAMPLER_TYPE_TOP_P
:
return
'p'
;
case
GPT_SAMPLER_TYPE_MIN_P
:
return
'm'
;
case
GPT_SAMPLER_TYPE_TEMPERATURE
:
return
't'
;
case
COMMON_SAMPLER_TYPE_DRY
:
return
'd'
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
return
'k'
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
return
'y'
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
return
'p'
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
return
'm'
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
return
't'
;
case
COMMON_SAMPLER_TYPE_XTC
:
return
'x'
;
case
COMMON_SAMPLER_TYPE_INFILL
:
return
'i'
;
default
:
return
'?'
;
}
}
std
::
string
gpt
_sampler_type_to_str
(
enum
gpt
_sampler_type
cnstr
)
{
std
::
string
common
_sampler_type_to_str
(
enum
common
_sampler_type
cnstr
)
{
switch
(
cnstr
)
{
case
GPT_SAMPLER_TYPE_TOP_K
:
return
"top_k"
;
case
GPT_SAMPLER_TYPE_TFS_Z
:
return
"tfs_z"
;
case
GPT_SAMPLER_TYPE_TYPICAL_P
:
return
"typ_p"
;
case
GPT_SAMPLER_TYPE_TOP_P
:
return
"top_p"
;
case
GPT_SAMPLER_TYPE_MIN_P
:
return
"min_p"
;
case
GPT_SAMPLER_TYPE_TEMPERATURE
:
return
"temperature"
;
case
COMMON_SAMPLER_TYPE_DRY
:
return
"dry"
;
case
COMMON_SAMPLER_TYPE_TOP_K
:
return
"top_k"
;
case
COMMON_SAMPLER_TYPE_TYPICAL_P
:
return
"typ_p"
;
case
COMMON_SAMPLER_TYPE_TOP_P
:
return
"top_p"
;
case
COMMON_SAMPLER_TYPE_MIN_P
:
return
"min_p"
;
case
COMMON_SAMPLER_TYPE_TEMPERATURE
:
return
"temperature"
;
case
COMMON_SAMPLER_TYPE_XTC
:
return
"xtc"
;
case
COMMON_SAMPLER_TYPE_INFILL
:
return
"infill"
;
default
:
return
""
;
}
}
std
::
vector
<
gpt_sampler_type
>
gpt_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
)
{
std
::
unordered_map
<
std
::
string
,
gpt_sampler_type
>
sampler_canonical_name_map
{
{
"top_k"
,
GPT_SAMPLER_TYPE_TOP_K
},
{
"top_p"
,
GPT_SAMPLER_TYPE_TOP_P
},
{
"typ_p"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"min_p"
,
GPT_SAMPLER_TYPE_MIN_P
},
{
"tfs_z"
,
GPT_SAMPLER_TYPE_TFS_Z
},
{
"temperature"
,
GPT_SAMPLER_TYPE_TEMPERATURE
},
std
::
vector
<
common_sampler_type
>
common_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
)
{
std
::
unordered_map
<
std
::
string
,
common_sampler_type
>
sampler_canonical_name_map
{
{
"dry"
,
COMMON_SAMPLER_TYPE_DRY
},
{
"top_k"
,
COMMON_SAMPLER_TYPE_TOP_K
},
{
"top_p"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"typ_p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"min_p"
,
COMMON_SAMPLER_TYPE_MIN_P
},
{
"temperature"
,
COMMON_SAMPLER_TYPE_TEMPERATURE
},
{
"xtc"
,
COMMON_SAMPLER_TYPE_XTC
},
{
"infill"
,
COMMON_SAMPLER_TYPE_INFILL
},
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std
::
unordered_map
<
std
::
string
,
gpt_sampler_type
>
sampler_alt_name_map
{
{
"top-k"
,
GPT_SAMPLER_TYPE_TOP_K
},
{
"top-p"
,
GPT_SAMPLER_TYPE_TOP_P
},
{
"nucleus"
,
GPT_SAMPLER_TYPE_TOP_P
},
{
"typical-p"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"typical"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"typ-p"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"typ"
,
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
"min-p"
,
GPT_SAMPLER_TYPE_MIN_P
},
{
"tfs-z"
,
GPT_SAMPLER_TYPE_TFS_Z
},
{
"tfs"
,
GPT_SAMPLER_TYPE_TFS_Z
},
{
"temp"
,
GPT_SAMPLER_TYPE_TEMPERATURE
},
std
::
unordered_map
<
std
::
string
,
common_sampler_type
>
sampler_alt_name_map
{
{
"top-k"
,
COMMON_SAMPLER_TYPE_TOP_K
},
{
"top-p"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"nucleus"
,
COMMON_SAMPLER_TYPE_TOP_P
},
{
"typical-p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typical"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typ-p"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"typ"
,
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
"min-p"
,
COMMON_SAMPLER_TYPE_MIN_P
},
{
"temp"
,
COMMON_SAMPLER_TYPE_TEMPERATURE
},
};
std
::
vector
<
gpt
_sampler_type
>
samplers
;
std
::
vector
<
common
_sampler_type
>
samplers
;
samplers
.
reserve
(
names
.
size
());
for
(
const
auto
&
name
:
names
)
{
...
...
@@ -460,17 +505,19 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
return
samplers
;
}
std
::
vector
<
gpt_sampler_type
>
gpt_sampler_types_from_chars
(
const
std
::
string
&
chars
)
{
std
::
unordered_map
<
char
,
gpt_sampler_type
>
sampler_name_map
=
{
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TOP_K
),
GPT_SAMPLER_TYPE_TOP_K
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TFS_Z
),
GPT_SAMPLER_TYPE_TFS_Z
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TYPICAL_P
),
GPT_SAMPLER_TYPE_TYPICAL_P
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TOP_P
),
GPT_SAMPLER_TYPE_TOP_P
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_MIN_P
),
GPT_SAMPLER_TYPE_MIN_P
},
{
gpt_sampler_type_to_chr
(
GPT_SAMPLER_TYPE_TEMPERATURE
),
GPT_SAMPLER_TYPE_TEMPERATURE
}
std
::
vector
<
common_sampler_type
>
common_sampler_types_from_chars
(
const
std
::
string
&
chars
)
{
std
::
unordered_map
<
char
,
common_sampler_type
>
sampler_name_map
=
{
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_DRY
),
COMMON_SAMPLER_TYPE_DRY
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_K
),
COMMON_SAMPLER_TYPE_TOP_K
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TYPICAL_P
),
COMMON_SAMPLER_TYPE_TYPICAL_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TOP_P
),
COMMON_SAMPLER_TYPE_TOP_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_MIN_P
),
COMMON_SAMPLER_TYPE_MIN_P
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_TEMPERATURE
),
COMMON_SAMPLER_TYPE_TEMPERATURE
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_XTC
),
COMMON_SAMPLER_TYPE_XTC
},
{
common_sampler_type_to_chr
(
COMMON_SAMPLER_TYPE_INFILL
),
COMMON_SAMPLER_TYPE_INFILL
},
};
std
::
vector
<
gpt
_sampler_type
>
samplers
;
std
::
vector
<
common
_sampler_type
>
samplers
;
samplers
.
reserve
(
chars
.
size
());
for
(
const
auto
&
c
:
chars
)
{
...
...
llama/sampling.h
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
...
...
@@ -33,7 +33,7 @@
#include <string>
#include <vector>
// gpt_sampler extends llama_sampler with additional functionality:
// common_sampler extends llama_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters
...
...
@@ -49,30 +49,30 @@
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//

struct gpt_sampler;
struct common_sampler;
// llama_sampler API overloads
struct
gpt
_sampler
*
gpt
_sampler_init
(
const
struct
llama_model
*
model
,
const
struct
gpt_sampler_params
&
params
);
struct
common
_sampler
*
common
_sampler_init
(
const
struct
llama_model
*
model
,
const
struct
common_params_sampling
&
params
);
void
gpt
_sampler_free
(
struct
gpt
_sampler
*
gsmpl
);
void
common
_sampler_free
(
struct
common
_sampler
*
gsmpl
);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void
gpt
_sampler_accept
(
struct
gpt
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
);
void
gpt
_sampler_reset
(
struct
gpt
_sampler
*
gsmpl
);
struct
gpt
_sampler
*
gpt
_sampler_clone
(
struct
gpt
_sampler
*
gsmpl
);
void
common
_sampler_accept
(
struct
common
_sampler
*
gsmpl
,
llama_token
token
,
bool
accept_grammar
);
void
common
_sampler_reset
(
struct
common
_sampler
*
gsmpl
);
struct
common
_sampler
*
common
_sampler_clone
(
struct
common
_sampler
*
gsmpl
);
// arguments can be nullptr to skip printing
void
gpt
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
gpt
_sampler
*
gsmpl
);
void
common
_perf_print
(
const
struct
llama_context
*
ctx
,
const
struct
common
_sampler
*
gsmpl
);
// extended sampling implementation:
//
...
...
@@ -84,26 +84,47 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token
gpt
_sampler_sample
(
struct
gpt
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
=
false
);
llama_token
common
_sampler_sample
(
struct
common
_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
int
idx
,
bool
grammar_first
=
false
);
uint32_t
gpt_sampler_get_seed
(
const
struct
gpt_sampler
*
gsmpl
);
// generalized version of common_sampler_sample
//
// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
//
// common_sampler_sample_n(gsmpl, ctx, { idx }, {});
//
// is equivalent to
//
// common_sampler_sample(gsmpl, ctx, idx);
// common_sampler_accept(gsmpl, token, true);
//
// requires: idxs.size() == draft.size() + 1
//
// returns at least 1 token, up to idxs.size()
//
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
std
::
vector
<
int
>
&
idxs
,
const
llama_tokens
&
draft
,
bool
grammar_first
=
false
);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std
::
vector
<
llama_token
>
common_sampler_sample_and_accept_n
(
struct
common_sampler
*
gsmpl
,
struct
llama_context
*
ctx
,
const
llama_tokens
&
draft
,
bool
grammar_first
=
false
);
uint32_t
common_sampler_get_seed
(
const
struct
common_sampler
*
gsmpl
);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array
*
gpt
_sampler_get_candidates
(
struct
gpt
_sampler
*
gsmpl
);
llama_token_data_array
*
common
_sampler_get_candidates
(
struct
common
_sampler
*
gsmpl
);
// get the last accepted token
llama_token
gpt
_sampler_last
(
const
struct
gpt
_sampler
*
gsmpl
);
llama_token
common
_sampler_last
(
const
struct
common
_sampler
*
gsmpl
);
// print the sampler chain into a string
std
::
string
gpt
_sampler_print
(
const
struct
gpt
_sampler
*
gsmpl
);
std
::
string
common
_sampler_print
(
const
struct
common
_sampler
*
gsmpl
);
// get a string representation of the last accepted tokens
std
::
string
gpt
_sampler_prev_str
(
gpt
_sampler
*
gsmpl
,
llama_context
*
ctx
,
int
n
);
std
::
string
common
_sampler_prev_str
(
common
_sampler
*
gsmpl
,
llama_context
*
ctx
,
int
n
);
char
gpt
_sampler_type_to_chr
(
enum
gpt
_sampler_type
cnstr
);
std
::
string
gpt
_sampler_type_to_str
(
enum
gpt
_sampler_type
cnstr
);
char
common
_sampler_type_to_chr
(
enum
common
_sampler_type
cnstr
);
std
::
string
common
_sampler_type_to_str
(
enum
common
_sampler_type
cnstr
);
std
::
vector
<
enum
gpt
_sampler_type
>
gpt
_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
);
std
::
vector
<
enum
gpt
_sampler_type
>
gpt
_sampler_types_from_chars
(
const
std
::
string
&
chars
);
std
::
vector
<
enum
common
_sampler_type
>
common
_sampler_types_from_names
(
const
std
::
vector
<
std
::
string
>
&
names
,
bool
allow_alt_names
);
std
::
vector
<
enum
common
_sampler_type
>
common
_sampler_types_from_chars
(
const
std
::
string
&
chars
);
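Outside the vendored header, a minimal usage sketch of this API (with sampling.h and llama.h included). The model/context variables are assumed to exist already, and the default-constructed parameters stand in for whatever the caller would normally configure:

// Sketch only: assumes `model` (llama_model *) and `ctx` (llama_context *) are loaded
// and that a batch has been decoded so logits are available.
static llama_token sample_one(llama_model * model, llama_context * ctx) {
    common_params_sampling sparams;   // defaults: temp, top_k, top_p, ...
    common_sampler * smpl = common_sampler_init(model, sparams);

    // plain path: sample one token from the last output, then accept it
    llama_token id = common_sampler_sample(smpl, ctx, /*idx=*/ -1);
    common_sampler_accept(smpl, id, /*accept_grammar=*/ true);

    // speculative path: verify a batch of draft tokens in one call (see the comments above)
    llama_tokens draft = { /* tokens proposed by a draft model */ };
    std::vector<llama_token> accepted = common_sampler_sample_and_accept_n(smpl, ctx, draft);

    common_sampler_free(smpl);
    return id;
}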
llama/sampling_ext.cpp
...
...
@@ -3,16 +3,12 @@
#include "sampling_ext.h"
#include "json-schema-to-grammar.h"
struct
gpt_sampler
*
gpt_sampler_cinit
(
const
struct
llama_model
*
model
,
struct
gpt_sampler_cparams
*
params
)
{
try
{
gpt_sampler_params
sparams
;
struct
common_sampler
*
common_sampler_cinit
(
const
struct
llama_model
*
model
,
struct
common_sampler_cparams
*
params
)
{
try
{
common_params_sampling
sparams
;
sparams
.
top_k
=
params
->
top_k
;
sparams
.
top_p
=
params
->
top_p
;
sparams
.
min_p
=
params
->
min_p
;
sparams
.
tfs_z
=
params
->
tfs_z
;
sparams
.
typ_p
=
params
->
typical_p
;
sparams
.
temp
=
params
->
temp
;
sparams
.
penalty_last_n
=
params
->
penalty_last_n
;
...
...
@@ -25,38 +21,28 @@ struct gpt_sampler *gpt_sampler_cinit(
sparams
.
penalize_nl
=
params
->
penalize_nl
;
sparams
.
seed
=
params
->
seed
;
sparams
.
grammar
=
params
->
grammar
;
return
gpt_sampler_init
(
model
,
sparams
)
;
}
catch
(
const
std
::
exception
&
err
)
{
sparams
.
xtc_probability
=
0.0
;
sparams
.
xtc_threshold
=
0.5
;
return
common_sampler_init
(
model
,
sparams
);
}
catch
(
const
std
::
exception
&
err
)
{
return
nullptr
;
}
}
void
gpt_sampler_cfree
(
struct
gpt_sampler
*
sampler
)
{
gpt_sampler_free
(
sampler
);
void
common_sampler_cfree
(
struct
common_sampler
*
sampler
)
{
common_sampler_free
(
sampler
);
}
void
gpt_sampler_creset
(
struct
gpt_sampler
*
sampler
)
{
gpt_sampler_reset
(
sampler
);
void
common_sampler_creset
(
struct
common_sampler
*
sampler
)
{
common_sampler_reset
(
sampler
);
}
llama_token
gpt_sampler_csample
(
struct
gpt_sampler
*
sampler
,
struct
llama_context
*
ctx_main
,
int
idx
)
{
return
gpt_sampler_sample
(
sampler
,
ctx_main
,
idx
);
void
common_sampler_caccept
(
struct
common_sampler
*
sampler
,
llama_token
id
,
bool
apply_grammar
)
{
common_sampler_accept
(
sampler
,
id
,
apply_grammar
);
}
void
gpt_sampler_caccept
(
struct
gpt_sampler
*
sampler
,
llama_token
id
,
bool
apply_grammar
)
{
gpt_sampler_accept
(
sampler
,
id
,
apply_grammar
);
llama_token
common_sampler_csample
(
struct
common_sampler
*
sampler
,
struct
llama_context
*
ctx
,
int
idx
)
{
return
common_sampler_sample
(
sampler
,
ctx
,
idx
);
}
int
schema_to_grammar
(
const
char
*
json_schema
,
char
*
grammar
,
size_t
max_len
)
...
...
llama/sampling_ext.h
// TODO: this is a temporary wrapper to allow calling C++ code from CGo
#ifndef GPT_SAMPLER_EXT_H
#define GPT_SAMPLER_EXT_H
#ifndef SAMPLING_EXT_H
#define SAMPLING_EXT_H

#ifdef __cplusplus
extern "C"
...
...
@@ -9,14 +9,11 @@ extern "C"
// Forward declaration to avoid include of "sampling.h" which has c++
// includes
    struct gpt_sampler;
    struct gpt_sampler_cparams {
    struct common_sampler;
    struct common_sampler_cparams {
        int32_t top_k;
        float top_p;
        float min_p;
        float tfs_z;
        float typical_p;
        float temp;
        int32_t penalty_last_n;
...
...
@@ -31,21 +28,11 @@ extern "C"
        char *grammar;
    };
    struct gpt_sampler *gpt_sampler_cinit(const struct llama_model *model, struct gpt_sampler_cparams *params);
    void gpt_sampler_cfree(struct gpt_sampler *sampler);
    void gpt_sampler_creset(struct gpt_sampler *sampler);
    llama_token gpt_sampler_csample(struct gpt_sampler *sampler, struct llama_context *ctx_main, int idx);
    void gpt_sampler_caccept(struct gpt_sampler *sampler, llama_token id, bool apply_grammar);
    struct common_sampler *common_sampler_cinit(const struct llama_model *model, struct common_sampler_cparams *params);
    void common_sampler_cfree(struct common_sampler *sampler);
    void common_sampler_creset(struct common_sampler *sampler);
    void common_sampler_caccept(struct common_sampler *sampler, llama_token id, bool apply_grammar);
    llama_token common_sampler_csample(struct common_sampler *sampler, struct llama_context *ctx, int idx);

    int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);
...
...
@@ -53,4 +40,4 @@ extern "C"
}
#endif
#endif // GPT_SAMPLER_EXT_H
#endif // SAMPLING_EXT_H
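For completeness, a sketch of driving this CGo-facing wrapper from the C/C++ side with placeholder parameter values. Ollama's Go runner reaches the same functions through cgo; only the fields shown are set here and the rest stay zero-initialized:

// Sketch only: `model` and `ctx` are assumed to be a loaded llama_model * / llama_context *.
static llama_token csample_once(const llama_model * model, llama_context * ctx) {
    static char no_grammar[] = "";

    struct common_sampler_cparams cparams = {};
    cparams.top_k          = 40;      // placeholder values
    cparams.top_p          = 0.9f;
    cparams.temp           = 0.8f;
    cparams.penalty_last_n = 64;
    cparams.seed           = 42;
    cparams.grammar        = no_grammar;

    struct common_sampler * s = common_sampler_cinit(model, &cparams);
    llama_token tok = common_sampler_csample(s, ctx, /*idx=*/ -1);
    common_sampler_caccept(s, tok, /*apply_grammar=*/ true);
    common_sampler_cfree(s);
    return tok;
}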
llama/sgemm.cpp
...
...
@@ -106,6 +106,10 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if defined(__MMA__)
typedef vector unsigned char vec_t;
typedef __vector_quad acc_t;
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD
...
...
@@ -942,6 +946,36 @@ class tinyBLAS_Q0_AVX {
return
_mm_sub_epi8
(
_mm_and_si128
(
_mm_set1_epi8
(
15
),
_mm_srli_epi16
(
x
,
4
)),
_mm_set1_epi8
(
8
));
}
inline
__m256i
load
(
const
block_q5_0
*
b
)
{
return
_mm256_or_si256
(
denibble
(
b
->
qs
),
bittobyte
(
b
->
qh
));
}
inline
__m128i
load0
(
const
block_q5_0
*
b
)
{
const
__m128i
x
=
_mm_loadu_si128
((
const
__m128i
*
)(
b
->
qs
));
uint32_t
x32
;
memcpy
(
&
x32
,
b
->
qh
,
sizeof
(
uint32_t
));
__m128i
qxl
=
_mm_and_si128
(
_mm_set1_epi8
(
15
),
x
);
__m128i
bytesl
=
_mm_cmpeq_epi8
(
_mm_set1_epi64x
(
-
1
),
_mm_or_si128
(
_mm_set1_epi64x
(
0x7fbfdfeff7fbfdfe
),
_mm_shuffle_epi8
(
_mm_set1_epi32
(
x32
),
_mm_set_epi64x
(
0x0101010101010101
,
0x0000000000000000
))));
bytesl
=
_mm_andnot_si128
(
bytesl
,
_mm_set1_epi8
((
char
)
0xF0
));
return
_mm_or_si128
(
qxl
,
bytesl
);
}
inline
__m128i
load1
(
const
block_q5_0
*
b
)
{
const
__m128i
x
=
_mm_loadu_si128
((
const
__m128i
*
)(
b
->
qs
));
uint32_t
x32
;
memcpy
(
&
x32
,
b
->
qh
,
sizeof
(
uint32_t
));
__m128i
qxh
=
_mm_and_si128
(
_mm_set1_epi8
(
15
),
_mm_srli_epi16
(
x
,
4
));
__m128i
bytesh
=
_mm_cmpeq_epi8
(
_mm_set1_epi64x
(
-
1
),
_mm_or_si128
(
_mm_set1_epi64x
(
0x7fbfdfeff7fbfdfe
),
_mm_shuffle_epi8
(
_mm_set1_epi32
(
x32
),
_mm_set_epi64x
(
0x0303030303030303
,
0x0202020202020202
))));
bytesh
=
_mm_andnot_si128
(
bytesh
,
_mm_set1_epi8
((
char
)
0xF0
));
return
_mm_or_si128
(
qxh
,
bytesh
);
}
inline
__m256i
load
(
const
block_iq4_nl
*
b
)
{
return
MM256_SET_M128I
(
load1
(
b
),
load0
(
b
));
}
...
...
@@ -973,6 +1007,17 @@ class tinyBLAS_Q0_AVX {
_mm_srli_epi16
(
x
,
4
),
1
));
}
static
inline
__m256i
bittobyte
(
const
uint8_t
*
p
)
{
uint32_t
x32
;
memcpy
(
&
x32
,
p
,
sizeof
(
uint32_t
));
__m256i
bytes
=
_mm256_cmpeq_epi8
(
_mm256_set1_epi64x
(
-
1
),
_mm256_or_si256
(
_mm256_set1_epi64x
(
0x7fbfdfeff7fbfdfe
),
_mm256_shuffle_epi8
(
_mm256_set1_epi32
(
x32
),
_mm256_set_epi64x
(
0x0303030303030303
,
0x0202020202020202
,
0x0101010101010101
,
0x0000000000000000
))));
return
_mm256_andnot_si256
(
bytes
,
_mm256_set1_epi8
((
char
)
0xF0
));
}
const
TA
*
const
A
;
const
TB
*
const
B
;
TC
*
const
C
;
...
...
@@ -985,6 +1030,600 @@ class tinyBLAS_Q0_AVX {
};
#endif // __AVX__
//PPC Implementation
#if defined(__MMA__)
#define SAVE_ACC(ACC, ii, jj) \
__builtin_mma_disassemble_acc(vec_C, ACC); \
for (int I = 0; I < 4; I++) { \
for (int J = 0; J < 4; J++) { \
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
} \
} \
template
<
typename
TA
,
typename
TB
,
typename
TC
>
class
tinyBLAS_PPC
{
public:
tinyBLAS_PPC
(
int64_t
k
,
const
TA
*
A
,
int64_t
lda
,
const
TB
*
B
,
int64_t
ldb
,
TC
*
C
,
int64_t
ldc
,
int
ith
,
int
nth
)
:
A
(
A
),
B
(
B
),
C
(
C
),
k
(
k
),
lda
(
lda
),
ldb
(
ldb
),
ldc
(
ldc
),
ith
(
ith
),
nth
(
nth
)
{
}
void
matmul
(
int64_t
m
,
int64_t
n
)
{
mnpack
(
0
,
m
,
0
,
n
);
}
private:
void
(
tinyBLAS_PPC
::*
kernel
)(
int64_t
,
int64_t
);
void
READ_BLOCK
(
const
float
*
a
,
int64_t
lda
,
int
rows
,
int
cols
,
float
*
vec
)
{
int64_t
i
,
j
;
float
*
aoffset
=
NULL
,
*
boffset
=
NULL
;
float
*
aoffset1
=
NULL
,
*
aoffset2
=
NULL
,
*
aoffset3
=
NULL
,
*
aoffset4
=
NULL
;
float
*
aoffset5
=
NULL
,
*
aoffset6
=
NULL
,
*
aoffset7
=
NULL
,
*
aoffset8
=
NULL
;
aoffset
=
const_cast
<
float
*>
(
a
);
boffset
=
vec
;
j
=
(
rows
>>
3
);
if
(
j
>
0
)
{
do
{
aoffset1
=
aoffset
;
aoffset2
=
aoffset1
+
lda
;
aoffset3
=
aoffset2
+
lda
;
aoffset4
=
aoffset3
+
lda
;
aoffset5
=
aoffset4
+
lda
;
aoffset6
=
aoffset5
+
lda
;
aoffset7
=
aoffset6
+
lda
;
aoffset8
=
aoffset7
+
lda
;
aoffset
+=
8
*
lda
;
i
=
(
cols
>>
3
);
if
(
i
>
0
)
{
__vector_pair
C1
,
C2
,
C3
,
C4
,
C5
,
C6
,
C7
,
C8
;
vector
float
c1
[
2
],
c2
[
2
],
c3
[
2
],
c4
[
2
],
c5
[
2
],
c6
[
2
],
c7
[
2
],
c8
[
2
];
vector
float
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
,
t8
;
do
{
C1
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset1
);
C2
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset2
);
C3
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset3
);
C4
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset4
);
C5
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset5
);
C6
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset6
);
C7
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset7
);
C8
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset8
);
__builtin_vsx_disassemble_pair
(
c1
,
&
C1
);
__builtin_vsx_disassemble_pair
(
c2
,
&
C2
);
__builtin_vsx_disassemble_pair
(
c3
,
&
C3
);
__builtin_vsx_disassemble_pair
(
c4
,
&
C4
);
__builtin_vsx_disassemble_pair
(
c5
,
&
C5
);
__builtin_vsx_disassemble_pair
(
c6
,
&
C6
);
__builtin_vsx_disassemble_pair
(
c7
,
&
C7
);
__builtin_vsx_disassemble_pair
(
c8
,
&
C8
);
t1
=
vec_mergeh
(
c1
[
0
],
c2
[
0
]);
t2
=
vec_mergeh
(
c3
[
0
],
c4
[
0
]);
t3
=
vec_mergeh
(
c5
[
0
],
c6
[
0
]);
t4
=
vec_mergeh
(
c7
[
0
],
c8
[
0
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
);
vec_xst
(
t6
,
0
,
boffset
+
4
);
vec_xst
(
t7
,
0
,
boffset
+
8
);
vec_xst
(
t8
,
0
,
boffset
+
12
);
t1
=
vec_mergel
(
c1
[
0
],
c2
[
0
]);
t2
=
vec_mergel
(
c3
[
0
],
c4
[
0
]);
t3
=
vec_mergel
(
c5
[
0
],
c6
[
0
]);
t4
=
vec_mergel
(
c7
[
0
],
c8
[
0
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
16
);
vec_xst
(
t6
,
0
,
boffset
+
20
);
vec_xst
(
t7
,
0
,
boffset
+
24
);
vec_xst
(
t8
,
0
,
boffset
+
28
);
t1
=
vec_mergeh
(
c1
[
1
],
c2
[
1
]);
t2
=
vec_mergeh
(
c3
[
1
],
c4
[
1
]);
t3
=
vec_mergeh
(
c5
[
1
],
c6
[
1
]);
t4
=
vec_mergeh
(
c7
[
1
],
c8
[
1
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
32
);
vec_xst
(
t6
,
0
,
boffset
+
36
);
vec_xst
(
t7
,
0
,
boffset
+
40
);
vec_xst
(
t8
,
0
,
boffset
+
44
);
t1
=
vec_mergel
(
c1
[
1
],
c2
[
1
]);
t2
=
vec_mergel
(
c3
[
1
],
c4
[
1
]);
t3
=
vec_mergel
(
c5
[
1
],
c6
[
1
]);
t4
=
vec_mergel
(
c7
[
1
],
c8
[
1
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
48
);
vec_xst
(
t6
,
0
,
boffset
+
52
);
vec_xst
(
t7
,
0
,
boffset
+
56
);
vec_xst
(
t8
,
0
,
boffset
+
60
);
aoffset1
+=
8
*
lda
;
aoffset2
+=
8
*
lda
;
aoffset3
+=
8
*
lda
;
aoffset4
+=
8
*
lda
;
boffset
+=
64
;
i
--
;
}
while
(
i
>
0
);
}
if
(
cols
&
4
)
{
vector
float
c1
,
c2
,
c3
,
c4
,
c5
,
c6
,
c7
,
c8
;
vector
float
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
,
t8
;
c1
=
vec_xl
(
0
,
aoffset1
);
c2
=
vec_xl
(
0
,
aoffset2
);
c3
=
vec_xl
(
0
,
aoffset3
);
c4
=
vec_xl
(
0
,
aoffset4
);
c5
=
vec_xl
(
0
,
aoffset5
);
c6
=
vec_xl
(
0
,
aoffset6
);
c7
=
vec_xl
(
0
,
aoffset7
);
c8
=
vec_xl
(
0
,
aoffset8
);
t1
=
vec_mergeh
(
c1
,
c2
);
t2
=
vec_mergeh
(
c3
,
c4
);
t3
=
vec_mergeh
(
c5
,
c6
);
t4
=
vec_mergeh
(
c7
,
c8
);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
);
vec_xst
(
t6
,
0
,
boffset
+
4
);
vec_xst
(
t7
,
0
,
boffset
+
8
);
vec_xst
(
t8
,
0
,
boffset
+
12
);
t1
=
vec_mergel
(
c1
,
c2
);
t2
=
vec_mergel
(
c3
,
c4
);
t3
=
vec_mergel
(
c5
,
c6
);
t4
=
vec_mergel
(
c7
,
c8
);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t7
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
16
);
vec_xst
(
t6
,
0
,
boffset
+
20
);
vec_xst
(
t7
,
0
,
boffset
+
24
);
vec_xst
(
t8
,
0
,
boffset
+
28
);
}
j
--
;
}
while
(
j
>
0
);
}
if
(
rows
&
4
)
{
aoffset1
=
aoffset
;
aoffset2
=
aoffset1
+
lda
;
aoffset3
=
aoffset2
+
lda
;
aoffset4
=
aoffset3
+
lda
;
aoffset
+=
4
*
lda
;
i
=
(
cols
>>
3
);
if
(
i
>
0
)
{
__vector_pair
C1
,
C2
,
C3
,
C4
;
vector
float
c1
[
2
],
c2
[
2
],
c3
[
2
],
c4
[
2
];
vector
float
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
,
t8
;
do
{
C1
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset1
);
C2
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset2
);
C3
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset3
);
C4
=
__builtin_vsx_lxvp
(
0
,
(
__vector_pair
*
)
aoffset4
);
__builtin_vsx_disassemble_pair
(
c1
,
&
C1
);
__builtin_vsx_disassemble_pair
(
c2
,
&
C2
);
__builtin_vsx_disassemble_pair
(
c3
,
&
C3
);
__builtin_vsx_disassemble_pair
(
c4
,
&
C4
);
t1
=
vec_mergeh
(
c1
[
0
],
c2
[
0
]);
t2
=
vec_mergeh
(
c3
[
0
],
c4
[
0
]);
t3
=
vec_mergel
(
c1
[
0
],
c2
[
0
]);
t4
=
vec_mergel
(
c3
[
0
],
c4
[
0
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t7
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
);
vec_xst
(
t6
,
0
,
boffset
+
4
);
vec_xst
(
t7
,
0
,
boffset
+
8
);
vec_xst
(
t8
,
0
,
boffset
+
12
);
t1
=
vec_mergeh
(
c1
[
1
],
c2
[
1
]);
t2
=
vec_mergeh
(
c3
[
1
],
c4
[
1
]);
t3
=
vec_mergel
(
c1
[
1
],
c2
[
1
]);
t4
=
vec_mergel
(
c3
[
1
],
c4
[
1
]);
t5
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t6
=
vec_xxpermdi
(
t1
,
t2
,
3
);
t7
=
vec_xxpermdi
(
t3
,
t4
,
0
);
t8
=
vec_xxpermdi
(
t3
,
t4
,
3
);
vec_xst
(
t5
,
0
,
boffset
+
16
);
vec_xst
(
t6
,
0
,
boffset
+
20
);
vec_xst
(
t7
,
0
,
boffset
+
24
);
vec_xst
(
t8
,
0
,
boffset
+
28
);
aoffset1
+=
8
*
lda
;
aoffset2
+=
8
*
lda
;
aoffset3
+=
8
*
lda
;
aoffset4
+=
8
*
lda
;
boffset
+=
32
;
i
--
;
}
while
(
i
>
0
);
}
if
(
cols
&
4
)
{
vector
float
c1
,
c2
,
c3
,
c4
;
vector
float
t1
,
t2
,
t3
,
t4
;
c1
=
vec_xl
(
0
,
aoffset1
);
c2
=
vec_xl
(
0
,
aoffset2
);
c3
=
vec_xl
(
0
,
aoffset3
);
c4
=
vec_xl
(
0
,
aoffset4
);
t1
=
vec_mergeh
(
c1
,
c2
);
t2
=
vec_mergeh
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
);
vec_xst
(
t4
,
0
,
boffset
+
4
);
t1
=
vec_mergel
(
c1
,
c2
);
t2
=
vec_mergel
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
+
8
);
vec_xst
(
t4
,
0
,
boffset
+
12
);
}
}
if
(
rows
&
3
)
{
aoffset1
=
aoffset
;
aoffset2
=
aoffset1
+
lda
;
aoffset3
=
aoffset2
+
lda
;
if
(
cols
&
4
)
{
vector
float
c1
,
c2
,
c3
,
c4
=
{
0
};
vector
float
t1
,
t2
,
t3
,
t4
;
c1
=
vec_xl
(
0
,
aoffset1
);
c2
=
vec_xl
(
0
,
aoffset2
);
c3
=
vec_xl
(
0
,
aoffset3
);
t1
=
vec_mergeh
(
c1
,
c2
);
t2
=
vec_mergeh
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
);
vec_xst
(
t4
,
0
,
boffset
+
4
);
t1
=
vec_mergel
(
c1
,
c2
);
t2
=
vec_mergel
(
c3
,
c4
);
t3
=
vec_xxpermdi
(
t1
,
t2
,
0
);
t4
=
vec_xxpermdi
(
t1
,
t2
,
3
);
vec_xst
(
t3
,
0
,
boffset
+
8
);
vec_xst
(
t4
,
0
,
boffset
+
12
);
}
}
}
void
KERNEL_4x4
(
int64_t
ii
,
int64_t
jj
)
{
vec_t
vec_A
[
4
],
vec_B
[
4
],
vec_C
[
4
];
acc_t
acc_0
;
__builtin_mma_xxsetaccz
(
&
acc_0
);
for
(
int
l
=
0
;
l
<
k
;
l
+=
4
)
{
READ_BLOCK
(
A
+
(
ii
*
lda
)
+
l
,
lda
,
4
,
4
,
(
float
*
)
vec_A
);
READ_BLOCK
(
B
+
(
jj
*
ldb
)
+
l
,
ldb
,
4
,
4
,
(
float
*
)
vec_B
);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
0
],
vec_B
[
0
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
1
],
vec_B
[
1
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
2
],
vec_B
[
2
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
3
],
vec_B
[
3
]);
}
SAVE_ACC
(
&
acc_0
,
ii
,
jj
);
}
void
KERNEL_4x8
(
int64_t
ii
,
int64_t
jj
)
{
vec_t
vec_A
[
4
],
vec_B
[
8
],
vec_C
[
4
];
acc_t
acc_0
,
acc_1
;
__builtin_mma_xxsetaccz
(
&
acc_0
);
__builtin_mma_xxsetaccz
(
&
acc_1
);
for
(
int64_t
l
=
0
;
l
<
k
;
l
+=
4
)
{
READ_BLOCK
(
A
+
(
ii
*
lda
)
+
l
,
lda
,
4
,
4
,
(
float
*
)
vec_A
);
READ_BLOCK
(
B
+
(
jj
*
ldb
)
+
l
,
ldb
,
8
,
4
,
(
float
*
)
vec_B
);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
0
],
(
vec_t
)
vec_B
[
0
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
vec_A
[
0
],
(
vec_t
)
vec_B
[
1
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
1
],
(
vec_t
)
vec_B
[
2
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
vec_A
[
1
],
(
vec_t
)
vec_B
[
3
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
2
],
(
vec_t
)
vec_B
[
4
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
vec_A
[
2
],
(
vec_t
)
vec_B
[
5
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
vec_A
[
3
],
(
vec_t
)
vec_B
[
6
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
vec_A
[
3
],
(
vec_t
)
vec_B
[
7
]);
}
SAVE_ACC
(
&
acc_0
,
ii
,
jj
);
SAVE_ACC
(
&
acc_1
,
ii
,
jj
+
4
);
}
void
KERNEL_8x4
(
int64_t
ii
,
int64_t
jj
)
{
vec_t
vec_A
[
8
],
vec_B
[
4
],
vec_C
[
4
];
acc_t
acc_0
,
acc_1
;
__builtin_mma_xxsetaccz
(
&
acc_0
);
__builtin_mma_xxsetaccz
(
&
acc_1
);
for
(
int64_t
l
=
0
;
l
<
k
;
l
+=
4
)
{
READ_BLOCK
(
A
+
(
ii
*
lda
)
+
l
,
lda
,
8
,
4
,
(
float
*
)
vec_A
);
READ_BLOCK
(
B
+
(
jj
*
ldb
)
+
l
,
ldb
,
4
,
4
,
(
float
*
)
vec_B
);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
(
vec_t
)
vec_A
[
0
],
vec_B
[
0
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
(
vec_t
)
vec_A
[
1
],
vec_B
[
0
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
(
vec_t
)
vec_A
[
2
],
vec_B
[
1
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
(
vec_t
)
vec_A
[
3
],
vec_B
[
1
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
(
vec_t
)
vec_A
[
4
],
vec_B
[
2
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
(
vec_t
)
vec_A
[
5
],
vec_B
[
2
]);
__builtin_mma_xvf32gerpp
(
&
acc_0
,
(
vec_t
)
vec_A
[
6
],
vec_B
[
3
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
(
vec_t
)
vec_A
[
7
],
vec_B
[
3
]);
}
SAVE_ACC
(
&
acc_0
,
ii
,
jj
);
SAVE_ACC
(
&
acc_1
,
ii
+
4
,
jj
);
}
void
KERNEL_8x8
(
int64_t
ii
,
int64_t
jj
)
{
vec_t
vec_A
[
16
],
vec_B
[
16
],
vec_C
[
4
];
acc_t
acc_0
,
acc_1
,
acc_2
,
acc_3
;
__builtin_mma_xxsetaccz
(
&
acc_0
);
__builtin_mma_xxsetaccz
(
&
acc_1
);
__builtin_mma_xxsetaccz
(
&
acc_2
);
__builtin_mma_xxsetaccz
(
&
acc_3
);
for
(
int
l
=
0
;
l
<
k
;
l
+=
8
)
{
READ_BLOCK
(
A
+
(
ii
*
lda
)
+
l
,
lda
,
8
,
8
,
(
float
*
)
vec_A
);
READ_BLOCK
(
B
+
(
jj
*
ldb
)
+
l
,
ldb
,
8
,
8
,
(
float
*
)
vec_B
);
for
(
int
x
=
0
;
x
<
16
;
x
+=
2
)
{
__builtin_mma_xvf32gerpp
(
&
acc_0
,
(
vec_t
)
vec_A
[
x
],
vec_B
[
x
]);
__builtin_mma_xvf32gerpp
(
&
acc_1
,
(
vec_t
)
vec_A
[
x
],
vec_B
[
x
+
1
]);
__builtin_mma_xvf32gerpp
(
&
acc_2
,
(
vec_t
)
vec_A
[
x
+
1
],
vec_B
[
x
]);
__builtin_mma_xvf32gerpp
(
&
acc_3
,
(
vec_t
)
vec_A
[
x
+
1
],
vec_B
[
x
+
1
]);
}
}
SAVE_ACC
(
&
acc_0
,
ii
,
jj
);
SAVE_ACC
(
&
acc_1
,
ii
,
jj
+
4
);
SAVE_ACC
(
&
acc_2
,
ii
+
4
,
jj
);
SAVE_ACC
(
&
acc_3
,
ii
+
4
,
jj
+
4
);
}
void
mnpack
(
int64_t
m0
,
int64_t
m
,
int64_t
n0
,
int64_t
n
)
{
int64_t
mc
,
nc
,
mp
,
np
;
int
m_rem
=
MIN
(
m
-
m0
,
16
);
int
n_rem
=
MIN
(
n
-
n0
,
16
);
if
(
m_rem
>=
16
&&
n_rem
>=
8
)
{
mc
=
8
;
nc
=
8
;
gemm
<
8
,
8
>
(
m0
,
m
,
n0
,
            n);
        } else if (m_rem >= 8 && n_rem >= 16) {
            mc = 8;
            nc = 8;
            gemm<8, 8>(m0, m, n0, n);
        } else if (m_rem >= 8 && n_rem >= 8) {
            mc = 8;
            nc = 8;
            gemm<8, 8>(m0, m, n0, n);
        } else if (m_rem >= 4 && n_rem >= 8) {
            mc = 4;
            nc = 8;
            gemm<4, 8>(m0, m, n0, n);
        } else if (m_rem >= 8 && n_rem >= 4) {
            mc = 8;
            nc = 4;
            gemm<8, 4>(m0, m, n0, n);
        } else if (m_rem >= 4 && n_rem >= 4) {
            mc = 4;
            nc = 4;
            gemm<4, 4>(m0, m, n0, n);
        } else if ((m_rem < 4) && (n_rem > 4)) {
            nc = 4;
            switch (m_rem) {
                case 1:
                    mc = 1;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 2:
                    mc = 2;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 3:
                    mc = 3;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                default:
                    return;
            }
        } else if ((m_rem > 4) && (n_rem < 4)) {
            mc = 4;
            switch (n_rem) {
                case 1:
                    nc = 1;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 2:
                    nc = 2;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 3:
                    nc = 3;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                default:
                    return;
            }
        } else {
            switch ((m_rem << 4) | n_rem) {
                case 0x43:
                    mc = 4;
                    nc = 3;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x42:
                    mc = 4;
                    nc = 2;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x41:
                    mc = 4;
                    nc = 1;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x34:
                    mc = 3;
                    nc = 4;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x33:
                    mc = 3;
                    nc = 3;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x32:
                    mc = 3;
                    nc = 2;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x31:
                    mc = 3;
                    nc = 1;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x24:
                    mc = 2;
                    nc = 4;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x23:
                    mc = 2;
                    nc = 3;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x22:
                    mc = 2;
                    nc = 2;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x21:
                    mc = 2;
                    nc = 1;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x14:
                    mc = 1;
                    nc = 4;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x13:
                    mc = 1;
                    nc = 3;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x12:
                    mc = 1;
                    nc = 2;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                case 0x11:
                    mc = 1;
                    nc = 1;
                    gemm_small(m0, m, n0, n, mc, nc);
                    break;
                default:
                    return;
            }
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            vec_t vec_C[4];
            acc_t acc_0;
            __builtin_mma_xxsetaccz(&acc_0);
            vec_t vec_A[4], vec_B[4];
            for (int l = 0; l < k; l += 4) {
                if (RN >= 4 && RM == 1) {
                    float* a = const_cast<float*>(A + (ii)*lda + l);
                    READ_BLOCK(B + (jj*ldb) + l, ldb, 4, 4, (float*)vec_B);
                    vec_A[0] = (vec_t)vec_xl(0, a);
                    vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A + 1));
                    vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A + 2));
                    vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A + 3));
                } else {
                    READ_BLOCK(A + (ii*lda) + l, lda, RM, 4, (float*)vec_A);
                    READ_BLOCK(B + (jj*ldb) + l, ldb, RN, 4, (float*)vec_B);
                }
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
            }
            __builtin_mma_disassemble_acc(vec_C, &acc_0);
            for (int I = 0; I < RM; I++) {
                for (int J = 0; J < RN; J++) {
                    *((float*)(C + ii + ((jj + J)*ldc) + I)) = *((float*)&vec_C[I] + J);
                }
            }
        }
    }

    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (RM == 4 && RN == 4) {
            kernel = &tinyBLAS_PPC::KERNEL_4x4;
        } else if (RM == 4 && RN == 8) {
            kernel = &tinyBLAS_PPC::KERNEL_4x8;
        } else if (RM == 8 && RN == 4) {
            kernel = &tinyBLAS_PPC::KERNEL_8x4;
        } else if (RM == 8 && RN == 8) {
            kernel = &tinyBLAS_PPC::KERNEL_8x8;
        }
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            (this->*kernel)(ii, jj);
        }
    }

    const TA *const A;
    const TB *const B;
    TC *C;
    TA *At;
    TB *Bt;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};
#endif
} // namespace
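Note (not part of the vendored file): gemm() and gemm_small() above share the same work-splitting arithmetic, so a brief standalone sketch may help when reading them. It replays that arithmetic on made-up sizes (m, n, RM, RN, and nth are arbitrary here) and prints which output tiles each thread would own; in the real code the ragged edges that do not fill a whole RM x RN tile are handed off to the recursive mnpack() calls instead.

// Standalone sketch of the tile/job partitioning used by gemm() and gemm_small().
// All sizes are hypothetical; only the arithmetic mirrors the code above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t m = 37, n = 53;   // output matrix is m x n (made-up sizes)
    const int     RM = 4, RN = 4;   // tile shape handled by one kernel call
    const int     nth = 4;          // number of worker threads

    const int64_t ytiles = m / RM;                   // whole tiles along rows
    const int64_t xtiles = n / RN;                   // whole tiles along columns
    const int64_t tiles  = xtiles * ytiles;          // total jobs to distribute
    const int64_t duty   = (tiles + nth - 1) / nth;  // jobs per thread, rounded up

    for (int ith = 0; ith < nth; ++ith) {
        int64_t start = duty * ith;
        int64_t end   = start + duty;
        if (end > tiles) end = tiles;
        // Each job index maps back to the top-left corner (ii, jj) of its tile,
        // exactly as in the loops above (only the first two jobs are printed).
        for (int64_t job = start; job < end && job < start + 2; ++job) {
            int64_t ii = job / xtiles * RM;
            int64_t jj = job % xtiles * RN;
            std::printf("thread %d: job %lld -> tile (%lld, %lld)\n",
                        ith, (long long) job, (long long) ii, (long long) jj);
        }
    }
    return 0;
}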
/**
...
@@ -1073,6 +1712,16 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             ith, nth};
         tb.matmul(m, n);
         return true;
+#elif defined(__MMA__)
+        if (k % 8)
+            return false;
+        tinyBLAS_PPC<float, float, float> tb{
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
 #else
         return false;
 #endif
...
@@ -1182,6 +1831,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
 #endif
     }
 
+    case GGML_TYPE_Q5_0: {
+        if (Btype != GGML_TYPE_Q8_0)
+            return false;
+#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+        tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
+            k,
+            (const block_q5_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            ith, nth};
+        tb.matmul(m, n);
+        return true;
+#else
+        return false;
+#endif
+    }
+
     case GGML_TYPE_IQ4_NL: {
         if (Btype != GGML_TYPE_Q8_0)
             return false;
...
llama/unicode-data.cpp View file @ 527cc978
/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
@@ -33,7 +33,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
-const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
+const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
 {0x000000, 0x0080},
 {0x000020, 0x0008},
 {0x000021, 0x0020},
...
@@ -2337,7 +2337,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };
 
-const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
 {0x000043, 0x000063},
...
@@ -3773,7 +3774,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
 {0x01E921, 0x01E943},
 };
 
-const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
 {0x000063, 0x000043},
...
@@ -5226,7 +5228,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
 {0x01E943, 0x01E921},
 };
 
-const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
+const std::initializer_list<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
 {0x000000, 0x000000, 0x000000},
 {0x0000C0, 0x0000C5, 0x000041},
 {0x0000C7, 0x0000C7, 0x000043},
...
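Note on the change above (my illustration, not code from this commit): the tables move from std::unordered_map to std::initializer_list, and the new comment promises ascending order so that consumers can binary-search the static data instead of hashing it. A minimal sketch of such a lookup, assuming a tiny three-entry table and a helper name of my own choosing:

// Hypothetical binary-search lookup over an ascending initializer_list table.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <utility>

static const std::initializer_list<std::pair<uint32_t, uint32_t>> map_lowercase = {
    {0x000041, 0x000061},  // 'A' -> 'a'
    {0x000042, 0x000062},  // 'B' -> 'b'
    {0x000043, 0x000063},  // 'C' -> 'c'
};

// Returns the lowercase mapping for cpt, or cpt itself if no entry exists.
static uint32_t to_lower(uint32_t cpt) {
    auto it = std::lower_bound(map_lowercase.begin(), map_lowercase.end(),
                               std::make_pair(cpt, uint32_t{0}),
                               [](const std::pair<uint32_t, uint32_t> & a,
                                  const std::pair<uint32_t, uint32_t> & b) {
                                   return a.first < b.first;
                               });
    if (it != map_lowercase.end() && it->first == cpt) {
        return it->second;
    }
    return cpt;
}

int main() {
    std::printf("U+%04X -> U+%04X\n", 0x41u, to_lower(0x41));  // 'A' maps to 'a'
    std::printf("U+%04X -> U+%04X\n", 0x31u, to_lower(0x31));  // '1' is unchanged
    return 0;
}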
llama/unicode-data.h View file @ 527cc978
/**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
  *
  * MIT License
  *
...
@@ -39,8 +39,8 @@ struct range_nfd {
 
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
-extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
-extern const std::vector<range_nfd> unicode_ranges_nfd;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
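Note (illustrative, not from the repository): because the header now exposes plain std::initializer_list tables, a consumer that still wants hashed lookups can materialize an std::unordered_map from them once at startup; the stand-in table and helper below are mine, shaped like the extern declarations above.

// Hypothetical consumer: build a hashed view of an initializer_list table on first use.
#include <cstdint>
#include <initializer_list>
#include <unordered_map>
#include <utility>

// Stand-in with the same type as the extern unicode_map_uppercase (two entries only).
static const std::initializer_list<std::pair<uint32_t, uint32_t>> uppercase_table = {
    {0x000061, 0x000041},  // 'a' -> 'A'
    {0x000062, 0x000042},  // 'b' -> 'B'
};

// Built lazily on first use; the initializer_list itself stays in static storage.
static const std::unordered_map<uint32_t, uint32_t> & uppercase_hash() {
    static const std::unordered_map<uint32_t, uint32_t> map(
        uppercase_table.begin(), uppercase_table.end());
    return map;
}

int main() {
    return uppercase_hash().count(0x61) == 1 ? 0 : 1;
}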