OpenDAS / ollama · Commits · 68ee42f9

Commit 68ee42f9 (unverified), authored Jul 29, 2024 by Jeffrey Morgan, committed via GitHub on Jul 29, 2024

update llama.cpp submodule to `6eeaeba1` (#6039)
parent f26aef9a

Showing 5 changed files with 8 additions and 89 deletions (+8 -89)
llm/ext_server/server.cpp                  +0 -9
llm/llama.cpp                              +1 -1
llm/patches/05-default-pretokenizer.diff   +5 -5
llm/patches/09-lora.diff                   +2 -4
llm/patches/10-llama3-rope.diff            +0 -70
llm/ext_server/server.cpp

...
@@ -2438,15 +2438,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
             params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
         }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        }
         else if (arg == "-v" || arg == "--verbose")
         {
             server_verbose = true;
...
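For reference, the surviving `--lora` branch just above this hunk follows the parser's standard pattern for flags that take a value: pre-increment `i`, and treat running past `argc` as an invalid parameter. A minimal standalone sketch of that pattern (the struct and function here are illustrative stand-ins, not the real `server.cpp` definitions):

```cpp
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Illustrative stand-in for the relevant fields of the params struct.
struct params_sketch {
    std::vector<std::pair<std::string, float>> lora_adapter;
    bool use_mmap = true;
};

static bool parse_args_sketch(int argc, char ** argv, params_sketch & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--lora") {
            if (++i >= argc) {  // value missing: same guard as in the diff
                fprintf(stderr, "missing value for %s\n", arg.c_str());
                return false;
            }
            // scale defaults to 1.0f; mmap is disabled once adapters are in play
            params.lora_adapter.emplace_back(argv[i], 1.0f);
            params.use_mmap = false;
        }
    }
    return true;
}
```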
llm/llama.cpp @ 6eeaeba1 (compare d94c6e0c...6eeaeba1)

-Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa
+Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
llm/patches/05-default-pretokenizer.diff

 diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..7113ba64 100644
+index a207451f..2ddf431d 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
+@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
              vocab.tokenizer_add_space_prefix = false;
              vocab.tokenizer_clean_spaces = true;
...
@@ -20,9 +20,9 @@ index 8fe51971..7113ba64 100644
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          } else if (
              tokenizer_pre == "llama3" ||
-@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
-             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
-             vocab.tokenizer_clean_spaces = false;
+@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
+             tokenizer_pre == "codeshell") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
          } else {
 -            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
...
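Net effect of this patch: where upstream llama.cpp rejects a model with an unknown `tokenizer_pre` value by throwing, the patched build logs a warning and falls back to the default BPE pre-tokenizer so the model still loads. A standalone sketch of that control-flow change; the enum values and logging macro are stand-ins for llama.cpp's real definitions:

```cpp
#include <cstdio>
#include <string>

// Stand-ins for llama.cpp's pre-tokenizer enum and LLAMA_LOG_WARN macro.
enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3, PRE_TYPE_CODESHELL };
#define LOG_WARN(...) fprintf(stderr, __VA_ARGS__)

static pre_type pick_pre_tokenizer(const std::string & tokenizer_pre) {
    if (tokenizer_pre == "llama3")    return PRE_TYPE_LLAMA3;
    if (tokenizer_pre == "codeshell") return PRE_TYPE_CODESHELL;
    // upstream: throw std::runtime_error("unknown pre-tokenizer type: ...");
    // patched:  warn and degrade gracefully, accepting possibly imperfect
    //           tokenization instead of refusing to load the model
    LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
    return PRE_TYPE_DEFAULT;
}
```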
llm/patches/09-lora.diff

...
@@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp
 index dbb724fb..c26fe6ee 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
      for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
          const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
          float lora_scale = std::get<1>(params.lora_adapter[i]);
...
@@ -20,9 +20,7 @@ index dbb724fb..c26fe6ee 100644
 +        int err = llama_model_apply_lora_from_file(model,
 +                                                   lora_adapter.c_str(),
 +                                                   lora_scale,
-+                                                  ((i > 0) || params.lora_base.empty())
-+                                                      ? NULL
-+                                                      : params.lora_base.c_str(),
++                                                  nullptr,
 +                                                  params.n_threads);
 +        if (err != 0) {
 +            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
...
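The change drops the `lora_base` plumbing: the patched loop now always passes `nullptr` for the base-model path, applying each adapter directly against the loaded model. A sketch of the resulting call path, assuming the `llama_model_apply_lora_from_file` signature visible in the patch (model, adapter path, scale, base-model path, thread count):

```cpp
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

#include "llama.h"  // llama.cpp header at this submodule revision

// Apply every (path, scale) adapter pair directly to the model; with no
// lora_base, path_base_model is always nullptr, matching the patched code.
static int apply_lora_adapters(struct llama_model * model,
                               const std::vector<std::tuple<std::string, float>> & adapters,
                               int n_threads) {
    for (const auto & [path, scale] : adapters) {
        int err = llama_model_apply_lora_from_file(model,
                                                   path.c_str(),
                                                   scale,
                                                   /*path_base_model=*/nullptr,
                                                   n_threads);
        if (err != 0) {
            fprintf(stderr, "failed to apply lora adapter '%s'\n", path.c_str());
            return err;
        }
    }
    return 0;
}
```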
llm/patches/10-llama3-rope.diff (deleted, 100644 → 0)

From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 23 Jul 2024 14:33:29 -0700
Subject: [PATCH] llama 3.1 rope scaling

---
 src/llama.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8fe51971..a9969df8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2472,6 +2472,7 @@ struct llama_layer {
     // long rope factors
     struct ggml_tensor * rope_long  = nullptr;
     struct ggml_tensor * rope_short = nullptr;
+    struct ggml_tensor * rope_freqs = nullptr;

     // bitnet scale
     struct ggml_tensor * wq_scale;
@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

+        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
         if (n_expert == 0) {
             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -8620,6 +8623,10 @@ struct llm_build_context {
         // choose long/short freq factors based on the context size
         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

+        if (model.layers[il].rope_freqs != nullptr) {
+            return model.layers[il].rope_freqs;
+        }
+
         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
             return model.layers[il].rope_long;
         }
@@ -8814,6 +8821,9 @@ struct llm_build_context {
         // self-attention
         {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
+
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
             cb(Qcur, "Qcur", il);
@@ -8837,14 +8847,14 @@ struct llm_build_context {
         }

         Qcur = ggml_rope_ext(
-            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow
         );
         cb(Qcur, "Qcur", il);

         Kcur = ggml_rope_ext(
-            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow
         );
--
2.45.2
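This patch (backporting llama 3.1 rope scaling) is deleted in this commit, presumably because the bumped submodule carries equivalent support upstream. Its core is a three-way fallback when choosing RoPE frequency factors: a per-layer `rope_freqs` tensor wins if present, otherwise long or short factors are picked by context length. A standalone sketch of that selection order, with stand-in types in place of `ggml_tensor`:

```cpp
// Stand-in for the per-layer weight pointers; in llama.cpp these are
// struct ggml_tensor * fields on llama_layer.
struct rope_layer_sketch {
    const void * rope_freqs = nullptr;  // llama 3.1 per-layer frequency factors
    const void * rope_long  = nullptr;  // long-context factors
    const void * rope_short = nullptr;  // short-context factors
};

// Mirrors the patched build_rope_factors: rope_freqs takes priority, then the
// long/short choice by context size; may return nullptr (plain RoPE, e.g. llama 2).
static const void * pick_rope_factors(const rope_layer_sketch & layer,
                                      unsigned n_ctx_per_seq,
                                      unsigned n_ctx_orig) {
    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }
    if (n_ctx_per_seq > n_ctx_orig) {
        return layer.rope_long;
    }
    return layer.rope_short;
}
```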