Unverified commit d7d7e996, authored by Jeffrey Morgan, committed by GitHub

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
@@ -12,18 +12,17 @@
 #include <algorithm>
 #include <cassert>
+#include <codecvt>
 #include <cstddef>
 #include <cstdint>
+#include <locale>
 #include <map>
 #include <regex>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
 #include <vector>
-#include <locale>
-#include <codecvt>

 size_t unicode_len_utf8(char src) {
     const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
@@ -641,7 +640,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
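With this change, unicode_cpts_from_utf8 no longer lets std::invalid_argument escape through llama_tokenize; each invalid UTF-8 byte is instead mapped to the U+FFFD replacement character. A minimal sketch of the resulting behavior, assuming the vendored unicode.h header is on the include path (the driver program below is illustrative, not part of this change):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    #include "unicode.h"  // vendored llama.cpp header declaring unicode_cpts_from_utf8

    int main() {
        // "ab" followed by a lone 0xFF byte, which is not valid UTF-8
        const std::string text = std::string("ab") + char(0xFF);
        // The invalid byte now yields U+FFFD rather than an uncaught std::invalid_argument
        for (const uint32_t cpt : unicode_cpts_from_utf8(text)) {
            printf("U+%04X\n", cpt);  // expected: U+0061, U+0062, U+FFFD
        }
        return 0;
    }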
@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);

     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories
@@ -14,6 +14,7 @@ package llama
 #include "llama.h"
 #include "clip.h"
 #include "llava.h"
+#include "gguf.h"
 #include "mllama.h"
 #include "sampling_ext.h"
@@ -293,29 +294,29 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
 }

 func (m *Model) NumVocab() int {
-    return int(C.llama_n_vocab(m.c))
+    return int(C.llama_n_vocab(m.Vocab()))
 }

 func (m *Model) TokenIsEog(token int) bool {
-    return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
+    return bool(C.llama_token_is_eog(m.Vocab(), C.llama_token(token)))
 }

 func (m *Model) AddBOSToken() bool {
-    return bool(C.llama_add_bos_token(m.c))
+    return bool(C.llama_add_bos_token(m.Vocab()))
 }

 func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float32, threads int) error {
     cLoraPath := C.CString(loraPath)
     defer C.free(unsafe.Pointer(cLoraPath))

-    loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
+    loraAdapter := C.llama_adapter_lora_init(m.c, cLoraPath)
     if loraAdapter == nil {
         return errors.New("unable to load lora")
     }

     err := -1
     if loraAdapter != nil {
-        err = int(C.llama_lora_adapter_set(context.c, loraAdapter, C.float(scale)))
+        err = int(C.llama_set_adapter_lora(context.c, loraAdapter, C.float(scale)))
     }
     if err != 0 {
         return errors.New("error applying lora from file")
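The wrapper here tracks the upstream rename of the LoRA adapter API: llama_lora_adapter_init and llama_lora_adapter_set became llama_adapter_lora_init and llama_set_adapter_lora. A minimal C-API sketch of the renamed calls, with model and context setup elided; apply_lora and its error handling are illustrative only, not part of llama.h:

    #include "llama.h"

    // Returns true if the adapter was loaded and attached to the context.
    static bool apply_lora(struct llama_context * ctx, struct llama_model * model,
                           const char * path, float scale) {
        struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
        if (adapter == NULL) {
            return false;  // failed to load the adapter file
        }
        // Attach the adapter to this context with the given scale; 0 means success.
        return llama_set_adapter_lora(ctx, adapter, scale) == 0;
    }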
@@ -324,6 +325,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
     return nil
 }

+func (m *Model) Vocab() *C.struct_llama_vocab {
+    return C.llama_model_get_vocab(m.c)
+}
+
 type Batch struct {
     c C.struct_llama_batch
     batchSize int
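The new Vocab() helper reflects the upstream vocab refactor: token-level calls now take a llama_vocab handle obtained from the model instead of the model pointer itself. A small sketch of that C-API shape (model loading elided; n_vocab_of is an illustrative name, not part of llama.h):

    #include "llama.h"

    static int32_t n_vocab_of(const struct llama_model * model) {
        // The vocab handle is owned by the model; it does not need to be freed separately.
        const struct llama_vocab * vocab = llama_model_get_vocab(model);
        return llama_n_vocab(vocab);
    }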
@@ -414,7 +419,7 @@ func (m *Model) TokenToPiece(token int) string {
     tokenLen := 12
     buf := make([]byte, tokenLen)
     tokenLen = int(C.llama_token_to_piece(
-        m.c,
+        m.Vocab(),
         C.int32_t(token),
         (*C.char)(unsafe.Pointer(&buf[0])),
         C.int32_t(tokenLen),
@@ -426,7 +431,7 @@ func (m *Model) TokenToPiece(token int) string {
         buf = make([]byte, tokenLen)
         C.llama_token_to_piece(
-            m.c,
+            m.Vocab(),
             C.int32_t(token),
             (*C.char)(unsafe.Pointer(&buf[0])),
             C.int32_t(tokenLen),
@@ -444,7 +449,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
     defer C.free(unsafe.Pointer(cText))

     result := C.llama_tokenize(
-        m.c,
+        m.Vocab(),
         cText,
         C.int32_t(len(text)),
         &cTokens[0],
@@ -458,7 +463,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
         maxTokens = int(-result)
         cTokens = make([]C.llama_token, maxTokens)
         result = C.llama_tokenize(
-            m.c,
+            m.Vocab(),
             cText,
             C.int32_t(len(text)),
             &cTokens[0],
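The retry above follows the llama_tokenize contract: a negative return value is the negated number of tokens required, so the caller grows the buffer and calls again. A sketch of the same two-pass pattern against the C API (tokenize_all is an illustrative helper, not part of llama.h; the add_special/parse_special flags are hard-coded here):

    #include <string>
    #include <vector>

    #include "llama.h"

    static std::vector<llama_token> tokenize_all(const struct llama_vocab * vocab,
                                                 const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 2);  // initial capacity guess
        int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                   tokens.data(), (int32_t) tokens.size(),
                                   /*add_special*/ true, /*parse_special*/ true);
        if (n < 0) {
            // A negative result is the required capacity; resize and retry once.
            tokens.resize((size_t) -n);
            n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               true, true);
        }
        tokens.resize(n > 0 ? (size_t) n : 0);
        return tokens;
    }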
@@ -5,6 +5,7 @@
 #include "ggml-backend.h"
 #include "ggml-cpu.h"
 #include "ggml.h"
+#include "gguf.h"

 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
  3 files changed, 2 insertions(+), 1 deletion(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index e2d6c405..a12172dc 100644
+index dba7be33..1ca40b2c 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644

 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0b06be72..be29e979 100644
+index ebb2ccae..b094929b 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 delete ctx;
@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644

 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index a85502ee..cd8ef741 100644
+index c550142a..fd9a4e77 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 }
 free(ctx);
@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
  2 files changed, 5 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 38a55fb2..b9c4a5bf 100644
+index 671d2a81..47e79ed4 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 const auto n_embd = hparams.n_embd;

 // TODO: use a per-batch flag for logits presence instead
@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644
 const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;

 diff --git a/src/llama.cpp b/src/llama.cpp
-index ea78ea48..4eb3f6b9 100644
+index 607f2786..ac85bfed 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
+@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
 res = nullptr;
 embd = nullptr;
 } else if (cparams.embeddings) {
@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
 embd = nullptr;
 for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
+@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
 break;
 }
 }
@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
  1 file changed, 2 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index be29e979..aaa79ea4 100644
+index b094929b..36165840 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 case GGML_OP_ARGSORT:
 ggml_cuda_op_argsort(ctx, dst);
 break;