Unverified commit d7d7e996, authored by Jeffrey Morgan, committed by GitHub

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
@@ -12,18 +12,17 @@
 #include <algorithm>
 #include <cassert>
+#include <codecvt>
 #include <cstddef>
 #include <cstdint>
+#include <locale>
 #include <map>
 #include <regex>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
 #include <vector>
-#include <locale>
-#include <codecvt>

 size_t unicode_len_utf8(char src) {
     const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
@@ -641,7 +640,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
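With this change, unicode_cpts_from_utf8 no longer lets std::invalid_argument escape through llama_tokenize; each invalid UTF-8 byte is instead mapped to the U+FFFD replacement character. A minimal sketch of the resulting behavior, assuming the vendored unicode.h header is on the include path (the driver program below is illustrative, not part of this change):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    #include "unicode.h"  // vendored llama.cpp header declaring unicode_cpts_from_utf8

    int main() {
        // "ab" followed by a lone 0xFF byte, which is not valid UTF-8
        const std::string text = std::string("ab") + char(0xFF);
        // The invalid byte now yields U+FFFD rather than an uncaught std::invalid_argument
        for (const uint32_t cpt : unicode_cpts_from_utf8(text)) {
            printf("U+%04X\n", cpt);  // expected: U+0061, U+0062, U+FFFD
        }
        return 0;
    }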
@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);

     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories
@@ -14,6 +14,7 @@ package llama
 #include "llama.h"
 #include "clip.h"
 #include "llava.h"
+#include "gguf.h"
 #include "mllama.h"
 #include "sampling_ext.h"
@@ -293,29 +294,29 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
 }

 func (m *Model) NumVocab() int {
-    return int(C.llama_n_vocab(m.c))
+    return int(C.llama_n_vocab(m.Vocab()))
 }

 func (m *Model) TokenIsEog(token int) bool {
-    return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
+    return bool(C.llama_token_is_eog(m.Vocab(), C.llama_token(token)))
 }

 func (m *Model) AddBOSToken() bool {
-    return bool(C.llama_add_bos_token(m.c))
+    return bool(C.llama_add_bos_token(m.Vocab()))
 }

 func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float32, threads int) error {
     cLoraPath := C.CString(loraPath)
     defer C.free(unsafe.Pointer(cLoraPath))

-    loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
+    loraAdapter := C.llama_adapter_lora_init(m.c, cLoraPath)
     if loraAdapter == nil {
         return errors.New("unable to load lora")
     }

     err := -1
     if loraAdapter != nil {
-        err = int(C.llama_lora_adapter_set(context.c, loraAdapter, C.float(scale)))
+        err = int(C.llama_set_adapter_lora(context.c, loraAdapter, C.float(scale)))
     }
     if err != 0 {
         return errors.New("error applying lora from file")
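The wrapper here tracks the upstream rename of the LoRA adapter API: llama_lora_adapter_init and llama_lora_adapter_set became llama_adapter_lora_init and llama_set_adapter_lora. A minimal C-API sketch of the renamed calls, with model and context setup elided; apply_lora and its error handling are illustrative only, not part of llama.h:

    #include "llama.h"

    // Returns true if the adapter was loaded and attached to the context.
    static bool apply_lora(struct llama_context * ctx, struct llama_model * model,
                           const char * path, float scale) {
        struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
        if (adapter == NULL) {
            return false;  // failed to load the adapter file
        }
        // Attach the adapter to this context with the given scale; 0 means success.
        return llama_set_adapter_lora(ctx, adapter, scale) == 0;
    }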
@@ -324,6 +325,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
     return nil
 }

+func (m *Model) Vocab() *C.struct_llama_vocab {
+    return C.llama_model_get_vocab(m.c)
+}
+
 type Batch struct {
     c C.struct_llama_batch
     batchSize int
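The new Vocab() helper reflects the upstream vocab refactor: token-level calls now take a llama_vocab handle obtained from the model instead of the model pointer itself. A small sketch of that C-API shape (model loading elided; n_vocab_of is an illustrative name, not part of llama.h):

    #include "llama.h"

    static int32_t n_vocab_of(const struct llama_model * model) {
        // The vocab handle is owned by the model; it does not need to be freed separately.
        const struct llama_vocab * vocab = llama_model_get_vocab(model);
        return llama_n_vocab(vocab);
    }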
@@ -414,7 +419,7 @@ func (m *Model) TokenToPiece(token int) string {
     tokenLen := 12
     buf := make([]byte, tokenLen)
     tokenLen = int(C.llama_token_to_piece(
-        m.c,
+        m.Vocab(),
         C.int32_t(token),
         (*C.char)(unsafe.Pointer(&buf[0])),
         C.int32_t(tokenLen),
@@ -426,7 +431,7 @@ func (m *Model) TokenToPiece(token int) string {
         buf = make([]byte, tokenLen)
         C.llama_token_to_piece(
-            m.c,
+            m.Vocab(),
             C.int32_t(token),
             (*C.char)(unsafe.Pointer(&buf[0])),
             C.int32_t(tokenLen),
@@ -444,7 +449,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
     defer C.free(unsafe.Pointer(cText))

     result := C.llama_tokenize(
-        m.c,
+        m.Vocab(),
         cText,
         C.int32_t(len(text)),
         &cTokens[0],
@@ -458,7 +463,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
         maxTokens = int(-result)
         cTokens = make([]C.llama_token, maxTokens)
         result = C.llama_tokenize(
-            m.c,
+            m.Vocab(),
             cText,
             C.int32_t(len(text)),
             &cTokens[0],
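The retry above follows the llama_tokenize contract: a negative return value is the negated number of tokens required, so the caller grows the buffer and calls again. A sketch of the same two-pass pattern against the C API (tokenize_all is an illustrative helper, not part of llama.h; the add_special/parse_special flags are hard-coded here):

    #include <string>
    #include <vector>

    #include "llama.h"

    static std::vector<llama_token> tokenize_all(const struct llama_vocab * vocab,
                                                 const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 2);  // initial capacity guess
        int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                   tokens.data(), (int32_t) tokens.size(),
                                   /*add_special*/ true, /*parse_special*/ true);
        if (n < 0) {
            // A negative result is the required capacity; resize and retry once.
            tokens.resize((size_t) -n);
            n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               true, true);
        }
        tokens.resize(n > 0 ? (size_t) n : 0);
        return tokens;
    }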
@@ -5,6 +5,7 @@
 #include "ggml-backend.h"
 #include "ggml-cpu.h"
 #include "ggml.h"
+#include "gguf.h"

 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
  3 files changed, 2 insertions(+), 1 deletion(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index e2d6c405..a12172dc 100644
+index dba7be33..1ca40b2c 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
 @@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644

 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 0b06be72..be29e979 100644
+index ebb2ccae..b094929b 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 delete ctx;
@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644

 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index a85502ee..cd8ef741 100644
+index c550142a..fd9a4e77 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 }
 free(ctx);
@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
  2 files changed, 5 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 38a55fb2..b9c4a5bf 100644
+index 671d2a81..47e79ed4 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 const auto n_embd = hparams.n_embd;

 // TODO: use a per-batch flag for logits presence instead
@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644
 const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;

 diff --git a/src/llama.cpp b/src/llama.cpp
-index ea78ea48..4eb3f6b9 100644
+index 607f2786..ac85bfed 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
+@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
 res = nullptr;
 embd = nullptr;
 } else if (cparams.embeddings) {
@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
 embd = nullptr;
 for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
+@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
 break;
 }
 }
@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
  1 file changed, 2 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index be29e979..aaa79ea4 100644
+index b094929b..36165840 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 case GGML_OP_ARGSORT:
 ggml_cuda_op_argsort(ctx, dst);
 break;