Unverified Commit d7d7e996 authored by Jeffrey Morgan's avatar Jeffrey Morgan Committed by GitHub
Browse files

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
#include "llama-vocab.h" #include "llama-vocab.h"
#include "llama-impl.h" #include "llama-impl.h"
#include "llama-model-loader.h"
#include "unicode.h" #include "unicode.h"
...@@ -11,8 +12,10 @@ ...@@ -11,8 +12,10 @@
#include <cstdarg> #include <cstdarg>
#include <cstring> #include <cstring>
#include <forward_list> #include <forward_list>
#include <map>
#include <queue> #include <queue>
#include <sstream> #include <set>
#include <unordered_map>
// //
// helpers // helpers
...@@ -62,7 +65,7 @@ struct naive_trie { ...@@ -62,7 +65,7 @@ struct naive_trie {
}; };
// //
// impl // tokenizers
// //
struct llm_tokenizer { struct llm_tokenizer {
...@@ -70,88 +73,6 @@ struct llm_tokenizer { ...@@ -70,88 +73,6 @@ struct llm_tokenizer {
virtual ~llm_tokenizer() = default; virtual ~llm_tokenizer() = default;
}; };
// Destructor: releases the tokenizer instance owned by this vocab
// (allocated in init_tokenizer(); `delete nullptr` is a safe no-op).
llama_vocab::~llama_vocab() {
delete tokenizer;
}
// Look up the BPE merge rank for the pair (token_left, token_right).
// Returns the rank stored in bpe_ranks, or -1 when the pair is not a
// known merge. Tokens must not contain spaces or newlines (asserted).
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
    // whitespace inside a merge key would indicate a bug upstream
    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
    GGML_ASSERT(token_right.find('\n') == std::string::npos);

    const auto rank = bpe_ranks.find(std::make_pair(token_left, token_right));
    return rank == bpe_ranks.end() ? -1 : rank->second;
}
// Accessor: returns the vocabulary's tokenizer family (SPM/BPE/WPM/UGM/...).
static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
return vocab.type;
}
// True when token `id` carries the NORMAL attribute bit.
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return (vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL) != 0;
}
// True when token `id` carries the UNKNOWN attribute bit.
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return (vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN) != 0;
}
// True when token `id` carries the CONTROL attribute bit.
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return (vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL) != 0;
}
// True when token `id` carries the BYTE attribute bit.
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return (vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE) != 0;
}
// True when token `id` carries the USER_DEFINED attribute bit.
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return (vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED) != 0;
}
// True when token `id` carries the UNUSED attribute bit.
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return (vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED) != 0;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
}
case LLAMA_VOCAB_TYPE_BPE: {
GGML_ABORT("fatal error");
//return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
}
case LLAMA_VOCAB_TYPE_WPM: {
GGML_ABORT("fatal error");
}
default:
GGML_ABORT("fatal error");
}
}
// Replace every ASCII space with the SentencePiece whitespace marker
// U+2581 (LOWER ONE EIGHTH BLOCK), UTF-8 "\xe2\x96\x81", in place.
static void llama_escape_whitespace(std::string & text) {
replace_all(text, " ", "\xe2\x96\x81");
}
// Inverse of llama_escape_whitespace: turn every U+2581 marker
// (UTF-8 "\xe2\x96\x81") back into an ASCII space, in place.
static void llama_unescape_whitespace(std::string & word) {
replace_all(word, "\xe2\x96\x81", " ");
}
struct llm_symbol { struct llm_symbol {
using index = int; using index = int;
index prev; index prev;
...@@ -183,14 +104,13 @@ struct llm_bigram_spm { ...@@ -183,14 +104,13 @@ struct llm_bigram_spm {
}; };
struct llm_tokenizer_spm : llm_tokenizer { struct llm_tokenizer_spm : llm_tokenizer {
llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {} llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
}; };
struct llm_tokenizer_spm_session { struct llm_tokenizer_spm_session {
llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {} llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_token> & output) {
// split string into utf8 chars // split string into utf8 chars
int index = 0; int index = 0;
size_t offs = 0; size_t offs = 0;
...@@ -249,13 +169,13 @@ struct llm_tokenizer_spm_session { ...@@ -249,13 +169,13 @@ struct llm_tokenizer_spm_session {
} }
private: private:
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) { void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
auto text = std::string(symbol.text, symbol.n); auto text = std::string(symbol.text, symbol.n);
auto token = vocab.token_to_id.find(text); auto token = vocab.text_to_token(text);
// Do we need to support is_unused? // Do we need to support is_unused?
if (token != vocab.token_to_id.end()) { if (token != LLAMA_TOKEN_NULL) {
output.push_back((*token).second); output.push_back(token);
return; return;
} }
...@@ -265,8 +185,8 @@ private: ...@@ -265,8 +185,8 @@ private:
// output any symbols that did not form tokens as bytes. // output any symbols that did not form tokens as bytes.
output.reserve(output.size() + symbol.n); output.reserve(output.size() + symbol.n);
for (int j = 0; j < (int)symbol.n; ++j) { for (int j = 0; j < (int)symbol.n; ++j) {
llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]); llama_token id = vocab.byte_to_token(symbol.text[j]);
output.push_back(token_id); output.push_back(id);
} }
return; return;
} }
...@@ -280,17 +200,17 @@ private: ...@@ -280,17 +200,17 @@ private:
return; return;
} }
const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
auto token = vocab.token_to_id.find(text); auto token = vocab.text_to_token(text);
if (token == vocab.token_to_id.end()) { if (token == LLAMA_TOKEN_NULL) {
return; return;
} }
if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) { if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
return; return;
} }
const auto & tok_data = vocab.id_to_token[(*token).second]; const auto & tok_data = vocab.get_token_data(token);
llm_bigram_spm bigram; llm_bigram_spm bigram;
bigram.left = left; bigram.left = left;
...@@ -353,9 +273,9 @@ struct llm_bigram_bpe { ...@@ -353,9 +273,9 @@ struct llm_bigram_bpe {
}; };
struct llm_tokenizer_bpe : llm_tokenizer { struct llm_tokenizer_bpe : llm_tokenizer {
llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() { llm_tokenizer_bpe(const llama_vocab & vocab) {
GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE); GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
switch (vocab.type_pre) { switch (vocab.get_pre_type()) {
case LLAMA_VOCAB_PRE_TYPE_LLAMA3: case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
regex_exprs = { regex_exprs = {
// original regex from tokenizer.json // original regex from tokenizer.json
...@@ -488,39 +408,38 @@ struct llm_tokenizer_bpe : llm_tokenizer { ...@@ -488,39 +408,38 @@ struct llm_tokenizer_bpe : llm_tokenizer {
}; };
struct llm_tokenizer_bpe_session { struct llm_tokenizer_bpe_session {
llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab), llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) { static void append(const llama_token token_id, std::vector<llama_token> & output) {
output.push_back(token_id); output.push_back(token_id);
} }
bool append_bos(std::vector<llama_vocab::id> & output) const { bool append_bos(std::vector<llama_token> & output) const {
if (vocab.tokenizer_add_bos) { if (vocab.get_add_bos()) {
GGML_ASSERT(vocab.special_bos_id != -1); GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_bos_id); output.push_back(vocab.token_bos());
return true; return true;
} }
return false; return false;
} }
bool append_eos(std::vector<llama_vocab::id> & output) const { bool append_eos(std::vector<llama_token> & output) const {
if (vocab.tokenizer_add_eos) { if (vocab.get_add_eos()) {
GGML_ASSERT(vocab.special_eos_id != -1); GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_eos_id); output.push_back(vocab.token_eos());
return true; return true;
} }
return false; return false;
} }
void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const { void check_double_bos_eos(const std::vector<llama_token> & output) const {
if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
LLAMA_LOG_WARN( LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt " "%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. " "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__); "Are you sure this is what you want?\n", __FUNCTION__);
} }
if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) { if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
LLAMA_LOG_WARN( LLAMA_LOG_WARN(
"%s: Added a EOS token to the prompt as specified by the model but the prompt " "%s: Added a EOS token to the prompt as specified by the model but the prompt "
"also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. " "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
...@@ -528,9 +447,9 @@ struct llm_tokenizer_bpe_session { ...@@ -528,9 +447,9 @@ struct llm_tokenizer_bpe_session {
} }
} }
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1; int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs); const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
symbols_final.clear(); symbols_final.clear();
...@@ -541,7 +460,8 @@ struct llm_tokenizer_bpe_session { ...@@ -541,7 +460,8 @@ struct llm_tokenizer_bpe_session {
int index = 0; int index = 0;
size_t offset = 0; size_t offset = 0;
if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size(); offset = word.size();
} }
...@@ -615,18 +535,18 @@ struct llm_tokenizer_bpe_session { ...@@ -615,18 +535,18 @@ struct llm_tokenizer_bpe_session {
} }
const std::string str = std::string(symbol.text, symbol.n); const std::string str = std::string(symbol.text, symbol.n);
const auto token = vocab.token_to_id.find(str); const auto token = vocab.text_to_token(str);
if (token == vocab.token_to_id.end()) { if (token == LLAMA_TOKEN_NULL) {
for (auto j = str.begin(); j != str.end(); ++j) { for (auto j = str.begin(); j != str.end(); ++j) {
std::string byte_str(1, *j); std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str); auto token_multibyte = vocab.text_to_token(byte_str);
if (token_multibyte != vocab.token_to_id.end()) { if (token_multibyte != LLAMA_TOKEN_NULL) {
output.push_back(token_multibyte->second); output.push_back(token_multibyte);
} }
} }
} else { } else {
output.push_back((*token).second); output.push_back(token);
} }
} }
} }
...@@ -660,7 +580,7 @@ private: ...@@ -660,7 +580,7 @@ private:
} }
const llama_vocab & vocab; const llama_vocab & vocab;
const llm_tokenizer_bpe * bpe_tokenizer; const llm_tokenizer_bpe & tokenizer;
std::vector<llm_symbol> symbols; std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final; std::vector<llm_symbol> symbols_final;
...@@ -672,14 +592,13 @@ private: ...@@ -672,14 +592,13 @@ private:
// //
struct llm_tokenizer_wpm : llm_tokenizer { struct llm_tokenizer_wpm : llm_tokenizer {
llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {} llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
}; };
struct llm_tokenizer_wpm_session { struct llm_tokenizer_wpm_session {
llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {} llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_token> & output) {
const auto & token_map = vocab.token_to_id;
// normalize and split by whitespace // normalize and split by whitespace
std::vector<std::string> words = preprocess(text); std::vector<std::string> words = preprocess(text);
// bos token prepended already // bos token prepended already
...@@ -702,10 +621,10 @@ struct llm_tokenizer_wpm_session { ...@@ -702,10 +621,10 @@ struct llm_tokenizer_wpm_session {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
// loop through possible match length // loop through possible match length
bool match = false; bool match = false;
for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) { for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
auto it = token_map.find(word1.substr(i, j - i)); auto id = vocab.text_to_token(word1.substr(i, j - i));
if (it != token_map.end()) { if (id != LLAMA_TOKEN_NULL) {
output.push_back(it->second); output.push_back(id);
match = true; match = true;
i = j - 1; i = j - 1;
break; break;
...@@ -720,7 +639,7 @@ struct llm_tokenizer_wpm_session { ...@@ -720,7 +639,7 @@ struct llm_tokenizer_wpm_session {
// we didn't find any matches for this word // we didn't find any matches for this word
if (current_tokens == output.size()) { if (current_tokens == output.size()) {
output.push_back(vocab.special_unk_id); output.push_back(vocab.token_unk());
} }
} }
} }
...@@ -789,45 +708,45 @@ private: ...@@ -789,45 +708,45 @@ private:
// //
struct llm_tokenizer_ugm : llm_tokenizer { struct llm_tokenizer_ugm : llm_tokenizer {
llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() { llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
if (vocab.precompiled_charsmap.size() > 0) { if (precompiled_charsmap.size() > 0) {
size_t charsmap_offset = 0; size_t charsmap_offset = 0;
// First four bytes of precompiled_charsmap contains length of binary // First four bytes of precompiled_charsmap contains length of binary
// blob containing XOR-compressed compact double array (XCDA) entries // blob containing XOR-compressed compact double array (XCDA) entries
uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0]; uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
charsmap_offset += sizeof(xcda_blob_size); charsmap_offset += sizeof(xcda_blob_size);
if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) { if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
} }
// Next xcda_blob_size bytes contain entries of XOR-compressed compact // Next xcda_blob_size bytes contain entries of XOR-compressed compact
// double array (XCDA). Each entry is bit-packed into a 32-bit integer. // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset]; xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
xcda_array_size = xcda_blob_size / sizeof(uint32_t); xcda_array_size = xcda_blob_size / sizeof(uint32_t);
charsmap_offset += xcda_blob_size; charsmap_offset += xcda_blob_size;
// Remaining bytes of precompiled charsmap contain null-terminated // Remaining bytes of precompiled charsmap contain null-terminated
// replacement strings for prefixes matched by the XCDA. // replacement strings for prefixes matched by the XCDA.
prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset]; prefix_replacements = &precompiled_charsmap[charsmap_offset];
prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset; prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
} }
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) { for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
const auto &token_data = vocab.id_to_token[id]; const auto & token_data = vocab.get_token_data(id);
if (llama_is_normal_token(vocab, id)) { if (vocab.is_normal(id)) {
min_score = std::min<float>(min_score, token_data.score); min_score = std::min<float>(min_score, token_data.score);
max_score = std::max<float>(max_score, token_data.score); max_score = std::max<float>(max_score, token_data.score);
} }
if (llama_is_normal_token(vocab, id) || if (vocab.is_normal(id) ||
llama_is_user_defined_token(vocab, id) || vocab.is_user_defined(id) ||
llama_is_unused_token(vocab, id)) { vocab.is_unused(id)) {
token_matcher.insert(token_data.text.data(), token_data.text.size(), id); token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
} }
if (llama_is_user_defined_token(vocab, id)) { if (vocab.is_user_defined(id)) {
user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size()); user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
} }
} }
...@@ -856,8 +775,7 @@ struct llm_tokenizer_ugm : llm_tokenizer { ...@@ -856,8 +775,7 @@ struct llm_tokenizer_ugm : llm_tokenizer {
}; };
struct llm_tokenizer_ugm_session { struct llm_tokenizer_ugm_session {
llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab), llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
/* This implementation is based on SentencePiece optimized Viterbi algorithm for /* This implementation is based on SentencePiece optimized Viterbi algorithm for
* unigram language models. The general idea is to: * unigram language models. The general idea is to:
...@@ -872,7 +790,7 @@ struct llm_tokenizer_ugm_session { ...@@ -872,7 +790,7 @@ struct llm_tokenizer_ugm_session {
* After processing the whole sequence we backtrack from the end to get * After processing the whole sequence we backtrack from the end to get
* the best tokenization. * the best tokenization.
*/ */
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_token> & output) {
// get current size of output (for reversal later) // get current size of output (for reversal later)
size_t output_size = output.size(); size_t output_size = output.size();
...@@ -885,9 +803,9 @@ struct llm_tokenizer_ugm_session { ...@@ -885,9 +803,9 @@ struct llm_tokenizer_ugm_session {
} }
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX}); std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
// at the beginning tokenization score is zero // at the beginning tokenization score is zero
tokenization_results[0] = { vocab.special_unk_id, 0, 0 }; tokenization_results[0] = { vocab.token_unk(), 0, 0 };
for (size_t input_offset = 0; input_offset < input_len;) { for (size_t input_offset = 0; input_offset < input_len;) {
size_t prefix_offset = input_offset; size_t prefix_offset = input_offset;
...@@ -897,7 +815,7 @@ struct llm_tokenizer_ugm_session { ...@@ -897,7 +815,7 @@ struct llm_tokenizer_ugm_session {
// traverse the token matcher trie to find a matching token // traverse the token matcher trie to find a matching token
bool single_codepoint_token_found = false; bool single_codepoint_token_found = false;
const struct best_tokenization & current_best = tokenization_results[input_offset]; const struct best_tokenization & current_best = tokenization_results[input_offset];
const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]); const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
while (prefix_offset <= input_len && node != NULL) { while (prefix_offset <= input_len && node != NULL) {
// check if we found valid token in prefix // check if we found valid token in prefix
...@@ -907,13 +825,13 @@ struct llm_tokenizer_ugm_session { ...@@ -907,13 +825,13 @@ struct llm_tokenizer_ugm_session {
single_codepoint_token_found = true; single_codepoint_token_found = true;
} }
llama_token token_id = node->value; llama_token token_id = node->value;
const auto & token_data = vocab.id_to_token[token_id]; const auto & token_data = vocab.get_token_data(token_id);
// we set the user-defined token scores to 0 to make them more likely to be selected // we set the user-defined token scores to 0 to make them more likely to be selected
// (normal token scores are log probabilities, so they are negative) // (normal token scores are log probabilities, so they are negative)
// score type is double here to make tokenization results exactly // score type is double here to make tokenization results exactly
// the same as in the HF tokenizer using SentencePiece // the same as in the HF tokenizer using SentencePiece
const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score; const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
const double challenger_score = current_best.score_sum + token_score; const double challenger_score = current_best.score_sum + token_score;
struct best_tokenization & current_champ = tokenization_results[prefix_offset]; struct best_tokenization & current_champ = tokenization_results[prefix_offset];
if (challenger_score > current_champ.score_sum) { if (challenger_score > current_champ.score_sum) {
...@@ -927,11 +845,11 @@ struct llm_tokenizer_ugm_session { ...@@ -927,11 +845,11 @@ struct llm_tokenizer_ugm_session {
// if we didn't find a valid token corresponding to the whole UTF code point // if we didn't find a valid token corresponding to the whole UTF code point
// then use unknown token as the tokenization of this UTF code point // then use unknown token as the tokenization of this UTF code point
if (!single_codepoint_token_found) { if (!single_codepoint_token_found) {
const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score; const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
prefix_offset = input_offset + n_utf8_code_units; prefix_offset = input_offset + n_utf8_code_units;
struct best_tokenization & current_champ = tokenization_results[prefix_offset]; struct best_tokenization & current_champ = tokenization_results[prefix_offset];
if (challenger_score > current_champ.score_sum) { if (challenger_score > current_champ.score_sum) {
struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score }; struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
current_champ = challenger; current_champ = challenger;
} }
} }
...@@ -944,7 +862,7 @@ struct llm_tokenizer_ugm_session { ...@@ -944,7 +862,7 @@ struct llm_tokenizer_ugm_session {
// merge sequences of consecutive unknown tokens into single unknown tokens // merge sequences of consecutive unknown tokens into single unknown tokens
bool is_prev_unknown = false; bool is_prev_unknown = false;
for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) { for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
bool is_unknown = tokenization.token_id == vocab.special_unk_id; bool is_unknown = tokenization.token_id == vocab.token_unk();
if (!(is_prev_unknown && is_unknown)) { if (!(is_prev_unknown && is_unknown)) {
output.push_back(tokenization.token_id); output.push_back(tokenization.token_id);
} }
...@@ -971,11 +889,11 @@ private: ...@@ -971,11 +889,11 @@ private:
normalized->clear(); normalized->clear();
normalized->reserve(input.size() * 3); normalized->reserve(input.size() * 3);
const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " "; const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces; const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
bool is_space_prepended = false; bool is_space_prepended = false;
bool processing_non_ws = false; bool processing_non_ws = false;
...@@ -1067,7 +985,7 @@ private: ...@@ -1067,7 +985,7 @@ private:
// if input prefix matches some user-defined token return this token as normalization result // if input prefix matches some user-defined token return this token as normalization result
auto user_defined_token_match = auto user_defined_token_match =
ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
if (user_defined_token_match.second > 0) { if (user_defined_token_match.second > 0) {
return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second }; return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
} }
...@@ -1075,8 +993,8 @@ private: ...@@ -1075,8 +993,8 @@ private:
size_t longest_prefix_length = 0; size_t longest_prefix_length = 0;
size_t longest_prefix_offset = 0; size_t longest_prefix_offset = 0;
if (ugm_tokenizer->xcda_array_size > 0) { if (tokenizer.xcda_array_size > 0) {
struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size); struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
// Find the longest normalized sequence matching the input prefix by walking // Find the longest normalized sequence matching the input prefix by walking
// the XOR-compressed compact double array (XCDA) starting from the root node // the XOR-compressed compact double array (XCDA) starting from the root node
...@@ -1112,10 +1030,10 @@ private: ...@@ -1112,10 +1030,10 @@ private:
if (longest_prefix_length > 0) { if (longest_prefix_length > 0) {
// we have a match, so return the replacement sequence // we have a match, so return the replacement sequence
if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) { if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
} }
const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset]; const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length }; return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
} }
...@@ -1132,7 +1050,7 @@ private: ...@@ -1132,7 +1050,7 @@ private:
} }
const llama_vocab & vocab; const llama_vocab & vocab;
const llm_tokenizer_ugm * ugm_tokenizer; const llm_tokenizer_ugm & tokenizer;
}; };
// //
...@@ -1194,15 +1112,15 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape ...@@ -1194,15 +1112,15 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape
} }
struct llm_tokenizer_rwkv : llm_tokenizer { struct llm_tokenizer_rwkv : llm_tokenizer {
llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() { llm_tokenizer_rwkv(const llama_vocab & vocab) {
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens. // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
// For now, we decode the vocab here into the lookup we'll use for tokenization. // For now, we decode the vocab here into the lookup we'll use for tokenization.
// build trie // build trie
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) { for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
const auto & token = vocab.id_to_token[id]; const auto & data = vocab.get_token_data(id);
const auto data = llama_unescape_rwkv_token(token.text); const auto text = llama_unescape_rwkv_token(data.text);
token_matcher.insert((const char *) data.data(), data.size(), id); token_matcher.insert((const char *) text.data(), text.size(), id);
} }
} }
...@@ -1210,16 +1128,15 @@ struct llm_tokenizer_rwkv : llm_tokenizer { ...@@ -1210,16 +1128,15 @@ struct llm_tokenizer_rwkv : llm_tokenizer {
}; };
struct llm_tokenizer_rwkv_session { struct llm_tokenizer_rwkv_session {
llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab), llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_token> & output) {
uint32_t position = 0; uint32_t position = 0;
while (position < text.size()) { while (position < text.size()) {
const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]); const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
if (node == NULL) { if (node == NULL) {
// no matching token found, add unknown token // no matching token found, add unknown token
output.push_back(vocab.special_unk_id); output.push_back(vocab.token_unk());
position += 1; position += 1;
continue; continue;
} }
...@@ -1243,33 +1160,11 @@ struct llm_tokenizer_rwkv_session { ...@@ -1243,33 +1160,11 @@ struct llm_tokenizer_rwkv_session {
private: private:
const llama_vocab & vocab; const llama_vocab & vocab;
const llm_tokenizer_rwkv & rwkv_tokenizer; const llm_tokenizer_rwkv & tokenizer;
}; };
// Instantiate the tokenizer implementation matching this vocabulary's
// type. The allocated object is owned by the vocab and released in the
// destructor. Aborts on an unsupported vocab type, exactly as before.
void llama_vocab::init_tokenizer() {
    if (type == LLAMA_VOCAB_TYPE_SPM) {
        tokenizer = new llm_tokenizer_spm(*this);
    } else if (type == LLAMA_VOCAB_TYPE_BPE) {
        tokenizer = new llm_tokenizer_bpe(*this);
    } else if (type == LLAMA_VOCAB_TYPE_WPM) {
        tokenizer = new llm_tokenizer_wpm(*this);
    } else if (type == LLAMA_VOCAB_TYPE_UGM) {
        tokenizer = new llm_tokenizer_ugm(*this);
    } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
        tokenizer = new llm_tokenizer_rwkv(*this);
    } else {
        GGML_ABORT("unsupported vocab type");
    }
}
// //
// (de-) tokenize // impl
// //
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
...@@ -1278,7 +1173,7 @@ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE { ...@@ -1278,7 +1173,7 @@ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
} FRAGMENT_BUFFER_VARIANT_TYPE; } FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant { struct fragment_buffer_variant {
fragment_buffer_variant(llama_vocab::id _token) fragment_buffer_variant(llama_token _token)
: :
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
token(_token), token(_token),
...@@ -1289,7 +1184,7 @@ struct fragment_buffer_variant { ...@@ -1289,7 +1184,7 @@ struct fragment_buffer_variant {
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
: :
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
token((llama_vocab::id) - 1), token((llama_token) - 1),
raw_text(_raw_text), raw_text(_raw_text),
offset(_offset), offset(_offset),
length(_length){ length(_length){
...@@ -1299,684 +1194,2051 @@ struct fragment_buffer_variant { ...@@ -1299,684 +1194,2051 @@ struct fragment_buffer_variant {
} }
const FRAGMENT_BUFFER_VARIANT_TYPE type; const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token; const llama_token token;
const std::string _dummy; const std::string _dummy;
const std::string & raw_text; const std::string & raw_text;
const uint64_t offset; const uint64_t offset;
const uint64_t length; const uint64_t length;
}; };
// #define PRETOKENIZERDEBUG struct llama_vocab::impl {
uint32_t n_token_types = 0; // for BERT-style token types
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
llama_token special_bos_id = 1;
llama_token special_eos_id = 2;
llama_token special_eot_id = LLAMA_TOKEN_NULL;
llama_token special_eom_id = LLAMA_TOKEN_NULL;
llama_token special_unk_id = 0;
llama_token special_sep_id = LLAMA_TOKEN_NULL;
llama_token special_pad_id = LLAMA_TOKEN_NULL;
llama_token special_mask_id = LLAMA_TOKEN_NULL;
llama_token linefeed_id = 13;
// fim tokens
llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// tokenizer flags
bool add_space_prefix = false;
bool add_bos = false;
bool add_eos = false;
bool ignore_merges = false;
bool clean_spaces = false; // clean_up_tokenization_spaces
bool remove_extra_whitespaces = false;
bool escape_whitespaces = true;
bool treat_whitespace_as_suffix = false;
std::unordered_map<std::string, llama_token> token_to_id;
std::vector<token_data> id_to_token;
std::vector<llama_token> cache_special_tokens;
std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
struct pair_hash {
size_t operator()(const std::pair<std::string, std::string> & p) const {
return std::hash<std::string>{}(p.first) ^ //create some hash for pair
(std::hash<std::string>{}(p.second) << 1);
}
};
std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) { // set of all tokens that cause "end of generation"
// for each special token std::set<llama_token> special_eog_ids;
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
const auto & data = vocab.id_to_token[special_id];
const auto & special_token = data.text;
if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) { std::unique_ptr<llm_tokenizer> tokenizer;
// Ignore control and unknown tokens when parse_special == false
continue; std::vector<char> precompiled_charsmap;
// User-defined tokens are still pre-tokenized before everything else
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726 impl(const llama_vocab & vocab) : vocab(vocab) {
// This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
} }
// for each text fragment ~impl() = default;
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
while (it != buffer.end()) {
auto & fragment = (*it);
// if a fragment is text ( not yet processed ) void load(llama_model_loader & ml, const LLM_KV & kv);
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
const auto & raw_text = fragment.raw_text;
auto raw_text_base_offset = fragment.offset; enum llama_vocab_type get_type() const;
auto raw_text_base_length = fragment.length;
// loop over the text std::string type_name() const;
while (true) {
// find the first occurrence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text
auto match = raw_text.find(special_token, raw_text_base_offset);
// no occurrences found, stop processing this fragment for a given special token bool is_normal (llama_token id) const;
if (match == std::string::npos) break; bool is_unknown (llama_token id) const;
bool is_control (llama_token id) const;
bool is_byte (llama_token id) const;
bool is_user_defined(llama_token id) const;
bool is_unused (llama_token id) const;
bool is_eog (llama_token id) const;
// check if match is within bounds of offset <-> length uint8_t token_to_byte(llama_token id) const;
if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
#ifdef PRETOKENIZERDEBUG llama_token_attr token_get_attr(llama_token id) const;
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
auto source = std::distance(buffer.begin(), it);
// if match is further than base offset void init_tokenizer(enum llama_vocab_type type);
// then we have some text to the left of it
if (match > raw_text_base_offset) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0;
int64_t left_reminder_length = match - raw_text_base_offset;
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) { void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
left_reminder_length--;
}
}
if (left_reminder_length > 0) { std::string token_to_piece_for_cache(
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length); llama_token token,
it++; bool special) const;
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
}
// special token std::vector<llama_token> tokenize(
buffer.emplace_after(it, special_id); const std::string & raw_text,
it++; bool add_special,
bool parse_special = false) const;
// right int32_t tokenize(
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { const char * text,
int64_t right_reminder_offset = match + special_token.length(); int32_t text_len,
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) const;
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) { // does not write null-terminator to buf
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) { int32_t token_to_piece(
right_reminder_offset++; llama_token token,
right_reminder_length--; char * buf,
} int32_t length,
int32_t lstrip,
bool special) const;
// use cached data
const std::string & token_to_piece(llama_token token) const;
int32_t detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const;
std::string detokenize(
const std::vector<llama_token> & tokens,
bool special) const;
void print_info() const;
private:
const llama_vocab & vocab;
};
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
struct gguf_context * ctx = ml.meta.get();
// determine vocab type
{
std::string tokenizer_model;
std::string tokenizer_pre;
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
type = LLAMA_VOCAB_TYPE_NONE;
// default special tokens
special_bos_id = LLAMA_TOKEN_NULL;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
linefeed_id = LLAMA_TOKEN_NULL;
// read vocab size from metadata
uint32_t n_tokens = 0;
if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
id_to_token.resize(n_tokens);
} }
if (right_reminder_length > 0) { return;
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
it++;
} }
#ifdef PRETOKENIZERDEBUG if (tokenizer_model == "llama") {
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); type = LLAMA_VOCAB_TYPE_SPM;
// default special tokens
special_bos_id = 1;
special_eos_id = 2;
special_unk_id = 0;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "bert") {
type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
special_bos_id = 101;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = 100;
special_sep_id = 102;
special_pad_id = 0;
special_mask_id = 103;
} else if (tokenizer_model == "gpt2") {
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
std::string first;
std::string second;
const size_t pos = word.find(' ', 1);
if (pos != std::string::npos) {
first = word.substr(0, pos);
second = word.substr(pos + 1);
}
bpe_ranks.emplace(std::make_pair(first, second), i);
}
// default special tokens
special_bos_id = 11;
special_eos_id = 11;
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "t5") {
type = LLAMA_VOCAB_TYPE_UGM;
// default special tokens
special_bos_id = LLAMA_TOKEN_NULL;
special_eos_id = 1;
special_unk_id = 2;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = 0;
special_mask_id = LLAMA_TOKEN_NULL;
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
// correct endiannes of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
for (size_t i = 0; i < xcda_array_size; ++i) {
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
}
#endif #endif
}
} else if (tokenizer_model == "rwkv") {
type = LLAMA_VOCAB_TYPE_RWKV;
if (source == 0) { // default special tokens
buffer.erase_after(buffer.before_begin()); special_bos_id = LLAMA_TOKEN_NULL;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
} else { } else {
buffer.erase_after(std::next(buffer.begin(), (source-1))); throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
// for now, only BPE models have pre-tokenizers
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
if (tokenizer_pre == "default") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
} else if (
tokenizer_pre == "deepseek-llm") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-coder") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
} else if (
tokenizer_pre == "mpt") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
} else if (
tokenizer_pre == "starcoder") {
pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
} else if (
tokenizer_pre == "gpt-2" ||
tokenizer_pre == "phi-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de" ||
tokenizer_pre == "gigachat" ||
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "jina-v2-code" ||
tokenizer_pre == "roberta-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else if (
tokenizer_pre == "command-r") {
pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
clean_spaces = false;
} else if (
tokenizer_pre == "qwen2" ||
tokenizer_pre == "deepseek-r1-qwen") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
clean_spaces = false;
} else if (
tokenizer_pre == "stablelm2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
} else if (
tokenizer_pre == "olmo") {
pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else if (
tokenizer_pre == "dbrx") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else if (
tokenizer_pre == "smaug-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
} else if (
tokenizer_pre == "poro-chat") {
pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
clean_spaces = false;
} else if (
tokenizer_pre == "chatglm-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
special_bos_id = LLAMA_TOKEN_NULL;
} else if (
tokenizer_pre == "viking") {
pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
clean_spaces = false;
} else if (
tokenizer_pre == "jais") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
} else if (
tokenizer_pre == "tekken") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
clean_spaces = false;
ignore_merges = true;
add_bos = true;
} else if (
tokenizer_pre == "smollm") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
clean_spaces = false;
} else if (
tokenizer_pre == "codeshell") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else if (
tokenizer_pre == "bloom") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
} else if (
tokenizer_pre == "gpt3-finnish") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
} else if (
tokenizer_pre == "exaone") {
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
} else if (
tokenizer_pre == "chameleon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
add_bos = true;
clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else if (
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = true;
clean_spaces = false;
add_bos = true;
add_eos = false;
} else if (type == LLAMA_VOCAB_TYPE_WPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = false;
clean_spaces = true;
add_bos = true;
add_eos = false;
} else if (type == LLAMA_VOCAB_TYPE_UGM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_bos = false;
add_eos = true;
} else if (type == LLAMA_VOCAB_TYPE_RWKV) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = false;
clean_spaces = false;
add_bos = false;
add_eos = false;
} else {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} }
// repeat for the right side ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
raw_text_base_offset = right_reminder_offset; ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
raw_text_base_length = right_reminder_length; }
#ifdef PRETOKENIZERDEBUG const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); if (token_idx == -1) {
#endif throw std::runtime_error("cannot find tokenizer vocab in model file\n");
} else {
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source-1)));
} }
break;
const float * scores = nullptr;
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
if (score_idx != -1) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
} }
const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) {
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
} }
uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
id_to_token.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
if (word.empty()) {
LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
word = "[EMPTY_" + std::to_string(i) + "]";
} }
it++;
token_to_id[word] = i;
max_token_len = std::max(max_token_len, (int) word.size());
auto & token_data = id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
switch(toktypes[i]) {
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
} }
} }
} }
GGML_ASSERT(id_to_token.size() == token_to_id.size());
std::vector<llama_vocab::id> llama_tokenize_internal( init_tokenizer(type);
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special) {
GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
std::vector<llama_vocab::id> output; // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
std::forward_list<fragment_buffer_variant> fragment_buffer; if (type == LLAMA_VOCAB_TYPE_SPM) {
try {
linefeed_id = vocab.byte_to_token('\n');
} catch (const std::exception & e) {
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
linefeed_id = special_pad_id;
}
} else if (type == LLAMA_VOCAB_TYPE_WPM) {
linefeed_id = special_pad_id;
} else if (type == LLAMA_VOCAB_TYPE_RWKV) {
const std::vector<int> ids = tokenize("\n", false);
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
linefeed_id = ids[0];
} else {
const std::vector<int> ids = tokenize("\n", false);
if (!raw_text.empty()) { //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); if (ids.empty()) {
tokenizer_st_partition(vocab, fragment_buffer, parse_special); LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
linefeed_id = special_pad_id;
} else {
linefeed_id = ids[0];
}
} }
switch (vocab.type) { // special tokens
case LLAMA_VOCAB_TYPE_SPM:
{ {
// OG tokenizer behavior: const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
// { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
// tokenizer.encode('', add_special_tokens=True) returns [1] { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
// tokenizer.encode('', add_special_tokens=False) returns [] { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
{ LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
{ LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
{ LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
{ LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
{ LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
{ LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
{ LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
{ LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
};
bool is_prev_special = true; // prefix with space if first token for (const auto & it : special_token_types) {
const std::string & key = kv(std::get<0>(it));
int32_t & id = std::get<1>(it);
if (add_special && vocab.tokenizer_add_bos) { uint32_t new_id;
GGML_ASSERT(vocab.special_bos_id != -1); if (!ml.get_key(std::get<0>(it), new_id, false)) {
output.push_back(vocab.special_bos_id); continue;
is_prev_special = true; }
if (new_id >= id_to_token.size()) {
LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
__func__, key.c_str(), new_id, id);
} else {
id = new_id;
}
} }
for (const auto & fragment : fragment_buffer) { // Handle add_bos and add_eos
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); bool temp = true;
// prefix with space if previous is special if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
if (vocab.tokenizer_add_space_prefix && is_prev_special) { add_bos = temp;
raw_text = " " + raw_text;
} }
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
add_eos = temp;
}
}
// auto-detect special tokens by text
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// for now, we apply this workaround to find the tokens based on their text
#ifdef PRETOKENIZERDEBUG for (const auto & t : token_to_id) {
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
#endif if (special_eot_id == LLAMA_TOKEN_NULL) {
llama_escape_whitespace(raw_text); if (false
llm_tokenizer_spm_session session(vocab); || t.first == "<|eot_id|>"
session.tokenize(raw_text, output); || t.first == "<|im_end|>"
is_prev_special = false; || t.first == "<|end|>"
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) || t.first == "<end_of_turn>"
output.push_back(fragment.token); || t.first == "<|endoftext|>"
is_prev_special = true; || t.first == "<EOT>"
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
) {
special_eot_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} }
if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
} }
if (add_special && vocab.tokenizer_add_eos) { // find EOM token: "<|eom_id|>"
GGML_ASSERT(vocab.special_eos_id != -1); if (special_eom_id == LLAMA_TOKEN_NULL) {
output.push_back(vocab.special_eos_id); if (false
|| t.first == "<|eom_id|>"
) {
special_eom_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} }
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
llm_tokenizer_bpe_session session(vocab);
// it calls some other methods that are not exist in llm_tokenizer,
// here just cast it to bpe tokenizer object
if (add_special) {
session.append_bos(output);
} }
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
#endif if (false
session.tokenize(raw_text, output); || t.first == "<|fim_prefix|>" // Qwen
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) || t.first == "<fim-prefix>"
session.append(fragment.token, output); || t.first == "<|fim▁begin|>" // DeepSeek
|| t.first == "<PRE>"
) {
special_fim_pre_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} }
} }
if (add_special) { // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
session.append_eos(output); if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
session.check_double_bos_eos(output); if (false
|| t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>"
|| t.first == "<|fim▁hole|>" // DeepSeek
|| t.first == "<SUF>"
) {
special_fim_suf_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} }
} break; }
case LLAMA_VOCAB_TYPE_WPM:
{ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
if (add_special) { if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
GGML_ASSERT(vocab.special_cls_id != -1); if (false
output.push_back(vocab.special_cls_id); || t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>"
|| t.first == "<|fim▁end|>" // DeepSeek
|| t.first == "<MID>"
) {
special_fim_mid_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>"
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_repo|>" // Qwen
|| t.first == "<|repo_name|>"
|| t.first == "<fim-repo>"
|| t.first == "<REPO>"
) {
special_fim_rep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_SEP token: "<|file_sep|>"
if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|file_sep|>" // Qwen
) {
special_fim_sep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
special_eog_ids.clear();
if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
special_eog_ids.insert(special_fim_pad_id);
}
if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
special_eog_ids.insert(special_fim_rep_id);
}
if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
special_eog_ids.insert(special_fim_sep_id);
}
for (const auto & t : token_to_id) {
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
|| t.first == "<EOT>"
) {
special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} else {
// token is control, but not marked as EOG -> print a debug log
if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
}
// sanity checks
if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
special_eog_ids.insert(special_eos_id);
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
special_eog_ids.insert(special_eot_id);
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
special_eog_ids.insert(special_eom_id);
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
}
// build special tokens cache
{
for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
cache_special_tokens.push_back(id);
}
}
std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
[&] (const llama_token a, const llama_token b) {
return id_to_token[a].text.size() > id_to_token[b].text.size();
}
);
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
}
// build token to piece cache
{
size_t size_cache = 0;
std::vector<std::string> cache(n_tokens);
for (uint32_t id = 0; id < n_tokens; ++id) {
cache[id] = token_to_piece_for_cache(id, true);
size_cache += cache[id].size();
}
std::swap(cache_token_to_piece, cache);
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
}
// Handle per token attributes
//NOTE: Each model customizes per token attributes.
//NOTE: Per token attributes are missing from the GGUF file.
//TODO: Extract attributes from GGUF file.
{
auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
for (const auto & substr : substrs) {
if (str.find(substr) < std::string::npos) {
return true;
}
}
return false;
};
auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
uint32_t current = id_to_token.at(id).attr;
current = value ? (current | attr) : (current & ~attr);
id_to_token[id].attr = (llama_token_attr) current;
};
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
_set_tokenid_attr(token_to_id.at(token), attr, value);
};
std::string model_name;
std::string tokenizer_pre;
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
// model name to lowercase
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
[] (const std::string::value_type x) {
return std::tolower(x);
}
);
// set attributes by model/tokenizer name
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
for (auto id : cache_special_tokens) {
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (const auto * token : {"</s>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
}
}
}
// Return the vocabulary/tokenizer type determined at load time.
enum llama_vocab_type llama_vocab::impl::get_type() const {
    return type;
}

// Human-readable name of the vocabulary type, used for log output.
std::string llama_vocab::impl::type_name() const{
    switch (type) {
        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
        default:                    return "unknown";
    }
}
// Token-attribute predicates: each tests one bit of the token's attr bitfield.
// All of them (except is_eog) require a loaded, non-NONE vocabulary.

bool llama_vocab::impl::is_normal(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
}

bool llama_vocab::impl::is_unknown(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
}

bool llama_vocab::impl::is_control(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
}

bool llama_vocab::impl::is_byte(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
}

bool llama_vocab::impl::is_user_defined(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}

bool llama_vocab::impl::is_unused(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
}

// End-of-generation check: true when the id is valid and belongs to the
// special_eog_ids set built during load.
bool llama_vocab::impl::is_eog(llama_token id) const {
    return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
}
// Decode a byte-type token back into the raw byte it represents.
// For SPM/UGM vocabularies a byte token's text looks like "<0xAB>", so the
// two hex digits at positions 3..4 are parsed. Any other vocab type aborts.
uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
    GGML_ASSERT(is_byte(id));

    const auto & token_data = id_to_token.at(id);
    const auto vocab_type = get_type();

    if (vocab_type == LLAMA_VOCAB_TYPE_SPM || vocab_type == LLAMA_VOCAB_TYPE_UGM) {
        // extract the "AB" from "<0xAB>" and parse it as hexadecimal
        const std::string hex_digits = token_data.text.substr(3, 2);
        return (uint8_t) strtol(hex_digits.c_str(), NULL, 16);
    }

    // BPE/WPM (and anything else) have no byte-token representation
    GGML_ABORT("fatal error");
}
// Return the attribute bitfield of a token (bounds-checked via .at()).
llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token.at(id).attr;
}
// Instantiate the tokenizer implementation matching the vocabulary type.
// Must be called before tokenize()/detokenize(); aborts on unsupported types.
void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);

    if (type == LLAMA_VOCAB_TYPE_SPM) {
        tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
    } else if (type == LLAMA_VOCAB_TYPE_BPE) {
        tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
    } else if (type == LLAMA_VOCAB_TYPE_WPM) {
        tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
    } else if (type == LLAMA_VOCAB_TYPE_UGM) {
        // UGM additionally needs the precompiled charsmap loaded from the model
        tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
    } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
        tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
    } else {
        GGML_ABORT("unsupported vocab type");
    }
}
//
// (de-) tokenize
//
// #define PRETOKENIZERDEBUG
// Partition the fragment buffer around occurrences of special tokens.
// Each RAW_TEXT fragment is scanned for every cached special token; a match
// splits the fragment into [left text][special token][right text], with the
// right remainder re-scanned for further matches of the same token.
// When parse_special == false, control/unknown tokens are skipped (left as
// plain text), while user-defined tokens are still split out.
void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
    // for each special token
    for (const llama_token special_id : cache_special_tokens) {
        const auto & data = vocab.get_token_data(special_id);
        const auto & text = data.text;

        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
            // Ignore control and unknown tokens when parse_special == false
            continue;
            // User-defined tokens are still pre-tokenized before everything else
            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
        }

        // for each text fragment
        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
        while (it != buffer.end()) {
            auto & fragment = (*it);

            // if a fragment is text ( not yet processed )
            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                const auto & raw_text = fragment.raw_text;

                auto raw_text_base_offset = fragment.offset;
                auto raw_text_base_length = fragment.length;

                // loop over the text
                while (true) {
                    // find the first occurrence of a given special token in this fragment
                    // passing offset argument only limit the "search area" but match coordinates
                    // are still relative to the source full raw_text
                    auto match = raw_text.find(text, raw_text_base_offset);

                    // no occurrences found, stop processing this fragment for a given special token
                    if (match == std::string::npos) break;

                    // check if match is within bounds of offset <-> length
                    if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;

#ifdef PRETOKENIZERDEBUG
                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
                    // remember the position of the current fragment so it can be erased later
                    auto source = std::distance(buffer.begin(), it);

                    // if match is further than base offset
                    // then we have some text to the left of it
                    if (match > raw_text_base_offset) {
                        // left
                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
                        int64_t left_reminder_length = match - raw_text_base_offset;

                        // LSTRIP: drop whitespace immediately preceding the special token
                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
                                left_reminder_length--;
                            }
                        }

                        if (left_reminder_length > 0) {
                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
                            it++;
                        }

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
                    }

                    // special token
                    buffer.emplace_after(it, special_id);
                    it++;

                    // right
                    if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
                        int64_t right_reminder_offset = match + text.length();
                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());

                        // RSTRIP: drop whitespace immediately following the special token
                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
                                right_reminder_offset++;
                                right_reminder_length--;
                            }
                        }

                        if (right_reminder_length > 0) {
                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
                            it++;
                        }

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif
                        // erase the original (now fully split) fragment
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                        }

                        // repeat for the right side
                        raw_text_base_offset = right_reminder_offset;
                        raw_text_base_length = right_reminder_length;

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
                    } else {
                        // no right remainder: erase the original fragment and stop
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                        }
                        break;
                    }
                }
            }
            it++;
        }
    }
}
// NOTE: avoid ever using this except for building the token_to_piece caches
// Render one token into a std::string, resizing on demand: a negative result
// from token_to_piece is the required buffer size.
std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache

    int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
    if (n_chars >= 0) {
        // the piece fit into the initial buffer
        piece.resize(n_chars);
        return piece;
    }

    // buffer was too small: -n_chars is the required size, grow and retry
    piece.resize(-n_chars);
    const int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
    GGML_ASSERT(check == -n_chars);

    return piece;
}
// Replace every space with the SentencePiece whitespace marker
// U+2581 LOWER ONE EIGHTH BLOCK (UTF-8: 0xE2 0x96 0x81).
static void llama_escape_whitespace(std::string & text) {
    replace_all(text, " ", "\xe2\x96\x81");
}

// Inverse of llama_escape_whitespace: turn the U+2581 marker back into a space.
static void llama_unescape_whitespace(std::string & word) {
    replace_all(word, "\xe2\x96\x81", " ");
}
// Map BPE byte-encoded text back into raw bytes: each codepoint is converted
// to its UTF-8 form and looked up in the byte table; unknown sequences are
// kept as a readable "[UNK_BYTE_0x..]" marker instead of being dropped.
static std::string llama_decode_text(const std::string & text) {
    std::string out;

    for (const auto cpt : unicode_cpts_from_utf8(text)) {
        const auto utf8 = unicode_cpt_to_utf8(cpt);
        try {
            out += unicode_utf8_to_byte(utf8);
        } catch (const std::out_of_range & /*e*/) {
            // no byte mapping for this sequence: emit a hex marker instead
            out += "[UNK_BYTE_0x";
            for (const auto c : utf8) {
                out += format("%02x", (uint8_t) c);
            }
            out += text + "]";
        }
    }

    return out;
}
// Tokenize raw text into a token sequence.
//
// The text is first partitioned around special tokens (tokenizer_st_partition),
// then each remaining raw-text fragment is fed to the tokenizer matching the
// vocabulary type. add_special controls BOS/EOS/SEP insertion; parse_special
// controls whether control/unknown special tokens are matched in the text.
//
// Fix: the WPM case contained fused/duplicated lines (diff-merge residue such
// as a doubled session declaration and stray closing braces) that did not
// compile; it is reconstructed here to mirror the other cases.
std::vector<llama_token> llama_vocab::impl::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

    std::vector<llama_token> output;
    std::forward_list<fragment_buffer_variant> fragment_buffer;

    if (!raw_text.empty()) {
        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
        tokenizer_st_partition(fragment_buffer, parse_special);
    }

    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
                // OG tokenizer behavior:
                //
                // tokenizer.encode('', add_special_tokens=True) returns [1]
                // tokenizer.encode('', add_special_tokens=False) returns []

                bool is_prev_special = true; // prefix with space if first token

                if (add_special && add_bos) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                    is_prev_special = true;
                }

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text;

                        // prefix with space if previous is special
                        if (add_space_prefix && is_prev_special) {
                            text = ' ';
                        }

                        text += fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        llama_escape_whitespace(text);
                        llm_tokenizer_spm_session session(vocab);
                        session.tokenize(text, output);
                        is_prev_special = false;
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                        is_prev_special = true;
                    }
                }

                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && add_eos) {
                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
                // it calls some other methods that are not exist in llm_tokenizer,
                // here just cast it to bpe tokenizer object
                if (add_special) {
                    session.append_bos(output);
                }
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        session.append(fragment.token, output);
                    }
                }

                if (add_special) {
                    session.append_eos(output);
                    session.check_double_bos_eos(output);
                }
            } break;
        case LLAMA_VOCAB_TYPE_WPM:
            {
                if (add_special) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                }

                llm_tokenizer_wpm_session session(vocab);
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                // WPM closes the sequence with SEP rather than EOS
                if (add_special) {
                    GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_sep_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_UGM:
            {
                if (add_special && add_bos) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                }
                llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && add_eos) {
                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_RWKV:
            {
                // RWKV has no BOS/EOS handling - tokens are emitted as-is
                llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_NONE:
            GGML_ABORT("fatal error");
    }

    return output;
}
// Render a single token into the caller-provided buffer.
// Returns the number of chars written, 0 when the token is suppressed
// (special token with special == false, or unknown id), or a negative value
// whose magnitude is the required buffer size when 'length' is too small.
// Up to 'lstrip' leading spaces of the piece are skipped before copying.
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
    const llama_token_attr attr = token_get_attr(token);
    if (!special && (attr & attr_special)) {
        return 0;
    }

    // copy piece chars to output text buffer
    // skip up to 'lstrip' leading spaces before copying
    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
            token++;
            size--;
        }
        if (length < (int32_t)size) {
            // buffer too small: report required size as a negative count
            return -(int32_t) size;
        }
        memcpy(buf, token, size);
        return (int32_t) size;
    };

    // if we have a cache - use it
    {
        const auto & cache = cache_token_to_piece;
        if (!cache.empty()) {
            const auto & result = cache.at(token);
            return _try_copy(result.data(), result.size());
        }
    }

    // slow path: decode the token text according to the vocab type
    if (0 <= token && token < (int32_t) id_to_token.size()) {
        const std::string & token_text = id_to_token[token].text;
        switch (get_type()) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // normal tokens carry the U+2581 whitespace marker
                    std::string result = token_text;
                    llama_unescape_whitespace(result);
                    return _try_copy(result.data(), result.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                    char byte = (char) token_to_byte(token);
                    return _try_copy((char*) &byte, 1);
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_BPE: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // BPE stores text byte-encoded; decode back to raw bytes
                    std::string result = llama_decode_text(token_text);
                    return _try_copy(result.data(), result.size());
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_RWKV: {
                std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
                // If we don't have enough space, return an error
                if (result.size() > (size_t)length) {
                    return -(int)result.size();
                }
                memcpy(buf, result.data(), result.size());
                return (int)result.size();
            }
            default:
                GGML_ABORT("fatal error");
        }
    }

    return 0;
}
#ifdef PRETOKENIZERDEBUG const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); return cache_token_to_piece.at(token);
#endif }
session.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) int32_t llama_vocab::impl::detokenize(
output.push_back(fragment.token); const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const {
if (type == LLAMA_VOCAB_TYPE_NONE) {
return 0;
}
GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
int32_t avail = text_len_max;
int32_t total = 0;
// remove the leading space
bool remove_space = add_space_prefix;
if (remove_special && add_bos) {
if (n_tokens > 0 && tokens[0] == special_bos_id) {
remove_space = false;
n_tokens--;
tokens++;
} }
} }
if (add_special) { if (remove_special && add_eos) {
GGML_ASSERT(vocab.special_sep_id != -1); if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
output.push_back(vocab.special_sep_id); n_tokens--;
} }
} break;
case LLAMA_VOCAB_TYPE_UGM:
{
if (add_special && vocab.tokenizer_add_bos) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
} }
llm_tokenizer_ugm_session session(vocab);
for (const auto & fragment : fragment_buffer) { for (int32_t i = 0; i < n_tokens; ++i) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { GGML_ASSERT(avail >= 0);
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
#ifdef PRETOKENIZERDEBUG remove_space = false;
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); if (n_chars < 0) {
#endif avail = 0;
session.tokenize(raw_text, output); total -= n_chars;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) } else if (n_chars > 0) {
output.push_back(fragment.token); avail -= n_chars;
text += n_chars;
total += n_chars;
} }
} }
if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { if (total > text_len_max) {
LLAMA_LOG_WARN( return -total;
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
} }
if (add_special && vocab.tokenizer_add_eos) { if (clean_spaces) {
GGML_ASSERT(vocab.special_eos_id != -1); text -= total; // restart text
output.push_back(vocab.special_eos_id);
// first pass: characters ?!., //TODO: where do these characters come from?
const int32_t total1 = total;
total = total ? 1 : 0;
for (int32_t i = 1; i < total1; ++i) {
const char x = text[i];
if (text[i - 1] == ' ') {
if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
total--; // remove space
}
}
text[total++] = x;
} }
} break;
case LLAMA_VOCAB_TYPE_RWKV:
{
llm_tokenizer_rwkv_session session(vocab);
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG // second pass: strip single apostrophe between spaces
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); const int32_t total2 = total;
#endif total = total ? 1 : 0;
for (int32_t i = 1; i < total2; ++i) {
const char x = text[i];
if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
total--; // remove prev space
text[++i] = '\0'; // remove next space
}
text[total++] = x;
}
session.tokenize(raw_text, output); // third pass: apostrophe contractions //NOTE: this makes sense?
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) const int32_t total3 = total;
output.push_back(fragment.token); total = total ? 1 : 0;
for (int32_t i = 1; i < total3; ++i) {
const char x = text[i];
if (text[i - 1] == ' ') {
if (x == '\'' && i + 1 < total3) {
const char x1 = text[i + 1];
if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
//total--; // remove space
} else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
total--; // remove space
} else if (i + 2 < total3) {
const char x2 = text[i + 2];
if ((x1 == 'l' && x2 == 'l')) { // " 'll"
//total--; // remove space
} else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
total--; // remove space
} else {
//total--; // remove space
} }
} else {
//total--; // remove space
}
}
}
text[total++] = x;
} }
} break;
case LLAMA_VOCAB_TYPE_NONE:
GGML_ABORT("fatal error");
} }
return output; return total <= text_len_max ? total : -total;
}
// Log a summary of the loaded vocabulary: type, sizes and every configured
// special token (only tokens that are set, i.e. != LLAMA_TOKEN_NULL).
void llama_vocab::impl::print_info() const {
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
    // special tokens
    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
    // fill-in-the-middle tokens
    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
    // all end-of-generation tokens
    for (const auto & id : special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
    }
    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
// Construct the vocabulary with a fresh pimpl implementation object.
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
}

// Out-of-line destructor so the pimpl smart pointer's deleter sees the
// complete impl type (presumably unique_ptr - declared in the header).
llama_vocab::~llama_vocab() {
}

// Populate the vocabulary from model metadata via the loader.
void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
}
// Thin forwarding wrappers from the public llama_vocab interface to the
// pimpl implementation.

enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
}

enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
    return pimpl->pre_type;
}

// Number of tokens in the vocabulary.
uint32_t llama_vocab::n_tokens() const {
    return (uint32_t) pimpl->id_to_token.size();
}

uint32_t llama_vocab::n_token_types() const {
    return (uint32_t) pimpl->n_token_types;
}

std::string llama_vocab::type_name() const{
    return pimpl->type_name();
}

bool llama_vocab::is_normal(llama_token id) const {
    return pimpl->is_normal(id);
}

bool llama_vocab::is_unknown(llama_token id) const {
    return pimpl->is_unknown(id);
}

bool llama_vocab::is_control(llama_token id) const {
    return pimpl->is_control(id);
}

bool llama_vocab::is_byte(llama_token id) const {
    return pimpl->is_byte(id);
}

bool llama_vocab::is_user_defined(llama_token id) const {
    return pimpl->is_user_defined(id);
}

bool llama_vocab::is_unused(llama_token id) const {
    return pimpl->is_unused(id);
}

bool llama_vocab::is_eog(llama_token id) const {
    return pimpl->is_eog(id);
}
// Public wrapper: decode a byte-type token into the raw byte it represents.
// Fix: the closing line carried a stray duplicated '}' (fused diff-column
// residue) which unbalanced the braces.
uint8_t llama_vocab::token_to_byte(llama_token id) const {
    return pimpl->token_to_byte(id);
}
// Look up the token id corresponding to a raw byte value.
// For SPM/UGM the byte is first searched as "<0xAB>"-style text, falling back
// to the single-character string; WPM/BPE use the unicode byte-to-UTF8 table.
// Fix: this function was garbled diff-merge residue - every line carried the
// old llama_byte_to_token_impl text fused with the new member function; the
// clean right-column version is reconstructed here.
llama_token llama_vocab::byte_to_token(uint8_t ch) const {
    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = pimpl->token_to_id.find(buf);
            if (token != pimpl->token_to_id.end()) {
                return (*token).second;
            }
            // Try to fall back to just the byte as a string
            const char buf2[2] = { (char)ch, 0 };
            return pimpl->token_to_id.at(buf2);
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
        }
        default:
            GGML_ABORT("fatal error");
    }
}
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token) { llama_token llama_vocab::text_to_token(const std::string & text) const {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[token].text.c_str(); auto it = pimpl->token_to_id.find(text);
if (it != pimpl->token_to_id.end()) {
return (*it).second;
}
return LLAMA_TOKEN_NULL;
}
const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return pimpl->id_to_token.at(id);
}
const char * llama_vocab::token_get_text(llama_token id) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return pimpl->id_to_token.at(id).text.c_str();
}
float llama_vocab::token_get_score(llama_token id) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return pimpl->id_to_token.at(id).score;
}
llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
return pimpl->token_get_attr(id);
}
llama_token llama_vocab::token_bos() const {
return pimpl->special_bos_id;
}
llama_token llama_vocab::token_eos() const {
return pimpl->special_eos_id;
}
llama_token llama_vocab::token_eot() const {
return pimpl->special_eot_id;
}
llama_token llama_vocab::token_eom() const {
return pimpl->special_eom_id;
}
llama_token llama_vocab::token_unk() const {
return pimpl->special_unk_id;
}
llama_token llama_vocab::token_sep() const {
return pimpl->special_sep_id;
}
llama_token llama_vocab::token_nl() const {
return pimpl->linefeed_id;
}
llama_token llama_vocab::token_pad() const {
return pimpl->special_pad_id;
}
llama_token llama_vocab::token_prefix() const {
return pimpl->special_fim_pre_id;
}
llama_token llama_vocab::token_middle() const {
return pimpl->special_fim_mid_id;
}
llama_token llama_vocab::token_suffix() const {
return pimpl->special_fim_suf_id;
}
llama_token llama_vocab::token_fim_pre() const {
return pimpl->special_fim_pre_id;
}
llama_token llama_vocab::token_fim_suf() const {
return pimpl->special_fim_suf_id;
}
llama_token llama_vocab::token_fim_mid() const {
return pimpl->special_fim_mid_id;
}
llama_token llama_vocab::token_fim_pad() const {
return pimpl->special_fim_pad_id;
}
llama_token llama_vocab::token_fim_rep() const {
return pimpl->special_fim_rep_id;
}
llama_token llama_vocab::token_fim_sep() const {
return pimpl->special_fim_sep_id;
}
bool llama_vocab::get_add_space_prefix() const {
return pimpl->add_space_prefix;
}
// Whether a BOS token is automatically added during tokenization.
bool llama_vocab::get_add_bos() const {
    return pimpl->add_bos;
}
// Whether an EOS token is automatically added during tokenization.
bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
}
// Whether BPE merges are ignored during tokenization.
bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
}
// Whether detokenization cleans up tokenization spaces.
bool llama_vocab::get_clean_spaces() const {
    return pimpl->clean_spaces;
}
// Whether extra whitespace is removed during tokenization.
bool llama_vocab::get_remove_extra_whitespaces() const {
    return pimpl->remove_extra_whitespaces;
}
// Whether whitespace is escaped during tokenization.
bool llama_vocab::get_escape_whitespaces() const {
    return pimpl->escape_whitespaces;
}
// Whether whitespace is treated as a token suffix rather than a prefix.
bool llama_vocab::get_treat_whitespace_as_suffix() const {
    return pimpl->treat_whitespace_as_suffix;
}
// Length (in bytes) of the longest token text; used to bound longest-token searches.
int llama_vocab::max_token_len() const {
    return pimpl->max_token_len;
}
// Looks up the BPE merge rank for the pair (token_left, token_right).
// Returns the rank if the pair is a known merge, or -1 when it is not.
// Merge keys must never contain spaces or newlines - enforce that invariant.
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
    GGML_ASSERT(token_right.find('\n') == std::string::npos);

    const auto rank = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));

    return rank == pimpl->bpe_ranks.end() ? -1 : rank->second;
}
// C-buffer tokenization front-end.
// Tokenizes [text, text+text_len) and writes the result into tokens.
// Returns the number of tokens written, or the negated required count
// when n_tokens_max is too small (caller can resize and retry).
int32_t llama_vocab::tokenize(
                  const char * text,
                     int32_t   text_len,
                 llama_token * tokens,
                     int32_t   n_tokens_max,
                        bool   add_special,
                        bool   parse_special) const {
    const auto res = tokenize(std::string(text, text_len), add_special, parse_special);

    const int32_t n_found = (int32_t) res.size();
    if (n_found > n_tokens_max) {
        // not enough room - signal the required size as a negative count
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -n_found;
    }

    for (int32_t i = 0; i < n_found; ++i) {
        tokens[i] = res[i];
    }

    return n_found;
}
std::vector<llama_token> llama_vocab::tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special) const {
return pimpl->tokenize(raw_text, add_special, parse_special);
} }
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token) { const std::string & llama_vocab::token_to_piece(llama_token token) const {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return pimpl->token_to_piece(token);
return vocab.id_to_token[token].score;
} }
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) { int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); return pimpl->token_to_piece(token, buf, length, lstrip, special);
return vocab.id_to_token[token].attr;
} }
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { int32_t llama_vocab::detokenize(
return token != -1 && vocab.special_eog_ids.count(token) > 0; const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const {
return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
} }
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) { std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
return llama_is_control_token(vocab, token); std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
text.resize(n_chars);
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return text;
} }
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) { void llama_vocab::print_info() const {
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id; pimpl->print_info();
} }
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { //
return vocab.special_eos_id; // interface implementation
//
int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
return vocab->n_tokens();
} }
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { // deprecated
return vocab.special_eot_id; int32_t llama_n_vocab(const struct llama_vocab * vocab) {
return llama_vocab_n_tokens(vocab);
} }
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
return vocab.special_eom_id; return vocab->get_type();
} }
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) { const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
return vocab.special_cls_id; return vocab->token_get_text(token);
} }
llama_token llama_token_sep_impl(const struct llama_vocab & vocab) { float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
return vocab.special_sep_id; return vocab->token_get_score(token);
} }
llama_token llama_token_nl_impl(const struct llama_vocab & vocab) { enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
return vocab.linefeed_id; return vocab->token_get_attr(token);
} }
llama_token llama_token_pad_impl(const struct llama_vocab & vocab) { bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
return vocab.special_pad_id; return vocab->is_eog(token);
} }
bool llama_add_bos_token_impl(const struct llama_vocab & vocab) { bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
return vocab.tokenizer_add_bos; return vocab->is_control(token);
} }
bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
return vocab.tokenizer_add_eos; return vocab->token_bos();
} }
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
return vocab.special_fim_pre_id; return vocab->token_eos();
} }
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
return vocab.special_fim_mid_id; return vocab->token_eot();
} }
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) { // deprecated
return vocab.special_fim_suf_id; llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
return vocab->token_bos();
} }
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
return vocab.special_fim_pre_id; return vocab->token_sep();
} }
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
return vocab.special_fim_suf_id; return vocab->token_nl();
} }
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
return vocab.special_fim_mid_id; return vocab->token_pad();
} }
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) { bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
return vocab.special_fim_pad_id; return vocab->get_add_bos();
} }
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) { bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
return vocab.special_fim_rep_id; return vocab->get_add_eos();
} }
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) { llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
return vocab.special_fim_sep_id; return vocab->token_fim_pre();
} }
int32_t llama_tokenize_impl( llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
const struct llama_vocab & vocab, return vocab->token_fim_suf();
const char * text, }
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) {
auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size());
}
for (size_t i = 0; i < res.size(); i++) { llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
tokens[i] = res[i]; return vocab->token_fim_mid();
} }
return res.size(); llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
return vocab->token_fim_pad();
} }
static std::string llama_decode_text(const std::string & text) { llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
std::string decoded_text; return vocab->token_fim_rep();
}
const auto cpts = unicode_cpts_from_utf8(text); llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
for (const auto cpt : cpts) { return vocab->token_fim_sep();
const auto utf8 = unicode_cpt_to_utf8(cpt); }
try {
decoded_text += unicode_utf8_to_byte(utf8);
} catch (const std::out_of_range & /*e*/) {
decoded_text += "[UNK_BYTE_0x";
for (const auto c : utf8) {
decoded_text += format("%02x", (uint8_t) c);
}
decoded_text += text + "]";
}
}
return decoded_text; // deprecated
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
return llama_vocab_get_text(vocab, token);
} }
// does not write null-terminator to buf // deprecated
int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) { float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843 return llama_vocab_get_score(vocab, token);
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL; }
const llama_token_attr attr = llama_token_get_attr_impl(vocab, token);
if (!special && (attr & attr_special)) {
return 0;
}
// copy piece chars to output text buffer // deprecated
// skip up to 'lstrip' leading spaces before copying enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
auto _try_copy = [=] (const char * token, size_t size) -> int32_t { return llama_vocab_get_attr(vocab, token);
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { }
token++;
size--;
}
if (length < (int32_t)size) {
return -(int32_t) size;
}
memcpy(buf, token, size);
return (int32_t) size;
};
// if we have a cache - use it // deprecated
{ bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
const auto & cache = vocab.cache_token_to_piece; return llama_vocab_is_eog(vocab, token);
}
if (!cache.empty()) { // deprecated
const auto & result = cache.at(token); bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
return _try_copy(result.data(), result.size()); return llama_vocab_is_control(vocab, token);
} }
}
if (0 <= token && token < (int32_t) vocab.id_to_token.size()) { // deprecated
const std::string & token_text = vocab.id_to_token[token].text; llama_token llama_token_bos(const struct llama_vocab * vocab) {
switch (llama_vocab_get_type(vocab)) { return llama_vocab_bos(vocab);
case LLAMA_VOCAB_TYPE_WPM: }
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
std::string result = token_text;
llama_unescape_whitespace(result);
return _try_copy(result.data(), result.size());
}
if (attr & LLAMA_TOKEN_ATTR_BYTE) {
char byte = (char) llama_token_to_byte(vocab, token);
return _try_copy((char*) &byte, 1);
}
break;
}
case LLAMA_VOCAB_TYPE_BPE: {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
return _try_copy(token_text.data(), token_text.size());
}
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
std::string result = llama_decode_text(token_text);
return _try_copy(result.data(), result.size());
}
break;
}
case LLAMA_VOCAB_TYPE_RWKV: {
std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
// If we don't have enough space, return an error // deprecated
if (result.size() > (size_t)length) { llama_token llama_token_eos(const struct llama_vocab * vocab) {
return -(int)result.size(); return llama_vocab_eos(vocab);
} }
memcpy(buf, result.data(), result.size()); // deprecated
return (int)result.size(); llama_token llama_token_eot(const struct llama_vocab * vocab) {
} return llama_vocab_eot(vocab);
default: }
GGML_ABORT("fatal error");
}
}
return 0; // deprecated
llama_token llama_token_cls(const struct llama_vocab * vocab) {
//return llama_vocab_cls(vocab);
return llama_vocab_bos(vocab); // avoid deprecation warning
} }
int32_t llama_detokenize_impl( // deprecated
const struct llama_vocab & vocab, llama_token llama_token_sep(const struct llama_vocab * vocab) {
const llama_token * tokens, return llama_vocab_sep(vocab);
int32_t n_tokens, }
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) {
if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
return 0;
}
GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first."); // deprecated
llama_token llama_token_nl (const struct llama_vocab * vocab) {
return llama_vocab_nl(vocab);
}
int32_t avail = text_len_max; // deprecated
int32_t total = 0; llama_token llama_token_pad(const struct llama_vocab * vocab) {
return llama_vocab_pad(vocab);
}
// remove the leading space // deprecated
bool remove_space = vocab.tokenizer_add_space_prefix; bool llama_add_bos_token(const struct llama_vocab * vocab) {
return llama_vocab_get_add_bos(vocab);
}
if (remove_special && vocab.tokenizer_add_bos) { // deprecated
if (n_tokens > 0 && tokens[0] == vocab.special_bos_id) { bool llama_add_eos_token(const struct llama_vocab * vocab) {
remove_space = false; return llama_vocab_get_add_eos(vocab);
n_tokens--; }
tokens++;
}
}
if (remove_special && vocab.tokenizer_add_eos) { // deprecated
if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) { llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
n_tokens--; return llama_vocab_fim_pre(vocab);
} }
}
for (int32_t i = 0; i < n_tokens; ++i) { // deprecated
GGML_ASSERT(avail >= 0); llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
int32_t n_chars = llama_token_to_piece_impl(vocab, tokens[i], text, avail, remove_space, unparse_special); return llama_vocab_fim_suf(vocab);
remove_space = false; }
if (n_chars < 0) {
avail = 0;
total -= n_chars;
} else if (n_chars > 0) {
avail -= n_chars;
text += n_chars;
total += n_chars;
}
}
if (total > text_len_max) { // deprecated
return -total; llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
} return llama_vocab_fim_mid(vocab);
}
if (vocab.tokenizer_clean_spaces) { // deprecated
text -= total; // restart text llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
return llama_vocab_fim_pad(vocab);
}
// first pass: characters ?!., //TODO: where do these characters come from? // deprecated
const int32_t total1 = total; llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
total = total ? 1 : 0; return llama_vocab_fim_rep(vocab);
for (int32_t i = 1; i < total1; ++i) { }
const char x = text[i];
if (text[i - 1] == ' ') {
if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
total--; // remove space
}
}
text[total++] = x;
}
// second pass: strip single apostrophe between spaces // deprecated
const int32_t total2 = total; llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
total = total ? 1 : 0; return llama_vocab_fim_sep(vocab);
for (int32_t i = 1; i < total2; ++i) { }
const char x = text[i];
if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
total--; // remove prev space
text[++i] = '\0'; // remove next space
}
text[total++] = x;
}
// third pass: apostrophe contractions //NOTE: this makes sense? //
const int32_t total3 = total; // tokenization
total = total ? 1 : 0; //
for (int32_t i = 1; i < total3; ++i) {
const char x = text[i];
if (text[i - 1] == ' ') {
if (x == '\'' && i + 1 < total3) {
const char x1 = text[i + 1];
if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
//total--; // remove space
} else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
total--; // remove space
} else if (i + 2 < total3) {
const char x2 = text[i + 2];
if ((x1 == 'l' && x2 == 'l')) { // " 'll"
//total--; // remove space
} else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
total--; // remove space
} else {
//total--; // remove space
}
} else {
//total--; // remove space
}
}
}
text[total++] = x;
}
}
return total <= text_len_max ? total : -total; int32_t llama_tokenize(
const struct llama_vocab * vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) {
return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
} }
std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) { int32_t llama_token_to_piece(
std::string text; const struct llama_vocab * vocab,
text.resize(std::max(text.capacity(), tokens.size())); llama_token token,
int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); char * buf,
if (n_chars < 0) { int32_t length,
text.resize(-n_chars); int32_t lstrip,
n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); bool special) {
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization return vocab->token_to_piece(token, buf, length, lstrip, special);
} }
text.resize(n_chars);
// NOTE: the original tokenizer decodes bytes after collecting the pieces. int32_t llama_detokenize(
return text; const struct llama_vocab * vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) {
return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
} }
...@@ -4,179 +4,122 @@ ...@@ -4,179 +4,122 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_map> #include <memory>
#include <map>
#include <set>
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
}
}
struct llm_tokenizer;
struct llama_vocab { struct LLM_KV;
using id = llama_token; struct llama_model_loader;
using token = std::string;
using tattr = llama_token_attr;
struct llama_vocab {
struct token_data { struct token_data {
token text; std::string text;
float score; float score;
tattr attr; llama_token_attr attr;
}; };
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab llama_vocab();
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llm_tokenizer * tokenizer = nullptr;
llama_vocab() = default;
~llama_vocab(); ~llama_vocab();
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const; void load(llama_model_loader & ml, const LLM_KV & kv);
void init_tokenizer(); enum llama_vocab_type get_type() const;
}; enum llama_vocab_pre_type get_pre_type() const;
// uint32_t n_tokens() const;
// internal API uint32_t n_token_types() const;
//
// TODO: rename to llama_tokenize_impl std::string type_name() const;
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal( bool is_normal (llama_token id) const;
const llama_vocab & vocab, bool is_unknown (llama_token id) const;
std::string raw_text, bool is_control (llama_token id) const;
bool add_special, bool is_byte (llama_token id) const;
bool parse_special = false); bool is_user_defined(llama_token id) const;
bool is_unused (llama_token id) const;
bool is_eog (llama_token id) const;
// TODO: move the API below as member functions of llama_vocab uint8_t token_to_byte(llama_token id) const;
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch); llama_token byte_to_token(uint8_t ch) const;
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token); llama_token text_to_token(const std::string & text) const;
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token); const token_data & get_token_data(llama_token id) const;
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token); const char * token_get_text (llama_token id) const;
float token_get_score(llama_token id) const;
llama_token_attr token_get_attr (llama_token id) const;
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token); llama_token token_bos() const;
llama_token token_eos() const;
llama_token token_eot() const;
llama_token token_eom() const;
llama_token token_unk() const;
llama_token token_sep() const;
llama_token token_nl () const;
llama_token token_pad() const;
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token); llama_token token_prefix() const;
llama_token token_middle() const;
llama_token token_suffix() const;
llama_token llama_token_bos_impl(const struct llama_vocab & vocab); llama_token token_fim_pre() const;
llama_token llama_token_eos_impl(const struct llama_vocab & vocab); llama_token token_fim_suf() const;
llama_token llama_token_eot_impl(const struct llama_vocab & vocab); llama_token token_fim_mid() const;
llama_token llama_token_eom_impl(const struct llama_vocab & vocab); llama_token token_fim_pad() const;
llama_token llama_token_cls_impl(const struct llama_vocab & vocab); llama_token token_fim_rep() const;
llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token token_fim_sep() const;
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); bool get_add_space_prefix () const;
llama_token llama_token_middle_impl(const struct llama_vocab & vocab); bool get_add_bos () const;
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); bool get_add_eos () const;
bool get_ignore_merges () const;
bool get_clean_spaces () const;
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab); int max_token_len() const;
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab); int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
int32_t llama_tokenize_impl( int32_t tokenize(
const struct llama_vocab & vocab,
const char * text, const char * text,
int32_t text_len, int32_t text_len,
llama_token * tokens, llama_token * tokens,
int32_t n_tokens_max, int32_t n_tokens_max,
bool add_special, bool add_special,
bool parse_special); bool parse_special) const;
std::vector<llama_token> tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special = false) const;
// does not write null-terminator to buf // does not write null-terminator to buf
int32_t llama_token_to_piece_impl( int32_t token_to_piece(
const struct llama_vocab & vocab,
llama_token token, llama_token token,
char * buf, char * buf,
int32_t length, int32_t length,
int32_t lstrip, int32_t lstrip,
bool special); bool special) const;
// check if token0 is contained as a prefix in token1 // use cached data
bool llama_token_is_prefix_impl( const std::string & token_to_piece(llama_token token) const;
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl( int32_t detokenize(
const struct llama_vocab & vocab,
const llama_token * tokens, const llama_token * tokens,
int32_t n_tokens, int32_t n_tokens,
char * text, char * text,
int32_t text_len_max, int32_t text_len_max,
bool remove_special, bool remove_special,
bool unparse_special); bool unparse_special) const;
std::string llama_detokenize( std::string detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens, const std::vector<llama_token> & tokens,
bool special); bool special) const;
void print_info() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
};
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -12,18 +12,17 @@ ...@@ -12,18 +12,17 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <codecvt>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <locale>
#include <map> #include <map>
#include <regex> #include <regex>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <locale>
#include <codecvt>
size_t unicode_len_utf8(char src) { size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
...@@ -641,8 +640,15 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) { ...@@ -641,8 +640,15 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.reserve(utf8.size()); result.reserve(utf8.size());
size_t offset = 0; size_t offset = 0;
while (offset < utf8.size()) { while (offset < utf8.size()) {
try {
result.push_back(unicode_cpt_from_utf8(utf8, offset)); result.push_back(unicode_cpt_from_utf8(utf8, offset));
} }
catch (const std::invalid_argument & /*ex*/) {
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}
}
return result; return result;
} }
...@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std ...@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
const auto cpts = unicode_cpts_from_utf8(text); const auto cpts = unicode_cpts_from_utf8(text);
// generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
// ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
std::string text_collapsed; std::string text_collapsed;
if (need_collapse) { if (need_collapse) {
// collapse all unicode categories // collapse all unicode categories
......
...@@ -14,6 +14,7 @@ package llama ...@@ -14,6 +14,7 @@ package llama
#include "llama.h" #include "llama.h"
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
#include "gguf.h"
#include "mllama.h" #include "mllama.h"
#include "sampling_ext.h" #include "sampling_ext.h"
...@@ -293,29 +294,29 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) { ...@@ -293,29 +294,29 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
} }
func (m *Model) NumVocab() int { func (m *Model) NumVocab() int {
return int(C.llama_n_vocab(m.c)) return int(C.llama_n_vocab(m.Vocab()))
} }
func (m *Model) TokenIsEog(token int) bool { func (m *Model) TokenIsEog(token int) bool {
return bool(C.llama_token_is_eog(m.c, C.llama_token(token))) return bool(C.llama_token_is_eog(m.Vocab(), C.llama_token(token)))
} }
func (m *Model) AddBOSToken() bool { func (m *Model) AddBOSToken() bool {
return bool(C.llama_add_bos_token(m.c)) return bool(C.llama_add_bos_token(m.Vocab()))
} }
func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float32, threads int) error { func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float32, threads int) error {
cLoraPath := C.CString(loraPath) cLoraPath := C.CString(loraPath)
defer C.free(unsafe.Pointer(cLoraPath)) defer C.free(unsafe.Pointer(cLoraPath))
loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath) loraAdapter := C.llama_adapter_lora_init(m.c, cLoraPath)
if loraAdapter == nil { if loraAdapter == nil {
return errors.New("unable to load lora") return errors.New("unable to load lora")
} }
err := -1 err := -1
if loraAdapter != nil { if loraAdapter != nil {
err = int(C.llama_lora_adapter_set(context.c, loraAdapter, C.float(scale))) err = int(C.llama_set_adapter_lora(context.c, loraAdapter, C.float(scale)))
} }
if err != 0 { if err != 0 {
return errors.New("error applying lora from file") return errors.New("error applying lora from file")
...@@ -324,6 +325,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float ...@@ -324,6 +325,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
return nil return nil
} }
func (m *Model) Vocab() *C.struct_llama_vocab {
return C.llama_model_get_vocab(m.c)
}
type Batch struct { type Batch struct {
c C.struct_llama_batch c C.struct_llama_batch
batchSize int batchSize int
...@@ -414,7 +419,7 @@ func (m *Model) TokenToPiece(token int) string { ...@@ -414,7 +419,7 @@ func (m *Model) TokenToPiece(token int) string {
tokenLen := 12 tokenLen := 12
buf := make([]byte, tokenLen) buf := make([]byte, tokenLen)
tokenLen = int(C.llama_token_to_piece( tokenLen = int(C.llama_token_to_piece(
m.c, m.Vocab(),
C.int32_t(token), C.int32_t(token),
(*C.char)(unsafe.Pointer(&buf[0])), (*C.char)(unsafe.Pointer(&buf[0])),
C.int32_t(tokenLen), C.int32_t(tokenLen),
...@@ -426,7 +431,7 @@ func (m *Model) TokenToPiece(token int) string { ...@@ -426,7 +431,7 @@ func (m *Model) TokenToPiece(token int) string {
buf = make([]byte, tokenLen) buf = make([]byte, tokenLen)
C.llama_token_to_piece( C.llama_token_to_piece(
m.c, m.Vocab(),
C.int32_t(token), C.int32_t(token),
(*C.char)(unsafe.Pointer(&buf[0])), (*C.char)(unsafe.Pointer(&buf[0])),
C.int32_t(tokenLen), C.int32_t(tokenLen),
...@@ -444,7 +449,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int ...@@ -444,7 +449,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
defer C.free(unsafe.Pointer(cText)) defer C.free(unsafe.Pointer(cText))
result := C.llama_tokenize( result := C.llama_tokenize(
m.c, m.Vocab(),
cText, cText,
C.int32_t(len(text)), C.int32_t(len(text)),
&cTokens[0], &cTokens[0],
...@@ -458,7 +463,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int ...@@ -458,7 +463,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
maxTokens = int(-result) maxTokens = int(-result)
cTokens = make([]C.llama_token, maxTokens) cTokens = make([]C.llama_token, maxTokens)
result = C.llama_tokenize( result = C.llama_tokenize(
m.c, m.Vocab(),
cText, cText,
C.int32_t(len(text)), C.int32_t(len(text)),
&cTokens[0], &cTokens[0],
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml.h" #include "ggml.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
#include "ggml-cuda.h" #include "ggml-cuda.h"
......
...@@ -10,7 +10,7 @@ Subject: [PATCH] cuda ...@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
3 files changed, 2 insertions(+), 1 deletion(-) 3 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index e2d6c405..a12172dc 100644 index dba7be33..1ca40b2c 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
...@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644 ...@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 0b06be72..be29e979 100644 index ebb2ccae..b094929b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context { @@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644 ...@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index a85502ee..cd8ef741 100644 index c550142a..fd9a4e77 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
} }
free(ctx); free(ctx);
......
...@@ -4,17 +4,17 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700 ...@@ -4,17 +4,17 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer Subject: [PATCH] pretokenizer
--- ---
src/llama-model.cpp | 14 +++----------- src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 405e0528..00b80c52 100644 index ad9ffe66..a4eee9b8 100644
--- a/src/llama-model.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-vocab.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) { @@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { if (type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false; add_space_prefix = false;
vocab.tokenizer_clean_spaces = true; clean_spaces = true;
- if (tokenizer_pre.empty()) { - if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); - LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__);
...@@ -23,19 +23,19 @@ index 405e0528..00b80c52 100644 ...@@ -23,19 +23,19 @@ index 405e0528..00b80c52 100644
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__); - LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__); - LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") { - } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") { + if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) { @@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") { tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} }
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { } else if (type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
...@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings ...@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
2 files changed, 5 insertions(+), 3 deletions(-) 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..b9c4a5bf 100644 index 671d2a81..47e79ed4 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { @@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
...@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644 ...@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
diff --git a/src/llama.cpp b/src/llama.cpp diff --git a/src/llama.cpp b/src/llama.cpp
index ea78ea48..4eb3f6b9 100644 index 607f2786..ac85bfed 100644
--- a/src/llama.cpp --- a/src/llama.cpp
+++ b/src/llama.cpp +++ b/src/llama.cpp
@@ -10876,7 +10876,6 @@ static int llama_decode_internal( @@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr; res = nullptr;
embd = nullptr; embd = nullptr;
} else if (cparams.embeddings) { } else if (cparams.embeddings) {
...@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644 ...@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
embd = nullptr; embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -10884,12 +10883,15 @@ static int llama_decode_internal( @@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break; break;
} }
} }
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] clip-unicode ...@@ -8,10 +8,10 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-) 1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..b3c1829f 100644 index 76d4a785..205af1eb 100644
--- a/examples/llava/clip.cpp --- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@ @@ -58,6 +58,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) # define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF) #endif // defined(LLAVA_LOG_OFF)
...@@ -31,7 +31,7 @@ index 3cd0d2fa..b3c1829f 100644 ...@@ -31,7 +31,7 @@ index 3cd0d2fa..b3c1829f 100644
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image // RGB uint8 image
@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { @@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
} }
...@@ -62,7 +62,7 @@ index 3cd0d2fa..b3c1829f 100644 ...@@ -62,7 +62,7 @@ index 3cd0d2fa..b3c1829f 100644
if (!fin) { if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n"); LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip); clip_free(new_clip);
@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { @@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }
......
...@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional ...@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration. tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv) in general, the values are (bskcn_tv, 1 - bskcn_tv)
--- ---
src/llama-arch.cpp | 53 +++++++---- src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 + src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++ src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 + src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 + src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++ src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 + src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++ src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
8 files changed, 258 insertions(+), 16 deletions(-) 8 files changed, 236 insertions(+), 1 deletion(-)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644 index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
...@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644 ...@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, + { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
}, },
}, },
+ { + {
...@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644 ...@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ }, + },
+ }, + },
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_WAVTOKENIZER_DEC,
{ {
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644 ...@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644 index 122fdceb..77919578 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch { @@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
...@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644 ...@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@@ -126,6 +127,7 @@ enum llm_kv { @@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
...@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644 ...@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor { @@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
...@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644 ...@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644 index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const { @@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
...@@ -152,10 +120,10 @@ index c4053469..450738da 100644 ...@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+} +}
\ No newline at end of file \ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644 index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams { @@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644 ...@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams { @@ -133,6 +135,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings // dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const; uint32_t n_embd_v_s() const;
...@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644 ...@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644 index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta { @@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better // TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { llama_model_loader::llama_model_loader(
int trace = 0; const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644 index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { @@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: model.type = e_model::MODEL_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
+ case LLM_ARCH_SOLAR: + case LLM_ARCH_SOLAR:
...@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644 ...@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) { + for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i]; + auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0); + bskcn.fill(0);
+ auto kv = LLM_KV(model.arch); + auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false); + ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ } + }
+ +
+ switch (hparams.n_layer) { + switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break; + case 64: type = LLM_TYPE_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN; + default: type = LLM_TYPE_UNKNOWN;
+ } + }
+ } break; + } break;
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { @@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644 ...@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break; + } break;
+ case LLM_ARCH_SOLAR: + case LLM_ARCH_SOLAR:
+ { + {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ +
+ // output + // output
+ { + {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ } + }
+ +
+ for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i]; + auto & layer = layers[i];
+ +
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ +
...@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644 ...@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ +
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context { @@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
return gf; case LLM_ARCH_GRANITE:
} case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() { + ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+ +
+ // mutable variable, needed during the last layer of the computation to skip unused tokens + // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens; + int32_t n_tokens = this->n_tokens;
...@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644 ...@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)), + ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv)))); + ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ } + }
+
+ // norm + // norm
+ cur = llm_build_norm(ctx0, inpL, hparams, + cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, NULL,
...@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644 ...@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ } + }
+ +
+ cur = inpL; + cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL, + model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1); + cb(cur, "result_norm", -1);
+
+ // lm_head + // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1); + cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur); + ggml_build_forward_expand(gf, cur);
+ return gf;
+ return gf; }
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph( @@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_chameleon(); result = llm.build_chameleon();
} break; } break;
......
...@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn ...@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index be29e979..aaa79ea4 100644 index b094929b..36165840 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst); ggml_cuda_op_argsort(ctx, dst);
break; break;
......
...@@ -15,27 +15,27 @@ remaining is to implement the cross attention mask ...@@ -15,27 +15,27 @@ remaining is to implement the cross attention mask
examples/llava/llava.cpp | 5 +- examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +- ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 + include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++ src/llama-arch.cpp | 44 ++++++
src/llama-arch.h | 10 ++ src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 + src/llama-batch.cpp | 3 +
src/llama-context.cpp | 19 ++- src/llama-context.cpp | 28 ++--
src/llama-context.h | 2 + src/llama-context.h | 2 +
src/llama-cparams.h | 1 + src/llama-cparams.h | 1 +
src/llama-hparams.cpp | 8 +- src/llama-hparams.cpp | 6 +
src/llama-hparams.h | 4 + src/llama-hparams.h | 5 +
src/llama-kv-cache.cpp | 33 ++++ src/llama-kv-cache.cpp | 13 +-
src/llama-model-loader.cpp | 2 + src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 59 ++----- src/llama-model.cpp | 65 ++++++++-
src/llama-model.h | 51 ++++++ src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +- src/llama-quant.cpp | 4 +-
src/llama.cpp | 307 +++++++++++++++++++++++++++++++++- src/llama.cpp | 262 +++++++++++++++++++++++++++++++++-
17 files changed, 508 insertions(+), 56 deletions(-) 17 files changed, 452 insertions(+), 22 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 16f30c56..0f0f3f62 100644 index 518aad3f..f0e484a1 100644
--- a/examples/llava/llava.cpp --- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp
@@ -429,7 +429,7 @@ struct llava_embd_batch { @@ -445,7 +445,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids; std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits; std::vector<int8_t> logits;
llama_batch batch; llama_batch batch;
...@@ -44,7 +44,7 @@ index 16f30c56..0f0f3f62 100644 ...@@ -44,7 +44,7 @@ index 16f30c56..0f0f3f62 100644
pos .resize(n_tokens); pos .resize(n_tokens);
n_seq_id.resize(n_tokens); n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1); seq_ids .resize(n_tokens + 1);
@@ -441,6 +441,7 @@ struct llava_embd_batch { @@ -457,6 +457,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ embd, /*embd =*/ embd,
...@@ -52,7 +52,7 @@ index 16f30c56..0f0f3f62 100644 ...@@ -52,7 +52,7 @@ index 16f30c56..0f0f3f62 100644
/*pos =*/ pos.data(), /*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(), /*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(), /*seq_id =*/ seq_ids.data(),
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ @@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch; n_eval = n_batch;
} }
float * embd = image_embed->embed+i*n_embd; float * embd = image_embed->embed+i*n_embd;
...@@ -62,7 +62,7 @@ index 16f30c56..0f0f3f62 100644 ...@@ -62,7 +62,7 @@ index 16f30c56..0f0f3f62 100644
LOG_ERR("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 7ddd178b..899d16f2 100644 index 955ed505..95036ef8 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry { @@ -171,9 +171,9 @@ struct ggml_backend_registry {
...@@ -79,10 +79,10 @@ index 7ddd178b..899d16f2 100644 ...@@ -79,10 +79,10 @@ index 7ddd178b..899d16f2 100644
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
diff --git a/include/llama.h b/include/llama.h diff --git a/include/llama.h b/include/llama.h
index a0d5ba5d..9f411960 100644 index 47919602..cc948005 100644
--- a/include/llama.h --- a/include/llama.h
+++ b/include/llama.h +++ b/include/llama.h
@@ -250,6 +250,7 @@ extern "C" { @@ -249,6 +249,7 @@ extern "C" {
llama_token * token; llama_token * token;
float * embd; float * embd;
...@@ -90,7 +90,7 @@ index a0d5ba5d..9f411960 100644 ...@@ -90,7 +90,7 @@ index a0d5ba5d..9f411960 100644
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
@@ -347,6 +348,7 @@ extern "C" { @@ -343,6 +344,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings bool no_perf; // whether to measure performance timings
...@@ -98,9 +98,9 @@ index a0d5ba5d..9f411960 100644 ...@@ -98,9 +98,9 @@ index a0d5ba5d..9f411960 100644
// Abort callback // Abort callback
// if it returns true, execution of llama_decode() will be aborted // if it returns true, execution of llama_decode() will be aborted
@@ -426,6 +428,10 @@ extern "C" { @@ -443,6 +445,10 @@ extern "C" {
struct llama_model * model, struct llama_context_params params),
struct llama_context_params params); "use llama_init_from_model instead");
+ // TODO (jmorganca): this should most likely be passed in as part of a batch + // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches. + // and not set on the context for all batches.
...@@ -110,7 +110,7 @@ index a0d5ba5d..9f411960 100644 ...@@ -110,7 +110,7 @@ index a0d5ba5d..9f411960 100644
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5b376c5e..b35aeb31 100644 index a1e0ebcc..b6f20286 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
...@@ -121,7 +121,7 @@ index 5b376c5e..b35aeb31 100644 ...@@ -121,7 +121,7 @@ index 5b376c5e..b35aeb31 100644
{ LLM_ARCH_DECI, "deci" }, { LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GROK, "grok" },
@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
...@@ -129,7 +129,7 @@ index 5b376c5e..b35aeb31 100644 ...@@ -129,7 +129,7 @@ index 5b376c5e..b35aeb31 100644
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
}, },
}, },
...@@ -170,7 +170,7 @@ index 5b376c5e..b35aeb31 100644 ...@@ -170,7 +170,7 @@ index 5b376c5e..b35aeb31 100644
{ {
LLM_ARCH_DECI, LLM_ARCH_DECI,
{ {
@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...@@ -186,7 +186,7 @@ index 5b376c5e..b35aeb31 100644 ...@@ -186,7 +186,7 @@ index 5b376c5e..b35aeb31 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index eac7055b..e8235ae0 100644 index 77919578..ec742224 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -10,6 +10,7 @@ @@ -10,6 +10,7 @@
...@@ -197,7 +197,7 @@ index eac7055b..e8235ae0 100644 ...@@ -197,7 +197,7 @@ index eac7055b..e8235ae0 100644
LLM_ARCH_DECI, LLM_ARCH_DECI,
LLM_ARCH_FALCON, LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
@@ -128,6 +129,7 @@ enum llm_kv { @@ -131,6 +132,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...@@ -205,7 +205,7 @@ index eac7055b..e8235ae0 100644 ...@@ -205,7 +205,7 @@ index eac7055b..e8235ae0 100644
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -308,6 +310,14 @@ enum llm_tensor { @@ -314,6 +316,14 @@ enum llm_tensor {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV, LLM_TENSOR_BSKCN_TV,
...@@ -249,10 +249,10 @@ index 01d5ca57..8682b0e6 100644 ...@@ -249,10 +249,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
} }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b9c4a5bf..9d0e7ca3 100644 index 47e79ed4..7b22fe13 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { @@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
} }
if (ubatch.embd) { if (ubatch.embd) {
...@@ -275,7 +275,30 @@ index b9c4a5bf..9d0e7ca3 100644 ...@@ -275,7 +275,30 @@ index b9c4a5bf..9d0e7ca3 100644
} }
if (ubatch.pos && lctx.inp_pos) { if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { @@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
- const auto & vocab = lctx.model.vocab;
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens();
+ const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
void llama_output_reorder(struct llama_context & ctx) {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.vocab.n_tokens();
+ const uint32_t n_vocab = ctx.model.hparams.n_vocab;
const uint32_t n_embd = ctx.model.hparams.n_embd;
const int32_t n_outputs = ctx.n_outputs;
@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn; ctx->cparams.causal_attn = causal_attn;
} }
...@@ -286,8 +309,26 @@ index b9c4a5bf..9d0e7ca3 100644 ...@@ -286,8 +309,26 @@ index b9c4a5bf..9d0e7ca3 100644
void llama_synchronize(struct llama_context * ctx) { void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get()); ggml_backend_sched_synchronize(ctx->sched.get());
@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}
- return ctx->logits + j*ctx->model.vocab.n_tokens();
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -886,7 +898,7 @@ struct llama_data_write {
}
void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
write(&logits_size, sizeof(logits_size));
diff --git a/src/llama-context.h b/src/llama-context.h diff --git a/src/llama-context.h b/src/llama-context.h
index 0d163c47..4980a60e 100644 index a9268b29..cf12c9d7 100644
--- a/src/llama-context.h --- a/src/llama-context.h
+++ b/src/llama-context.h +++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context { @@ -107,6 +107,8 @@ struct llama_context {
...@@ -312,7 +353,7 @@ index 252012f3..9681e5a0 100644 ...@@ -312,7 +353,7 @@ index 252012f3..9681e5a0 100644
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 450738da..42f8a58f 100644 index f3955de9..0b841028 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@ @@ -2,6 +2,8 @@
...@@ -328,18 +369,25 @@ index 450738da..42f8a58f 100644 ...@@ -328,18 +369,25 @@ index 450738da..42f8a58f 100644
} }
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
-}
\ No newline at end of file
+} +}
+ +
+bool llama_hparams::cross_attention_layers(uint32_t il) const { +bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); + return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+} }
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index fd898e27..f826cd9a 100644 index 1bdcdfd5..05383046 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -53,6 +53,7 @@ struct llama_hparams { @@ -41,6 +41,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -51,6 +52,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {}; std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
...@@ -347,7 +395,7 @@ index fd898e27..f826cd9a 100644 ...@@ -347,7 +395,7 @@ index fd898e27..f826cd9a 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
@@ -139,6 +140,9 @@ struct llama_hparams { @@ -138,6 +140,9 @@ struct llama_hparams {
// Block skip connection // Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const; bool n_bskcn(uint32_t n, uint32_t il) const;
...@@ -358,54 +406,34 @@ index fd898e27..f826cd9a 100644 ...@@ -358,54 +406,34 @@ index fd898e27..f826cd9a 100644
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 53379253..cf814dbe 100644 index feffdf0d..b541c5a3 100644
--- a/src/llama-kv-cache.cpp --- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp
@@ -72,6 +72,39 @@ bool llama_kv_cache_init( @@ -91,8 +91,17 @@ bool llama_kv_cache_init(
cache.v_l.reserve(n_layer); return false;
}
for (int i = 0; i < n_layer; i++) { - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_tensor * k, *v;
+
+ // for cross attention layers + // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { + if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ const llama_model::buft_list_t * buft_list; + v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else { + } else {
+ buft_list = &model.cpu_buft_list; + k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ } + v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ continue;
+ } + }
+ +
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); ggml_format_name(k, "cache_k_l%d", i);
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 422524a8..b12d6566 100644 index 1252aca1..45d08721 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -240,6 +240,8 @@ namespace GGUFMeta { @@ -315,6 +315,8 @@ namespace GGUFMeta {
return true; return true;
} }
...@@ -415,80 +443,47 @@ index 422524a8..b12d6566 100644 ...@@ -415,80 +443,47 @@ index 422524a8..b12d6566 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str()); const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 306c557d..4f9bbf90 100644 index ad1315c6..21819080 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) { @@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
return llama_model_ftype_name(model.ftype);
} // get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
-template<typename F> + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = { // everything past this point is not vocab-related
- /*.mem_size =*/ ggml_tensor_overhead()*8, if (hparams.vocab_only) {
- /*.mem_buffer =*/ NULL, @@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
- /*.no_alloc =*/ true, ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
- }; ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
- ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
- ggml_context_ptr ctx { ggml_init(params) }; + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context")); if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
- } ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
- @@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); + std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); + ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr; hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { @@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) { - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) { @@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} }
} }
} break; } break;
...@@ -497,145 +492,44 @@ index 306c557d..4f9bbf90 100644 ...@@ -497,145 +492,44 @@ index 306c557d..4f9bbf90 100644
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ +
+ switch (hparams.n_layer) { + switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_11B; break; + case 40: type = LLM_TYPE_11B; break;
+ case 100: model.type = e_model::MODEL_90B; break; + case 100: type = LLM_TYPE_90B; break;
+ default: model.type = e_model::MODEL_UNKNOWN; + default: type = LLM_TYPE_UNKNOWN;
+ } + }
+ } break; + } break;
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { @@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
// use what we call a normal RoPE, operating on pairs of consecutive head values const int64_t n_ff = hparams.n_ff();
case LLM_ARCH_LLAMA: const int64_t n_embd_gqa = n_embd_v_gqa;
+ case LLM_ARCH_MLLAMA: - const int64_t n_vocab = vocab.n_tokens();
case LLM_ARCH_DECI: + const int64_t n_vocab = hparams.n_vocab;
case LLM_ARCH_BAICHUAN: const int64_t n_token_types = vocab.n_token_types();
case LLM_ARCH_STARCODER: const int64_t n_rot = hparams.n_rot;
diff --git a/src/llama-model.h b/src/llama-model.h const int64_t n_expert = hparams.n_expert;
index c1b9c0a1..5b23e2ba 100644 @@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"
#include <vector>
+#include <stdexcept>
// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);
+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 7dec50ae..bac66c24 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
} }
} }
} break; } break;
+ case LLM_ARCH_MLLAMA: + case LLM_ARCH_MLLAMA:
+ { + {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+ +
+ // output + // output
+ { + {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ +
+ // if output is NULL, init from the input tok embed + // if output is NULL, init from the input tok embed
+ if (model.output == NULL) { + if (output == NULL) {
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ } + }
+ } + }
+ +
+ for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i]; + auto & layer = layers[i];
+ +
+ if (hparams.cross_attention_layers(i)) { + if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); + layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
...@@ -667,17 +561,72 @@ index 7dec50ae..bac66c24 100644 ...@@ -667,17 +561,72 @@ index 7dec50ae..bac66c24 100644
+ } break; + } break;
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam @@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>
struct llama_model_loader;
@@ -63,6 +64,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -284,6 +286,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE && + // cross attention
model.hparams.n_vocab != model.vocab.id_to_token.size()) { + struct ggml_tensor * cross_attn_k_norm = nullptr;
- throw std::runtime_error("vocab size mismatch"); + struct ggml_tensor * cross_attn_k_proj = nullptr;
+ LLAMA_LOG_WARN("%s: vocab mismatch %u !- %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size()); + struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
} }
if (params.vocab_only) { size_t total_size_org = 0;
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd( diff --git a/src/llama.cpp b/src/llama.cpp
index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL; return inpL;
} }
...@@ -699,7 +648,7 @@ index 7dec50ae..bac66c24 100644 ...@@ -699,7 +648,7 @@ index 7dec50ae..bac66c24 100644
static void llm_build_kv_store( static void llm_build_kv_store(
struct ggml_context * ctx, struct ggml_context * ctx,
const llama_hparams & hparams, const llama_hparams & hparams,
@@ -3593,6 +3654,7 @@ struct llm_build_context { @@ -1157,6 +1172,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr; lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr; lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr; lctx.inp_KQ_mask_cross = nullptr;
...@@ -707,12 +656,12 @@ index 7dec50ae..bac66c24 100644 ...@@ -707,12 +656,12 @@ index 7dec50ae..bac66c24 100644
} }
void free() { void free() {
@@ -4074,6 +4136,240 @@ struct llm_build_context { @@ -1639,6 +1655,240 @@ struct llm_build_context {
return gf; return gf;
} }
+ struct ggml_cgraph * build_mllama() { + struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+ +
+ // mutable variable, needed during the last layer of the computation to skip unused tokens + // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens; + int32_t n_tokens = this->n_tokens;
...@@ -946,9 +895,9 @@ index 7dec50ae..bac66c24 100644 ...@@ -946,9 +895,9 @@ index 7dec50ae..bac66c24 100644
+ } + }
+ +
struct ggml_cgraph * build_deci() { struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph( @@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_llama(); result = llm.build_llama();
} break; } break;
...@@ -959,16 +908,33 @@ index 7dec50ae..bac66c24 100644 ...@@ -959,16 +908,33 @@ index 7dec50ae..bac66c24 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
result = llm.build_deci(); result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal( @@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch(
n_outputs = 1; n_outputs = 1;
} }
- lctx.sbatch.from_batch(batch, n_embd, - lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd, + lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent, /* simple_split */ !lctx.kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all); /* logits_all */ n_outputs == n_tokens_all);
@@ -11282,7 +11582,7 @@ static int llama_encode_internal( @@ -8749,7 +9003,6 @@ static int llama_decode_impl(
const llama_batch & batch = batch_allocr.batch;
const auto & model = lctx.model;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
llama_kv_slot_restorer kv_slot_restorer(kv_self);
const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
@@ -9025,7 +9278,7 @@ static int llama_encode_impl(
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
...@@ -977,7 +943,7 @@ index 7dec50ae..bac66c24 100644 ...@@ -977,7 +943,7 @@ index 7dec50ae..bac66c24 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() { @@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true, /*.offload_kqv =*/ true,
/*.flash_attn =*/ false, /*.flash_attn =*/ false,
/*.no_perf =*/ true, /*.no_perf =*/ true,
......
...@@ -15,10 +15,10 @@ Subject: [PATCH] add unpad operator ...@@ -15,10 +15,10 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-) 8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c714fc8c..1bc50fca 100644 index dd0c6a96..8d269a9c 100644
--- a/ggml/include/ggml.h --- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h +++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" { @@ -487,6 +487,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D, GGML_OP_PAD_REFLECT_1D,
...@@ -26,7 +26,7 @@ index c714fc8c..1bc50fca 100644 ...@@ -26,7 +26,7 @@ index c714fc8c..1bc50fca 100644
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@@ -1735,6 +1736,15 @@ extern "C" { @@ -1743,6 +1744,15 @@ extern "C" {
int p0, int p0,
int p1); int p1);
...@@ -43,10 +43,10 @@ index c714fc8c..1bc50fca 100644 ...@@ -43,10 +43,10 @@ index c714fc8c..1bc50fca 100644
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b7fefb9d..b307d554 100644 index 72325349..2f606d82 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d( @@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
} }
} }
...@@ -106,7 +106,7 @@ index b7fefb9d..b307d554 100644 ...@@ -106,7 +106,7 @@ index b7fefb9d..b307d554 100644
// ggml_compute_forward_arange // ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32( static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm @@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad_reflect_1d(params, tensor); ggml_compute_forward_pad_reflect_1d(params, tensor);
} break; } break;
...@@ -117,7 +117,7 @@ index b7fefb9d..b307d554 100644 ...@@ -117,7 +117,7 @@ index b7fefb9d..b307d554 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { @@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644 ...@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aaa79ea4..9286f866 100644 index 36165840..1adf08fa 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2198,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst); ggml_cuda_op_pad(ctx, dst);
break; break;
...@@ -139,8 +139,8 @@ index aaa79ea4..9286f866 100644 ...@@ -139,8 +139,8 @@ index aaa79ea4..9286f866 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst); ggml_cuda_op_arange(ctx, dst);
break; break;
@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g @@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
+ case GGML_OP_UNPAD: + case GGML_OP_UNPAD:
...@@ -148,7 +148,7 @@ index aaa79ea4..9286f866 100644 ...@@ -148,7 +148,7 @@ index aaa79ea4..9286f866 100644
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..39fd4b16 100644 index aba539e8..b4b87409 100644
--- a/ggml/src/ggml-cuda/pad.cu --- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
...@@ -201,6 +201,7 @@ index aba539e8..39fd4b16 100644 ...@@ -201,6 +201,7 @@ index aba539e8..39fd4b16 100644
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+} +}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644 index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh --- a/ggml/src/ggml-cuda/pad.cuh
...@@ -211,10 +212,10 @@ index 8fd386b0..e2ededc3 100644 ...@@ -211,10 +212,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cd8ef741..318addec 100644 index fd9a4e77..e4c093f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type { @@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
...@@ -222,7 +223,7 @@ index cd8ef741..318addec 100644 ...@@ -222,7 +223,7 @@ index cd8ef741..318addec 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de @@ -946,6 +947,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
...@@ -230,7 +231,7 @@ index cd8ef741..318addec 100644 ...@@ -230,7 +231,7 @@ index cd8ef741..318addec 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex @@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
...@@ -238,7 +239,7 @@ index cd8ef741..318addec 100644 ...@@ -238,7 +239,7 @@ index cd8ef741..318addec 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node( @@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
...@@ -276,10 +277,10 @@ index cd8ef741..318addec 100644 ...@@ -276,10 +277,10 @@ index cd8ef741..318addec 100644
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8ba43904..204c93e6 100644 index d092a169..f38909d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
...@@ -332,10 +333,10 @@ index 8ba43904..204c93e6 100644 ...@@ -332,10 +333,10 @@ index 8ba43904..204c93e6 100644
device char * dst, device char * dst,
constant int64_t & ne0, constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2bbe5f48..7ffcd907 100644 index 7fc06724..635aa299 100644
--- a/ggml/src/ggml.c --- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c +++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"PAD_REFLECT_1D", "PAD_REFLECT_1D",
...@@ -343,16 +344,16 @@ index 2bbe5f48..7ffcd907 100644 ...@@ -343,16 +344,16 @@ index 2bbe5f48..7ffcd907 100644
"ARANGE", "ARANGE",
"TIMESTEP_EMBEDDING", "TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"pad_reflect_1d(x)", "pad_reflect_1d(x)",
...@@ -360,16 +361,16 @@ index 2bbe5f48..7ffcd907 100644 ...@@ -360,16 +361,16 @@ index 2bbe5f48..7ffcd907 100644
"arange(start, stop, step)", "arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)", "timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( @@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
......
...@@ -11,10 +11,10 @@ the characters ...@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-) 2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 3fcfcaa3..8f44705a 100644 index a4eee9b8..1ca827eb 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
...@@ -24,7 +24,7 @@ index 3fcfcaa3..8f44705a 100644 ...@@ -24,7 +24,7 @@ index 3fcfcaa3..8f44705a 100644
"\\s+$", "\\s+$",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp diff --git a/src/unicode.cpp b/src/unicode.cpp
index 7aca6544..6155da80 100644 index e63bb4ab..9dd53b9a 100644
--- a/src/unicode.cpp --- a/src/unicode.cpp
+++ b/src/unicode.cpp +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@ @@ -2,6 +2,11 @@
...@@ -39,7 +39,7 @@ index 7aca6544..6155da80 100644 ...@@ -39,7 +39,7 @@ index 7aca6544..6155da80 100644
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"
@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() { @@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
} }
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
...@@ -62,7 +62,7 @@ index 7aca6544..6155da80 100644 ...@@ -62,7 +62,7 @@ index 7aca6544..6155da80 100644
#if defined(__clang__) #if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8 // disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push # pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { @@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif #endif
return conv.from_bytes(s); return conv.from_bytes(s);
......
...@@ -8,11 +8,11 @@ Subject: [PATCH] Maintain ordering for rules for grammar ...@@ -8,11 +8,11 @@ Subject: [PATCH] Maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dadc18c8..2a8dbd22 100644 index 3ebcc3d9..30c28808 100644
--- a/common/json-schema-to-grammar.cpp --- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp
@@ -391,7 +391,7 @@ class SchemaConverter { @@ -346,7 +346,7 @@ private:
private: friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json; std::function<json(const std::string &)> _fetch_json;
bool _dotall; bool _dotall;
- std::map<std::string, std::string> _rules; - std::map<std::string, std::string> _rules;
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 14 Dec 2024 12:54:00 -0800
Subject: [PATCH] fix missing arg in static assert on windows
---
ggml/src/ggml-cuda/concat.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 2f42b8a9..5eb9f08d 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
- static_assert(dim >= 0 && dim <= 3);
+ static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
...@@ -19,10 +19,10 @@ multiple batches of processing until everything is complete. ...@@ -19,10 +19,10 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-) 1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp diff --git a/src/llama.cpp b/src/llama.cpp
index bac66c24..c95da45d 100644 index 8f7902df..01854fce 100644
--- a/src/llama.cpp --- a/src/llama.cpp
+++ b/src/llama.cpp +++ b/src/llama.cpp
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix( @@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
} }
...@@ -36,13 +36,13 @@ index bac66c24..c95da45d 100644 ...@@ -36,13 +36,13 @@ index bac66c24..c95da45d 100644
struct llm_build_context { struct llm_build_context {
const llama_model & model; const llama_model & model;
llama_context & lctx; llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context { @@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf; return gf;
} }
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) { - struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) { + struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- for (uint32_t i = 0; i < ids.size(); ++i) { - for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i]; - const uint32_t id = ids[i];
...@@ -78,7 +78,7 @@ index bac66c24..c95da45d 100644 ...@@ -78,7 +78,7 @@ index bac66c24..c95da45d 100644
ggml_tensor * view_v_src; ggml_tensor * view_v_src;
ggml_tensor * view_v_dst; ggml_tensor * view_v_dst;
@@ -3748,31 +3743,29 @@ struct llm_build_context { @@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) { if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention // NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
...@@ -118,7 +118,7 @@ index bac66c24..c95da45d 100644 ...@@ -118,7 +118,7 @@ index bac66c24..c95da45d 100644
} }
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -10856,7 +10849,7 @@ struct llm_build_context { @@ -8508,7 +8501,7 @@ struct llm_build_context {
} }
}; };
...@@ -127,7 +127,7 @@ index bac66c24..c95da45d 100644 ...@@ -127,7 +127,7 @@ index bac66c24..c95da45d 100644
llama_ubatch dummy = {}; llama_ubatch dummy = {};
dummy.equal_seqs = true; dummy.equal_seqs = true;
@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const @@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init(); llm.init();
...@@ -136,7 +136,7 @@ index bac66c24..c95da45d 100644 ...@@ -136,7 +136,7 @@ index bac66c24..c95da45d 100644
llm.free(); llm.free();
@@ -11329,7 +11322,12 @@ static int llama_decode_internal( @@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0; kv_self.head = 0;
} }
...@@ -150,7 +150,7 @@ index bac66c24..c95da45d 100644 ...@@ -150,7 +150,7 @@ index bac66c24..c95da45d 100644
if (!slot) { if (!slot) {
return 1; return 1;
} }
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { @@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us(); //const int64_t t_start = ggml_time_us();
...@@ -161,7 +161,7 @@ index bac66c24..c95da45d 100644 ...@@ -161,7 +161,7 @@ index bac66c24..c95da45d 100644
// each move requires 6*n_layer tensors (see build_defrag) // each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation // - source view, destination view, copy operation
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { @@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory? // are we moving a continuous block of memory?
bool cont = false; bool cont = false;
...@@ -181,7 +181,7 @@ index bac66c24..c95da45d 100644 ...@@ -181,7 +181,7 @@ index bac66c24..c95da45d 100644
cont = false; cont = false;
continue; continue;
} }
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { @@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used; kv_self.head = n_used;
if (!cont) { if (!cont) {
...@@ -193,7 +193,7 @@ index bac66c24..c95da45d 100644 ...@@ -193,7 +193,7 @@ index bac66c24..c95da45d 100644
} }
nf++; nf++;
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { @@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
} }
} }
...@@ -218,7 +218,7 @@ index bac66c24..c95da45d 100644 ...@@ -218,7 +218,7 @@ index bac66c24..c95da45d 100644
#if 0 #if 0
// CPU defrag // CPU defrag
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { @@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else #else
// ggml_graph defrag // ggml_graph defrag
......
...@@ -8,12 +8,12 @@ Subject: [PATCH] use dynamic backend loading for clip ...@@ -8,12 +8,12 @@ Subject: [PATCH] use dynamic backend loading for clip
1 file changed, 27 insertions(+), 47 deletions(-) 1 file changed, 27 insertions(+), 47 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..86b91d5c 100644 index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp --- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@ @@ -9,25 +9,25 @@
#include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "gguf.h"
-//#ifdef GGML_USE_CUDA -//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h" -//#include "ggml-cuda.h"
...@@ -56,7 +56,7 @@ index b3c1829f..86b91d5c 100644 ...@@ -56,7 +56,7 @@ index b3c1829f..86b91d5c 100644
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { @@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
} }
......
...@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score ...@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
1 file changed, 13 insertions(+), 8 deletions(-) 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..135f7df0 100644 index 95036ef8..98d5e14d 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry { @@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment