Unverified Commit d7d7e996 authored by Jeffrey Morgan, committed by GitHub

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
#include "llama-vocab.h"
#include "llama-impl.h"
#include "llama-model-loader.h"
#include "unicode.h"
@@ -11,8 +12,10 @@
#include <cstdarg>
#include <cstring>
#include <forward_list>
#include <map>
#include <queue>
#include <sstream>
#include <set>
#include <unordered_map>
//
// helpers
@@ -62,96 +65,14 @@ struct naive_trie {
};
//
// impl
// tokenizers
//
struct llm_tokenizer {
llm_tokenizer() {}
virtual ~llm_tokenizer() = default;
llm_tokenizer() {}
virtual ~llm_tokenizer() = default;
};
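// The tokenizer structs below hold the expensive, immutable per-vocab state (regexes,
// tries, score tables); per-call state lives in the corresponding *_session structs,
// which borrow the vocab and tokenizer by reference. A minimal usage sketch, assuming
// a llama_vocab that has already been loaded:
//
//   llm_tokenizer_bpe tok(vocab);                     // built once, reused across calls
//   std::vector<llama_token> out;
//   llm_tokenizer_bpe_session session(vocab, tok);    // cheap, one per tokenize call
//   session.tokenize("hello world", out);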
llama_vocab::~llama_vocab() {
delete tokenizer;
}
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
GGML_ASSERT(token_left.find(' ') == std::string::npos);
GGML_ASSERT(token_left.find('\n') == std::string::npos);
GGML_ASSERT(token_right.find(' ') == std::string::npos);
GGML_ASSERT(token_right.find('\n') == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
return -1;
}
return it->second;
}
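// find_bpe_rank() drives BPE merging: adjacent symbol pairs with the lowest merge rank
// are joined first, until no ranked pair remains. A minimal self-contained sketch of
// that loop over a toy rank table (the vendored code works on llm_symbol lists instead):
static std::vector<std::string> bpe_merge_sketch(std::vector<std::string> symbols,
        const std::map<std::pair<std::string, std::string>, int> & ranks) {
    while (symbols.size() > 1) {
        int    best_rank = -1;
        size_t best_i    = 0;
        for (size_t i = 0; i + 1 < symbols.size(); ++i) {
            auto it = ranks.find({symbols[i], symbols[i + 1]});
            if (it != ranks.end() && (best_rank < 0 || it->second < best_rank)) {
                best_rank = it->second;
                best_i    = i;
            }
        }
        if (best_rank < 0) {
            break; // no mergeable pair left
        }
        symbols[best_i] += symbols[best_i + 1];
        symbols.erase(symbols.begin() + best_i + 1);
    }
    return symbols;
}
// e.g. {"h","e","l","l","o"} with ranks {h,e}=0, {l,l}=1, {he,ll}=2 merges to {"hell","o"}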
static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
return vocab.type;
}
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
}
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
}
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
}
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
}
case LLAMA_VOCAB_TYPE_BPE: {
GGML_ABORT("fatal error");
//return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
}
case LLAMA_VOCAB_TYPE_WPM: {
GGML_ABORT("fatal error");
}
default:
GGML_ABORT("fatal error");
}
}
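// Example: for SPM/UGM vocabs a byte token is stored as text such as "<0x0A>"; the
// substr(3, 2) above extracts "0A" and strtol(..., 16) yields the raw byte 0x0A ('\n').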
static void llama_escape_whitespace(std::string & text) {
replace_all(text, " ", "\xe2\x96\x81");
}
static void llama_unescape_whitespace(std::string & word) {
replace_all(word, "\xe2\x96\x81", " ");
}
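// Example: "\xe2\x96\x81" is U+2581 ("▁"), the SentencePiece whitespace marker, so
// llama_escape_whitespace turns "new york" into "new▁york" and
// llama_unescape_whitespace reverses the mapping when tokens are converted back to text.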
struct llm_symbol {
using index = int;
index prev;
@@ -183,14 +104,13 @@ struct llm_bigram_spm {
};
struct llm_tokenizer_spm : llm_tokenizer {
llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};
struct llm_tokenizer_spm_session {
llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
// split string into utf8 chars
int index = 0;
size_t offs = 0;
@@ -249,13 +169,13 @@ struct llm_tokenizer_spm_session {
}
private:
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
auto text = std::string(symbol.text, symbol.n);
auto token = vocab.token_to_id.find(text);
auto token = vocab.text_to_token(text);
// Do we need to support is_unused?
if (token != vocab.token_to_id.end()) {
output.push_back((*token).second);
if (token != LLAMA_TOKEN_NULL) {
output.push_back(token);
return;
}
@@ -265,8 +185,8 @@ private:
// output any symbols that did not form tokens as bytes.
output.reserve(output.size() + symbol.n);
for (int j = 0; j < (int)symbol.n; ++j) {
llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
output.push_back(token_id);
llama_token id = vocab.byte_to_token(symbol.text[j]);
output.push_back(id);
}
return;
}
@@ -280,17 +200,17 @@ private:
return;
}
const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
auto token = vocab.token_to_id.find(text);
auto token = vocab.text_to_token(text);
if (token == vocab.token_to_id.end()) {
if (token == LLAMA_TOKEN_NULL) {
return;
}
if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
return;
}
const auto & tok_data = vocab.id_to_token[(*token).second];
const auto & tok_data = vocab.get_token_data(token);
llm_bigram_spm bigram;
bigram.left = left;
@@ -353,9 +273,9 @@ struct llm_bigram_bpe {
};
struct llm_tokenizer_bpe : llm_tokenizer {
llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
switch (vocab.type_pre) {
llm_tokenizer_bpe(const llama_vocab & vocab) {
GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
switch (vocab.get_pre_type()) {
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
regex_exprs = {
// original regex from tokenizer.json
@@ -488,39 +408,38 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
struct llm_tokenizer_bpe_session {
llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) {
static void append(const llama_token token_id, std::vector<llama_token> & output) {
output.push_back(token_id);
}
bool append_bos(std::vector<llama_vocab::id> & output) const {
if (vocab.tokenizer_add_bos) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
bool append_bos(std::vector<llama_token> & output) const {
if (vocab.get_add_bos()) {
GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
output.push_back(vocab.token_bos());
return true;
}
return false;
}
bool append_eos(std::vector<llama_vocab::id> & output) const {
if (vocab.tokenizer_add_eos) {
GGML_ASSERT(vocab.special_eos_id != -1);
output.push_back(vocab.special_eos_id);
bool append_eos(std::vector<llama_token> & output) const {
if (vocab.get_add_eos()) {
GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
output.push_back(vocab.token_eos());
return true;
}
return false;
}
void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
void check_double_bos_eos(const std::vector<llama_token> & output) const {
if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
LLAMA_LOG_WARN(
"%s: Added a EOS token to the prompt as specified by the model but the prompt "
"also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
@@ -528,9 +447,9 @@ struct llm_tokenizer_bpe_session {
}
}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
symbols_final.clear();
@@ -541,7 +460,8 @@ struct llm_tokenizer_bpe_session {
int index = 0;
size_t offset = 0;
if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
//if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
}
@@ -615,18 +535,18 @@ struct llm_tokenizer_bpe_session {
}
const std::string str = std::string(symbol.text, symbol.n);
const auto token = vocab.token_to_id.find(str);
const auto token = vocab.text_to_token(str);
if (token == vocab.token_to_id.end()) {
if (token == LLAMA_TOKEN_NULL) {
for (auto j = str.begin(); j != str.end(); ++j) {
std::string byte_str(1, *j);
auto token_multibyte = vocab.token_to_id.find(byte_str);
if (token_multibyte != vocab.token_to_id.end()) {
output.push_back(token_multibyte->second);
auto token_multibyte = vocab.text_to_token(byte_str);
if (token_multibyte != LLAMA_TOKEN_NULL) {
output.push_back(token_multibyte);
}
}
} else {
output.push_back((*token).second);
output.push_back(token);
}
}
}
@@ -660,7 +580,7 @@ private:
}
const llama_vocab & vocab;
const llm_tokenizer_bpe * bpe_tokenizer;
const llm_tokenizer_bpe & tokenizer;
std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final;
@@ -672,14 +592,13 @@ private:
//
struct llm_tokenizer_wpm : llm_tokenizer {
llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};
struct llm_tokenizer_wpm_session {
llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
const auto & token_map = vocab.token_to_id;
void tokenize(const std::string & text, std::vector<llama_token> & output) {
// normalize and split by whitespace
std::vector<std::string> words = preprocess(text);
// bos token prepended already
@@ -702,10 +621,10 @@ struct llm_tokenizer_wpm_session {
for (int i = 0; i < n; ++i) {
// loop through possible match length
bool match = false;
for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
auto it = token_map.find(word1.substr(i, j - i));
if (it != token_map.end()) {
output.push_back(it->second);
for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
auto id = vocab.text_to_token(word1.substr(i, j - i));
if (id != LLAMA_TOKEN_NULL) {
output.push_back(id);
match = true;
i = j - 1;
break;
@@ -720,7 +639,7 @@ struct llm_tokenizer_wpm_session {
// we didn't find any matches for this word
if (current_tokens == output.size()) {
output.push_back(vocab.special_unk_id);
output.push_back(vocab.token_unk());
}
}
}
@@ -789,45 +708,45 @@ private:
//
struct llm_tokenizer_ugm : llm_tokenizer {
llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
if (vocab.precompiled_charsmap.size() > 0) {
llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
if (precompiled_charsmap.size() > 0) {
size_t charsmap_offset = 0;
// First four bytes of precompiled_charsmap contain the length of the binary
// blob containing XOR-compressed compact double array (XCDA) entries
uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
charsmap_offset += sizeof(xcda_blob_size);
if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
}
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
xcda_array_size = xcda_blob_size / sizeof(uint32_t);
charsmap_offset += xcda_blob_size;
// Remaining bytes of precompiled charsmap contain null-terminated
// replacement strings for prefixes matched by the XCDA.
prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
prefix_replacements = &precompiled_charsmap[charsmap_offset];
prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
}
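// Layout of the precompiled_charsmap blob parsed above (byte offsets):
//   [0, 4)                     uint32_t xcda_blob_size
//   [4, 4 + xcda_blob_size)    XCDA entries, one bit-packed uint32_t each
//   [4 + xcda_blob_size, end)  null-terminated replacement strings, addressed by the
//                              offsets produced by XCDA prefix lookups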
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
const auto &token_data = vocab.id_to_token[id];
for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
const auto & token_data = vocab.get_token_data(id);
if (llama_is_normal_token(vocab, id)) {
if (vocab.is_normal(id)) {
min_score = std::min<float>(min_score, token_data.score);
max_score = std::max<float>(max_score, token_data.score);
}
if (llama_is_normal_token(vocab, id) ||
llama_is_user_defined_token(vocab, id) ||
llama_is_unused_token(vocab, id)) {
if (vocab.is_normal(id) ||
vocab.is_user_defined(id) ||
vocab.is_unused(id)) {
token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
}
if (llama_is_user_defined_token(vocab, id)) {
if (vocab.is_user_defined(id)) {
user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
}
}
@@ -856,8 +775,7 @@ struct llm_tokenizer_ugm : llm_tokenizer {
};
struct llm_tokenizer_ugm_session {
llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
/* This implementation is based on SentencePiece optimized Viterbi algorithm for
* unigram language models. The general idea is to:
@@ -872,7 +790,7 @@ struct llm_tokenizer_ugm_session {
* After processing the whole sequence we backtrack from the end to get
* the best tokenization.
*/
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
// get current size of output (for reversal later)
size_t output_size = output.size();
@@ -885,9 +803,9 @@ struct llm_tokenizer_ugm_session {
}
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
// at the beginning tokenization score is zero
tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
for (size_t input_offset = 0; input_offset < input_len;) {
size_t prefix_offset = input_offset;
@@ -897,7 +815,7 @@
// traverse the token matcher trie to find a matching token
bool single_codepoint_token_found = false;
const struct best_tokenization & current_best = tokenization_results[input_offset];
const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
while (prefix_offset <= input_len && node != NULL) {
// check if we found valid token in prefix
@@ -907,13 +825,13 @@
single_codepoint_token_found = true;
}
llama_token token_id = node->value;
const auto & token_data = vocab.id_to_token[token_id];
const auto & token_data = vocab.get_token_data(token_id);
// we set the user-defined token scores to 0 to make them more likely to be selected
// (normal token scores are log probabilities, so they are negative)
// score type is double here to make tokenization results exactly
// the same as in the HF tokenizer using SentencePiece
const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
const double challenger_score = current_best.score_sum + token_score;
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
if (challenger_score > current_champ.score_sum) {
@@ -927,11 +845,11 @@
// if we didn't find a valid token corresponding to the whole UTF code point
// then use unknown token as the tokenization of this UTF code point
if (!single_codepoint_token_found) {
const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
prefix_offset = input_offset + n_utf8_code_units;
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
if (challenger_score > current_champ.score_sum) {
struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
current_champ = challenger;
}
}
@@ -944,7 +862,7 @@
// merge sequences of consecutive unknown tokens into single unknown tokens
bool is_prev_unknown = false;
for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
bool is_unknown = tokenization.token_id == vocab.special_unk_id;
bool is_unknown = tokenization.token_id == vocab.token_unk();
if (!(is_prev_unknown && is_unknown)) {
output.push_back(tokenization.token_id);
}
@@ -971,11 +889,11 @@ private:
normalized->clear();
normalized->reserve(input.size() * 3);
const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
bool is_space_prepended = false;
bool processing_non_ws = false;
@@ -1067,7 +985,7 @@ private:
// if input prefix matches some user-defined token return this token as normalization result
auto user_defined_token_match =
ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
if (user_defined_token_match.second > 0) {
return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
}
@@ -1075,8 +993,8 @@ private:
size_t longest_prefix_length = 0;
size_t longest_prefix_offset = 0;
if (ugm_tokenizer->xcda_array_size > 0) {
struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
if (tokenizer.xcda_array_size > 0) {
struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
// Find the longest normalized sequence matching the input prefix by walking
// the XOR-compressed compact double array (XCDA) starting from the root node
@@ -1112,10 +1030,10 @@ private:
if (longest_prefix_length > 0) {
// we have a match, so return the replacement sequence
if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
}
const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
}
@@ -1132,7 +1050,7 @@ private:
}
const llama_vocab & vocab;
const llm_tokenizer_ugm * ugm_tokenizer;
const llm_tokenizer_ugm & tokenizer;
};
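// The Viterbi pass above, reduced to its core: a best-score table indexed by byte offset,
// relaxed left to right, then backtracked from the end. A minimal sketch over a toy
// token -> log-probability map (the vendored code walks a trie, normalizes the input and
// falls back to the unknown token; none of that is shown here):
static std::vector<std::string> ugm_viterbi_sketch(const std::string & text,
        const std::map<std::string, double> & vocab) {
    const size_t n = text.size();
    std::vector<double>      best (n + 1, -1e30);
    std::vector<size_t>      prev (n + 1, 0);
    std::vector<std::string> piece(n + 1);
    best[0] = 0.0;
    for (size_t i = 0; i < n; ++i) {
        if (best[i] <= -1e30) {
            continue; // position not reachable by any segmentation
        }
        for (const auto & entry : vocab) {
            const std::string & tok = entry.first;
            if (text.compare(i, tok.size(), tok) != 0) {
                continue;
            }
            const size_t j = i + tok.size();
            if (best[i] + entry.second > best[j]) {
                best[j]  = best[i] + entry.second;
                prev[j]  = i;
                piece[j] = tok;
            }
        }
    }
    std::vector<std::string> out;
    if (best[n] <= -1e30) {
        return out; // no full segmentation with this toy vocab
    }
    for (size_t j = n; j > 0; j = prev[j]) {
        out.push_back(piece[j]); // collected in reverse, like the vendored backtracking
    }
    std::reverse(out.begin(), out.end());
    return out;
}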
//
@@ -1194,15 +1112,15 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape
}
struct llm_tokenizer_rwkv : llm_tokenizer {
llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
llm_tokenizer_rwkv(const llama_vocab & vocab) {
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
// For now, we decode the vocab here into the lookup we'll use for tokenization.
// build trie
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
const auto & token = vocab.id_to_token[id];
const auto data = llama_unescape_rwkv_token(token.text);
token_matcher.insert((const char *) data.data(), data.size(), id);
for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
const auto & data = vocab.get_token_data(id);
const auto text = llama_unescape_rwkv_token(data.text);
token_matcher.insert((const char *) text.data(), text.size(), id);
}
}
@@ -1210,16 +1128,15 @@ struct llm_tokenizer_rwkv : llm_tokenizer {
};
struct llm_tokenizer_rwkv_session {
llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
uint32_t position = 0;
while (position < text.size()) {
const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
if (node == NULL) {
// no matching token found, add unknown token
output.push_back(vocab.special_unk_id);
output.push_back(vocab.token_unk());
position += 1;
continue;
}
@@ -1243,33 +1160,11 @@ struct llm_tokenizer_rwkv_session {
private:
const llama_vocab & vocab;
const llm_tokenizer_rwkv & rwkv_tokenizer;
const llm_tokenizer_rwkv & tokenizer;
};
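// The RWKV pass above is plain greedy longest-match: walk the trie, remember the last
// node that carried a token id, emit it, then restart at the next byte. A minimal
// sketch over a toy token -> id map instead of the naive_trie:
static std::vector<int> rwkv_greedy_match_sketch(const std::string & text,
        const std::map<std::string, int> & vocab) {
    std::vector<int> out;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t best_len = 0;
        int    best_id  = -1;
        for (const auto & entry : vocab) {
            const std::string & tok = entry.first;
            if (tok.size() > best_len && text.compare(pos, tok.size(), tok) == 0) {
                best_len = tok.size();
                best_id  = entry.second;
            }
        }
        if (best_id < 0) {
            out.push_back(-1); // stands in for vocab.token_unk()
            pos += 1;
        } else {
            out.push_back(best_id);
            pos += best_len;
        }
    }
    return out;
}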
void llama_vocab::init_tokenizer() {
switch (type) {
case LLAMA_VOCAB_TYPE_SPM:
tokenizer = new llm_tokenizer_spm(*this);
break;
case LLAMA_VOCAB_TYPE_BPE:
tokenizer = new llm_tokenizer_bpe(*this);
break;
case LLAMA_VOCAB_TYPE_WPM:
tokenizer = new llm_tokenizer_wpm(*this);
break;
case LLAMA_VOCAB_TYPE_UGM:
tokenizer = new llm_tokenizer_ugm(*this);
break;
case LLAMA_VOCAB_TYPE_RWKV:
tokenizer = new llm_tokenizer_rwkv(*this);
break;
default:
GGML_ABORT("unsupported vocab type");
}
}
//
// (de-) tokenize
// impl
//
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
@@ -1278,7 +1173,7 @@ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant {
fragment_buffer_variant(llama_vocab::id _token)
fragment_buffer_variant(llama_token _token)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
token(_token),
@@ -1289,7 +1184,7 @@ struct fragment_buffer_variant {
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
token((llama_vocab::id) - 1),
token((llama_token) - 1),
raw_text(_raw_text),
offset(_offset),
length(_length){
@@ -1299,451 +1194,1093 @@ struct fragment_buffer_variant {
}
const FRAGMENT_BUFFER_VARIANT_TYPE type;
const llama_vocab::id token;
const llama_token token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
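// fragment_buffer_variant entries come out of the special-token partition pass: raw text
// is scanned for the literal text of each special token and split into alternating
// RAW_TEXT / TOKEN fragments. A minimal sketch for a single special token (the vendored
// pass also honors LSTRIP/RSTRIP attributes and iterates over all cached special tokens);
// the returned fragments reference raw_text, which must outlive them:
static std::forward_list<fragment_buffer_variant> partition_sketch(
        const std::string & raw_text,
        const std::string & special_text,
              llama_token   special_id) {
    std::forward_list<fragment_buffer_variant> out;
    auto tail = out.before_begin();
    size_t pos = 0;
    while (pos < raw_text.size()) {
        const size_t match = raw_text.find(special_text, pos);
        if (match == std::string::npos) {
            tail = out.emplace_after(tail, raw_text, pos, raw_text.size() - pos);
            break;
        }
        if (match > pos) {
            tail = out.emplace_after(tail, raw_text, pos, match - pos); // text before the special token
        }
        tail = out.emplace_after(tail, special_id);                     // the special token itself
        pos = match + special_text.size();
    }
    return out;
}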
// #define PRETOKENIZERDEBUG
struct llama_vocab::impl {
uint32_t n_token_types = 0; // for BERT-style token types
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
llama_token special_bos_id = 1;
llama_token special_eos_id = 2;
llama_token special_eot_id = LLAMA_TOKEN_NULL;
llama_token special_eom_id = LLAMA_TOKEN_NULL;
llama_token special_unk_id = 0;
llama_token special_sep_id = LLAMA_TOKEN_NULL;
llama_token special_pad_id = LLAMA_TOKEN_NULL;
llama_token special_mask_id = LLAMA_TOKEN_NULL;
llama_token linefeed_id = 13;
// fim tokens
llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// tokenizer flags
bool add_space_prefix = false;
bool add_bos = false;
bool add_eos = false;
bool ignore_merges = false;
bool clean_spaces = false; // clean_up_tokenization_spaces
bool remove_extra_whitespaces = false;
bool escape_whitespaces = true;
bool treat_whitespace_as_suffix = false;
std::unordered_map<std::string, llama_token> token_to_id;
std::vector<token_data> id_to_token;
std::vector<llama_token> cache_special_tokens;
std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
struct pair_hash {
size_t operator()(const std::pair<std::string, std::string> & p) const {
return std::hash<std::string>{}(p.first) ^ //create some hash for pair
(std::hash<std::string>{}(p.second) << 1);
}
};
std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
// for each special token
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
const auto & data = vocab.id_to_token[special_id];
const auto & special_token = data.text;
// set of all tokens that cause "end of generation"
std::set<llama_token> special_eog_ids;
if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
// Ignore control and unknown tokens when parse_special == false
continue;
// User-defined tokens are still pre-tokenized before everything else
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
// This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
}
std::unique_ptr<llm_tokenizer> tokenizer;
// for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
while (it != buffer.end()) {
auto & fragment = (*it);
std::vector<char> precompiled_charsmap;
// if a fragment is text ( not yet processed )
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
const auto & raw_text = fragment.raw_text;
impl(const llama_vocab & vocab) : vocab(vocab) {
}
auto raw_text_base_offset = fragment.offset;
auto raw_text_base_length = fragment.length;
~impl() = default;
// loop over the text
while (true) {
// find the first occurrence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text
auto match = raw_text.find(special_token, raw_text_base_offset);
void load(llama_model_loader & ml, const LLM_KV & kv);
// no occurrences found, stop processing this fragment for a given special token
if (match == std::string::npos) break;
enum llama_vocab_type get_type() const;
// check if match is within bounds of offset <-> length
if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
std::string type_name() const;
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
auto source = std::distance(buffer.begin(), it);
bool is_normal (llama_token id) const;
bool is_unknown (llama_token id) const;
bool is_control (llama_token id) const;
bool is_byte (llama_token id) const;
bool is_user_defined(llama_token id) const;
bool is_unused (llama_token id) const;
bool is_eog (llama_token id) const;
// if match is further than base offset
// then we have some text to the left of it
if (match > raw_text_base_offset) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0;
int64_t left_reminder_length = match - raw_text_base_offset;
uint8_t token_to_byte(llama_token id) const;
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
left_reminder_length--;
}
}
llama_token_attr token_get_attr(llama_token id) const;
if (left_reminder_length > 0) {
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
it++;
}
void init_tokenizer(enum llama_vocab_type type);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
}
void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
// special token
buffer.emplace_after(it, special_id);
it++;
std::string token_to_piece_for_cache(
llama_token token,
bool special) const;
// right
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
int64_t right_reminder_offset = match + special_token.length();
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
right_reminder_offset++;
right_reminder_length--;
}
}
std::vector<llama_token> tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special = false) const;
if (right_reminder_length > 0) {
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
it++;
}
int32_t tokenize(
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) const;
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif
// does not write null-terminator to buf
int32_t token_to_piece(
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special) const;
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source-1)));
}
// use cached data
const std::string & token_to_piece(llama_token token) const;
// repeat for the right side
raw_text_base_offset = right_reminder_offset;
raw_text_base_length = right_reminder_length;
int32_t detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const;
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
} else {
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source-1)));
}
break;
}
std::string detokenize(
const std::vector<llama_token> & tokens,
bool special) const;
void print_info() const;
private:
const llama_vocab & vocab;
};
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
struct gguf_context * ctx = ml.meta.get();
// determine vocab type
{
std::string tokenizer_model;
std::string tokenizer_pre;
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
type = LLAMA_VOCAB_TYPE_NONE;
// default special tokens
special_bos_id = LLAMA_TOKEN_NULL;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
linefeed_id = LLAMA_TOKEN_NULL;
// read vocab size from metadata
uint32_t n_tokens = 0;
if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
id_to_token.resize(n_tokens);
}
return;
}
if (tokenizer_model == "llama") {
type = LLAMA_VOCAB_TYPE_SPM;
// default special tokens
special_bos_id = 1;
special_eos_id = 2;
special_unk_id = 0;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "bert") {
type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
special_bos_id = 101;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = 100;
special_sep_id = 102;
special_pad_id = 0;
special_mask_id = 103;
} else if (tokenizer_model == "gpt2") {
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
std::string first;
std::string second;
const size_t pos = word.find(' ', 1);
if (pos != std::string::npos) {
first = word.substr(0, pos);
second = word.substr(pos + 1);
}
bpe_ranks.emplace(std::make_pair(first, second), i);
}
it++;
// default special tokens
special_bos_id = 11;
special_eos_id = 11;
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "t5") {
type = LLAMA_VOCAB_TYPE_UGM;
// default special tokens
special_bos_id = LLAMA_TOKEN_NULL;
special_eos_id = 1;
special_unk_id = 2;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = 0;
special_mask_id = LLAMA_TOKEN_NULL;
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
// correct endianness of data in the precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
for (size_t i = 0; i < xcda_array_size; ++i) {
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
}
#endif
}
} else if (tokenizer_model == "rwkv") {
type = LLAMA_VOCAB_TYPE_RWKV;
// default special tokens
special_bos_id = LLAMA_TOKEN_NULL;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
// for now, only BPE models have pre-tokenizers
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
if (tokenizer_pre == "default") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
} else if (
tokenizer_pre == "deepseek-llm") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-coder") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false;
} else if (
tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
} else if (
tokenizer_pre == "mpt") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
} else if (
tokenizer_pre == "starcoder") {
pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
} else if (
tokenizer_pre == "gpt-2" ||
tokenizer_pre == "phi-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de" ||
tokenizer_pre == "gigachat" ||
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "jina-v2-code" ||
tokenizer_pre == "roberta-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else if (
tokenizer_pre == "command-r") {
pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
clean_spaces = false;
} else if (
tokenizer_pre == "qwen2" ||
tokenizer_pre == "deepseek-r1-qwen") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
clean_spaces = false;
} else if (
tokenizer_pre == "stablelm2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
} else if (
tokenizer_pre == "olmo") {
pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else if (
tokenizer_pre == "dbrx") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else if (
tokenizer_pre == "smaug-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
} else if (
tokenizer_pre == "poro-chat") {
pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
clean_spaces = false;
} else if (
tokenizer_pre == "chatglm-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
special_bos_id = LLAMA_TOKEN_NULL;
} else if (
tokenizer_pre == "viking") {
pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
clean_spaces = false;
} else if (
tokenizer_pre == "jais") {
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
} else if (
tokenizer_pre == "tekken") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
clean_spaces = false;
ignore_merges = true;
add_bos = true;
} else if (
tokenizer_pre == "smollm") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
clean_spaces = false;
} else if (
tokenizer_pre == "codeshell") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else if (
tokenizer_pre == "bloom") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
} else if (
tokenizer_pre == "gpt3-finnish") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
} else if (
tokenizer_pre == "exaone") {
pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
} else if (
tokenizer_pre == "chameleon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
add_bos = true;
clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else if (
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = true;
clean_spaces = false;
add_bos = true;
add_eos = false;
} else if (type == LLAMA_VOCAB_TYPE_WPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = false;
clean_spaces = true;
add_bos = true;
add_eos = false;
} else if (type == LLAMA_VOCAB_TYPE_UGM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_bos = false;
add_eos = true;
} else if (type == LLAMA_VOCAB_TYPE_RWKV) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = false;
clean_spaces = false;
add_bos = false;
add_eos = false;
} else {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
}
}
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special) {
GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
if (token_idx == -1) {
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
}
std::vector<llama_vocab::id> output;
std::forward_list<fragment_buffer_variant> fragment_buffer;
const float * scores = nullptr;
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
if (score_idx != -1) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
}
if (!raw_text.empty()) {
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
tokenizer_st_partition(vocab, fragment_buffer, parse_special);
const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) {
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
}
switch (vocab.type) {
case LLAMA_VOCAB_TYPE_SPM:
{
// OG tokenizer behavior:
//
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
id_to_token.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
if (word.empty()) {
LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
word = "[EMPTY_" + std::to_string(i) + "]";
}
token_to_id[word] = i;
max_token_len = std::max(max_token_len, (int) word.size());
auto & token_data = id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
switch(toktypes[i]) {
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
}
}
}
GGML_ASSERT(id_to_token.size() == token_to_id.size());
bool is_prev_special = true; // prefix with space if first token
init_tokenizer(type);
if (add_special && vocab.tokenizer_add_bos) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
is_prev_special = true;
}
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (type == LLAMA_VOCAB_TYPE_SPM) {
try {
linefeed_id = vocab.byte_to_token('\n');
} catch (const std::exception & e) {
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
linefeed_id = special_pad_id;
}
} else if (type == LLAMA_VOCAB_TYPE_WPM) {
linefeed_id = special_pad_id;
} else if (type == LLAMA_VOCAB_TYPE_RWKV) {
const std::vector<int> ids = tokenize("\n", false);
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
linefeed_id = ids[0];
} else {
const std::vector<int> ids = tokenize("\n", false);
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
if (ids.empty()) {
LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
linefeed_id = special_pad_id;
} else {
linefeed_id = ids[0];
}
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
// special tokens
{
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
{ LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
{ LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
{ LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
{ LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
{ LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
{ LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
{ LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
{ LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
{ LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
{ LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
{ LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
};
for (const auto & it : special_token_types) {
const std::string & key = kv(std::get<0>(it));
int32_t & id = std::get<1>(it);
uint32_t new_id;
if (!ml.get_key(std::get<0>(it), new_id, false)) {
continue;
}
if (new_id >= id_to_token.size()) {
LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
__func__, key.c_str(), new_id, id);
} else {
id = new_id;
}
}
// prefix with space if previous is special
if (vocab.tokenizer_add_space_prefix && is_prev_special) {
raw_text = " " + raw_text;
}
// Handle add_bos and add_eos
{
bool temp = true;
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
llama_escape_whitespace(raw_text);
llm_tokenizer_spm_session session(vocab);
session.tokenize(raw_text, output);
is_prev_special = false;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
is_prev_special = true;
if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
add_bos = temp;
}
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
add_eos = temp;
}
}
// auto-detect special tokens by text
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// for now, we apply this workaround to find the tokens based on their text
for (const auto & t : token_to_id) {
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (special_eot_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<EOT>"
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
) {
special_eot_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
// find EOM token: "<|eom_id|>"
if (special_eom_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|eom_id|>"
) {
special_eom_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
if (add_special && vocab.tokenizer_add_eos) {
GGML_ASSERT(vocab.special_eos_id != -1);
output.push_back(vocab.special_eos_id);
}
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
llm_tokenizer_bpe_session session(vocab);
// it calls some other methods that do not exist in llm_tokenizer,
// so here we just cast it to the bpe tokenizer object
if (add_special) {
session.append_bos(output);
// find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_prefix|>" // Qwen
|| t.first == "<fim-prefix>"
|| t.first == "<|fim▁begin|>" // DeepSeek
|| t.first == "<PRE>"
) {
special_fim_pre_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
session.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
session.append(fragment.token, output);
// find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>"
|| t.first == "<|fim▁hole|>" // DeepSeek
|| t.first == "<SUF>"
) {
special_fim_suf_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
if (add_special) {
session.append_eos(output);
session.check_double_bos_eos(output);
}
} break;
case LLAMA_VOCAB_TYPE_WPM:
{
if (add_special) {
GGML_ASSERT(vocab.special_cls_id != -1);
output.push_back(vocab.special_cls_id);
// find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>"
|| t.first == "<|fim▁end|>" // DeepSeek
|| t.first == "<MID>"
) {
special_fim_mid_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
llm_tokenizer_wpm_session session(vocab);
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
session.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>"
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
if (add_special) {
GGML_ASSERT(vocab.special_sep_id != -1);
output.push_back(vocab.special_sep_id);
}
} break;
case LLAMA_VOCAB_TYPE_UGM:
{
if (add_special && vocab.tokenizer_add_bos) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_repo|>" // Qwen
|| t.first == "<|repo_name|>"
|| t.first == "<fim-repo>"
|| t.first == "<REPO>"
) {
special_fim_rep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
llm_tokenizer_ugm_session session(vocab);
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
session.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
// find FIM_SEP token: "<|file_sep|>"
if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|file_sep|>" // Qwen
) {
special_fim_sep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
special_eog_ids.clear();
if (add_special && vocab.tokenizer_add_eos) {
GGML_ASSERT(vocab.special_eos_id != -1);
output.push_back(vocab.special_eos_id);
}
} break;
case LLAMA_VOCAB_TYPE_RWKV:
{
llm_tokenizer_rwkv_session session(vocab);
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
special_eog_ids.insert(special_fim_pad_id);
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
special_eog_ids.insert(special_fim_rep_id);
}
session.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
special_eog_ids.insert(special_fim_sep_id);
}
for (const auto & t : token_to_id) {
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
|| t.first == "<EOT>"
) {
special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} break;
case LLAMA_VOCAB_TYPE_NONE:
GGML_ABORT("fatal error");
} else {
// token is control, but not marked as EOG -> print a debug log
if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
}
// sanity checks
if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
special_eog_ids.insert(special_eos_id);
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
special_eog_ids.insert(special_eot_id);
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
special_eog_ids.insert(special_eom_id);
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
}
return output;
}
// build special tokens cache
{
for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
cache_special_tokens.push_back(id);
}
}
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
static const char * hex = "0123456789ABCDEF";
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
auto token = vocab.token_to_id.find(buf);
if (token != vocab.token_to_id.end()) {
return (*token).second;
std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
[&] (const llama_token a, const llama_token b) {
return id_to_token[a].text.size() > id_to_token[b].text.size();
}
// Try to fall back to just the byte as a string
const char buf2[2] = { (char)ch, 0 };
return vocab.token_to_id.at(buf2);
);
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
}
// build token to piece cache
{
size_t size_cache = 0;
std::vector<std::string> cache(n_tokens);
for (uint32_t id = 0; id < n_tokens; ++id) {
cache[id] = token_to_piece_for_cache(id, true);
size_cache += cache[id].size();
}
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: {
return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
std::swap(cache_token_to_piece, cache);
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
}
// Handle per token attributes
//NOTE: Each model customizes per token attributes.
//NOTE: Per token attributes are missing from the GGUF file.
//TODO: Extract attributes from GGUF file.
{
auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
for (const auto & substr : substrs) {
if (str.find(substr) < std::string::npos) {
return true;
}
}
return false;
};
auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
uint32_t current = id_to_token.at(id).attr;
current = value ? (current | attr) : (current & ~attr);
id_to_token[id].attr = (llama_token_attr) current;
};
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
_set_tokenid_attr(token_to_id.at(token), attr, value);
};
std::string model_name;
std::string tokenizer_pre;
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
// model name to lowercase
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
[] (const std::string::value_type x) {
return std::tolower(x);
}
);
// set attributes by model/tokenizer name
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
for (auto id : cache_special_tokens) {
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (const auto * token : {"</s>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
}
default:
GGML_ABORT("fatal error");
}
}
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[token].text.c_str();
enum llama_vocab_type llama_vocab::impl::get_type() const {
return type;
}
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[token].score;
std::string llama_vocab::impl::type_name() const {
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
}
}
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[token].attr;
bool llama_vocab::impl::is_normal(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
}
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
return token != -1 && vocab.special_eog_ids.count(token) > 0;
bool llama_vocab::impl::is_unknown(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
}
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
return llama_is_control_token(vocab, token);
bool llama_vocab::impl::is_control(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
}
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
bool llama_vocab::impl::is_byte(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
}
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
return vocab.special_eos_id;
bool llama_vocab::impl::is_user_defined(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
bool llama_vocab::impl::is_unused(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
}
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
bool llama_vocab::impl::is_eog(llama_token id) const {
return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
}
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
return vocab.special_cls_id;
uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(is_byte(id));
const auto & token_data = id_to_token.at(id);
switch (get_type()) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
}
case LLAMA_VOCAB_TYPE_BPE: {
GGML_ABORT("fatal error");
}
case LLAMA_VOCAB_TYPE_WPM: {
GGML_ABORT("fatal error");
}
default:
GGML_ABORT("fatal error");
}
}
llama_token llama_token_sep_impl(const struct llama_vocab & vocab) {
return vocab.special_sep_id;
llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
return id_to_token.at(id).attr;
}
llama_token llama_token_nl_impl(const struct llama_vocab & vocab) {
return vocab.linefeed_id;
}
void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_pad_id;
switch (type) {
case LLAMA_VOCAB_TYPE_SPM:
tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
break;
case LLAMA_VOCAB_TYPE_BPE:
tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
break;
case LLAMA_VOCAB_TYPE_WPM:
tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
break;
case LLAMA_VOCAB_TYPE_UGM:
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
break;
case LLAMA_VOCAB_TYPE_RWKV:
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
break;
default:
GGML_ABORT("unsupported vocab type");
}
}
bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
return vocab.tokenizer_add_bos;
}
//
// (de-) tokenize
//
bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
return vocab.tokenizer_add_eos;
}
// #define PRETOKENIZERDEBUG
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
}
void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
// for each special token
for (const llama_token special_id : cache_special_tokens) {
const auto & data = vocab.get_token_data(special_id);
const auto & text = data.text;
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
}
if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
// Ignore control and unknown tokens when parse_special == false
continue;
// User-defined tokens are still pre-tokenized before everything else
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
// This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
}
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
}
// for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
while (it != buffer.end()) {
auto & fragment = (*it);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
}
// if a fragment is text ( not yet processed )
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
const auto & raw_text = fragment.raw_text;
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
}
auto raw_text_base_offset = fragment.offset;
auto raw_text_base_length = fragment.length;
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
}
// loop over the text
while (true) {
// find the first occurrence of a given special token in this fragment
// passing the offset argument only limits the "search area", but match coordinates
// are still relative to the full source raw_text
auto match = raw_text.find(text, raw_text_base_offset);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pad_id;
}
// no occurrences found, stop processing this fragment for a given special token
if (match == std::string::npos) break;
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_rep_id;
}
// check if match is within bounds of offset <-> length
if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_sep_id;
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
auto source = std::distance(buffer.begin(), it);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) {
auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size());
// if match is further than base offset
// then we have some text to the left of it
if (match > raw_text_base_offset) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0;
int64_t left_reminder_length = match - raw_text_base_offset;
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
left_reminder_length--;
}
}
if (left_reminder_length > 0) {
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
it++;
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
}
// special token
buffer.emplace_after(it, special_id);
it++;
// right
if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
int64_t right_reminder_offset = match + text.length();
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
right_reminder_offset++;
right_reminder_length--;
}
}
if (right_reminder_length > 0) {
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
it++;
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source - 1)));
}
// repeat for the right side
raw_text_base_offset = right_reminder_offset;
raw_text_base_length = right_reminder_length;
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
} else {
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
buffer.erase_after(std::next(buffer.begin(), (source - 1)));
}
break;
}
}
}
it++;
}
}
}
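For intuition, the partitioning above can be summarized with a small illustrative sketch (the token text and ids are placeholders, not values from any real vocab):

// illustrative only: with "<|im_end|>" present in cache_special_tokens, the buffer
//   { RAW_TEXT "foo<|im_end|>bar" }
// is split by the loop above into
//   { RAW_TEXT "foo", TOKEN id("<|im_end|>"), RAW_TEXT "bar" }
// control/unknown tokens are only matched when parse_special == true, while user-defined
// tokens are always matched; LSTRIP/RSTRIP attributes trim whitespace adjacent to the match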
for (size_t i = 0; i < res.size(); i++) {
tokens[i] = res[i];
// NOTE: avoid ever using this except for building the token_to_piece caches
std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache
const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
piece.resize(n_chars);
}
return res.size();
return piece;
}
static void llama_escape_whitespace(std::string & text) {
replace_all(text, " ", "\xe2\x96\x81");
}
static void llama_unescape_whitespace(std::string & word) {
replace_all(word, "\xe2\x96\x81", " ");
}
static std::string llama_decode_text(const std::string & text) {
......@@ -1766,11 +2303,185 @@ static std::string llama_decode_text(const std::string & text) {
return decoded_text;
}
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
std::vector<llama_token> llama_vocab::impl::tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special) const {
GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
std::vector<llama_token> output;
std::forward_list<fragment_buffer_variant> fragment_buffer;
if (!raw_text.empty()) {
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
tokenizer_st_partition(fragment_buffer, parse_special);
}
switch (get_type()) {
case LLAMA_VOCAB_TYPE_SPM:
{
// OG tokenizer behavior:
//
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
bool is_prev_special = true; // prefix with space if first token
if (add_special && add_bos) {
GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(special_bos_id);
is_prev_special = true;
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text;
// prefix with space if previous is special
if (add_space_prefix && is_prev_special) {
text = ' ';
}
text += fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
llama_escape_whitespace(text);
llm_tokenizer_spm_session session(vocab);
session.tokenize(text, output);
is_prev_special = false;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
is_prev_special = true;
}
}
if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
if (add_special && add_eos) {
GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(special_eos_id);
}
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
// the session calls methods that do not exist in the base llm_tokenizer,
// so we cast the stored tokenizer to the BPE tokenizer object here
if (add_special) {
session.append_bos(output);
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
session.append(fragment.token, output);
}
}
if (add_special) {
session.append_eos(output);
session.check_double_bos_eos(output);
}
} break;
case LLAMA_VOCAB_TYPE_WPM:
{
if (add_special) {
GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(special_bos_id);
}
llm_tokenizer_wpm_session session(vocab);
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
}
if (add_special) {
GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
output.push_back(special_sep_id);
}
} break;
case LLAMA_VOCAB_TYPE_UGM:
{
if (add_special && add_bos) {
GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(special_bos_id);
}
llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
}
if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
if (add_special && add_eos) {
GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(special_eos_id);
}
} break;
case LLAMA_VOCAB_TYPE_RWKV:
{
llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
}
} break;
case LLAMA_VOCAB_TYPE_NONE:
GGML_ABORT("fatal error");
}
return output;
}
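As a rough usage sketch of the two flags handled above (the literal text, and the assumption that the vocab defines "<|im_end|>", are for illustration only):

// hypothetical illustration; `vocab` is assumed to be a fully loaded llama_vocab
std::vector<llama_token> plain   = vocab.tokenize("hi <|im_end|>", /*add_special=*/true, /*parse_special=*/false);
std::vector<llama_token> special = vocab.tokenize("hi <|im_end|>", /*add_special=*/true, /*parse_special=*/true);
// with parse_special == false the marker is split like ordinary text; with true it is
// emitted as a single token id; add_special controls BOS/EOS/SEP handling per vocab type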
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
const llama_token_attr attr = llama_token_get_attr_impl(vocab, token);
const llama_token_attr attr = token_get_attr(token);
if (!special && (attr & attr_special)) {
return 0;
}
......@@ -1791,7 +2502,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
// if we have a cache - use it
{
const auto & cache = vocab.cache_token_to_piece;
const auto & cache = cache_token_to_piece;
if (!cache.empty()) {
const auto & result = cache.at(token);
......@@ -1799,9 +2510,9 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
}
}
if (0 <= token && token < (int32_t) vocab.id_to_token.size()) {
const std::string & token_text = vocab.id_to_token[token].text;
switch (llama_vocab_get_type(vocab)) {
if (0 <= token && token < (int32_t) id_to_token.size()) {
const std::string & token_text = id_to_token[token].text;
switch (get_type()) {
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
......@@ -1816,7 +2527,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
return _try_copy(result.data(), result.size());
}
if (attr & LLAMA_TOKEN_ATTR_BYTE) {
char byte = (char) llama_token_to_byte(vocab, token);
char byte = (char) token_to_byte(token);
return _try_copy((char*) &byte, 1);
}
break;
......@@ -1852,43 +2563,46 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
return 0;
}
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
return cache_token_to_piece.at(token);
}
int32_t llama_vocab::impl::detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) {
if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
bool unparse_special) const {
if (type == LLAMA_VOCAB_TYPE_NONE) {
return 0;
}
GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
int32_t avail = text_len_max;
int32_t total = 0;
// remove the leading space
bool remove_space = vocab.tokenizer_add_space_prefix;
bool remove_space = add_space_prefix;
if (remove_special && vocab.tokenizer_add_bos) {
if (n_tokens > 0 && tokens[0] == vocab.special_bos_id) {
if (remove_special && add_bos) {
if (n_tokens > 0 && tokens[0] == special_bos_id) {
remove_space = false;
n_tokens--;
tokens++;
}
}
if (remove_special && vocab.tokenizer_add_eos) {
if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
if (remove_special && add_eos) {
if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
n_tokens--;
}
}
for (int32_t i = 0; i < n_tokens; ++i) {
GGML_ASSERT(avail >= 0);
int32_t n_chars = llama_token_to_piece_impl(vocab, tokens[i], text, avail, remove_space, unparse_special);
int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
remove_space = false;
if (n_chars < 0) {
avail = 0;
......@@ -1904,7 +2618,7 @@ int32_t llama_detokenize_impl(
return -total;
}
if (vocab.tokenizer_clean_spaces) {
if (clean_spaces) {
text -= total; // restart text
// first pass: characters ?!., //TODO: where do these characters come from?
......@@ -1965,13 +2679,321 @@ int32_t llama_detokenize_impl(
return total <= text_len_max ? total : -total;
}
std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
void llama_vocab::impl::print_info() const {
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
// special tokens
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
for (const auto & id : special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
}
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
}
llama_vocab::~llama_vocab() {
}
void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
pimpl->load(ml, kv);
}
enum llama_vocab_type llama_vocab::get_type() const {
return pimpl->type;
}
enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
return pimpl->pre_type;
}
uint32_t llama_vocab::n_tokens() const {
return (uint32_t) pimpl->id_to_token.size();
}
uint32_t llama_vocab::n_token_types() const {
return (uint32_t) pimpl->n_token_types;
}
std::string llama_vocab::type_name() const {
return pimpl->type_name();
}
bool llama_vocab::is_normal(llama_token id) const {
return pimpl->is_normal(id);
}
bool llama_vocab::is_unknown(llama_token id) const {
return pimpl->is_unknown(id);
}
bool llama_vocab::is_control(llama_token id) const {
return pimpl->is_control(id);
}
bool llama_vocab::is_byte(llama_token id) const {
return pimpl->is_byte(id);
}
bool llama_vocab::is_user_defined(llama_token id) const {
return pimpl->is_user_defined(id);
}
bool llama_vocab::is_unused(llama_token id) const {
return pimpl->is_unused(id);
}
bool llama_vocab::is_eog(llama_token id) const {
return pimpl->is_eog(id);
}
uint8_t llama_vocab::token_to_byte(llama_token id) const {
return pimpl->token_to_byte(id);
}
llama_token llama_vocab::byte_to_token(uint8_t ch) const {
GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
static const char * hex = "0123456789ABCDEF";
switch (get_type()) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
auto token = pimpl->token_to_id.find(buf);
if (token != pimpl->token_to_id.end()) {
return (*token).second;
}
// Try to fall back to just the byte as a string
const char buf2[2] = { (char)ch, 0 };
return pimpl->token_to_id.at(buf2);
}
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: {
return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
}
default:
GGML_ABORT("fatal error");
}
}
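A hedged sketch of what the byte fallback above resolves for one concrete byte (the resulting token id depends entirely on the model):

// for SPM/UGM vocabs, byte 0x41 ('A') is first looked up as the literal text "<0x41>",
// then as the one-character string "A"; for WPM/BPE it goes through unicode_byte_to_utf8()
llama_token t = vocab.byte_to_token(0x41);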
llama_token llama_vocab::text_to_token(const std::string & text) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
auto it = pimpl->token_to_id.find(text);
if (it != pimpl->token_to_id.end()) {
return (*it).second;
}
return LLAMA_TOKEN_NULL;
}
const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return pimpl->id_to_token.at(id);
}
const char * llama_vocab::token_get_text(llama_token id) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return pimpl->id_to_token.at(id).text.c_str();
}
float llama_vocab::token_get_score(llama_token id) const {
GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
return pimpl->id_to_token.at(id).score;
}
llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
return pimpl->token_get_attr(id);
}
llama_token llama_vocab::token_bos() const {
return pimpl->special_bos_id;
}
llama_token llama_vocab::token_eos() const {
return pimpl->special_eos_id;
}
llama_token llama_vocab::token_eot() const {
return pimpl->special_eot_id;
}
llama_token llama_vocab::token_eom() const {
return pimpl->special_eom_id;
}
llama_token llama_vocab::token_unk() const {
return pimpl->special_unk_id;
}
llama_token llama_vocab::token_sep() const {
return pimpl->special_sep_id;
}
llama_token llama_vocab::token_nl() const {
return pimpl->linefeed_id;
}
llama_token llama_vocab::token_pad() const {
return pimpl->special_pad_id;
}
llama_token llama_vocab::token_prefix() const {
return pimpl->special_fim_pre_id;
}
llama_token llama_vocab::token_middle() const {
return pimpl->special_fim_mid_id;
}
llama_token llama_vocab::token_suffix() const {
return pimpl->special_fim_suf_id;
}
llama_token llama_vocab::token_fim_pre() const {
return pimpl->special_fim_pre_id;
}
llama_token llama_vocab::token_fim_suf() const {
return pimpl->special_fim_suf_id;
}
llama_token llama_vocab::token_fim_mid() const {
return pimpl->special_fim_mid_id;
}
llama_token llama_vocab::token_fim_pad() const {
return pimpl->special_fim_pad_id;
}
llama_token llama_vocab::token_fim_rep() const {
return pimpl->special_fim_rep_id;
}
llama_token llama_vocab::token_fim_sep() const {
return pimpl->special_fim_sep_id;
}
bool llama_vocab::get_add_space_prefix() const {
return pimpl->add_space_prefix;
}
bool llama_vocab::get_add_bos() const {
return pimpl->add_bos;
}
bool llama_vocab::get_add_eos() const {
return pimpl->add_eos;
}
bool llama_vocab::get_ignore_merges() const {
return pimpl->ignore_merges;
}
bool llama_vocab::get_clean_spaces() const {
return pimpl->clean_spaces;
}
bool llama_vocab::get_remove_extra_whitespaces() const {
return pimpl->remove_extra_whitespaces;
}
bool llama_vocab::get_escape_whitespaces() const {
return pimpl->escape_whitespaces;
}
bool llama_vocab::get_treat_whitespace_as_suffix() const {
return pimpl->treat_whitespace_as_suffix;
}
int llama_vocab::max_token_len() const {
return pimpl->max_token_len;
}
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
GGML_ASSERT(token_left.find(' ') == std::string::npos);
GGML_ASSERT(token_left.find('\n') == std::string::npos);
GGML_ASSERT(token_right.find(' ') == std::string::npos);
GGML_ASSERT(token_right.find('\n') == std::string::npos);
auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == pimpl->bpe_ranks.end()) {
return -1;
}
return it->second;
}
int32_t llama_vocab::tokenize(
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) const {
auto res = tokenize(std::string(text, text_len), add_special, parse_special);
if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size());
}
for (size_t i = 0; i < res.size(); i++) {
tokens[i] = res[i];
}
return res.size();
}
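The negative return value above implies the usual two-pass call pattern on the caller side; a minimal sketch, assuming `vocab` is a loaded llama_vocab and `text` is a std::string (the initial buffer size is arbitrary):

std::vector<llama_token> toks(8);
int32_t n = vocab.tokenize(text.c_str(), (int32_t) text.size(),
                           toks.data(), (int32_t) toks.size(),
                           /*add_special=*/true, /*parse_special=*/true);
if (n < 0) {
    toks.resize(-n); // the first call reported the required size
    n = vocab.tokenize(text.c_str(), (int32_t) text.size(),
                       toks.data(), (int32_t) toks.size(), true, true);
}
toks.resize(n);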
std::vector<llama_token> llama_vocab::tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special) const {
return pimpl->tokenize(raw_text, add_special, parse_special);
}
const std::string & llama_vocab::token_to_piece(llama_token token) const {
return pimpl->token_to_piece(token);
}
int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
return pimpl->token_to_piece(token, buf, length, lstrip, special);
}
int32_t llama_vocab::detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const {
return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
......@@ -1980,3 +3002,243 @@ std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return text;
}
void llama_vocab::print_info() const {
pimpl->print_info();
}
//
// interface implementation
//
int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
return vocab->n_tokens();
}
// deprecated
int32_t llama_n_vocab(const struct llama_vocab * vocab) {
return llama_vocab_n_tokens(vocab);
}
enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
return vocab->get_type();
}
const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
return vocab->token_get_text(token);
}
float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
return vocab->token_get_score(token);
}
enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
return vocab->token_get_attr(token);
}
bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
return vocab->is_eog(token);
}
bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
return vocab->is_control(token);
}
llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
return vocab->token_bos();
}
llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
return vocab->token_eos();
}
llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
return vocab->token_eot();
}
// deprecated
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
return vocab->token_bos();
}
llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
return vocab->token_sep();
}
llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
return vocab->token_nl();
}
llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
return vocab->token_pad();
}
bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
return vocab->get_add_bos();
}
bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
return vocab->get_add_eos();
}
llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
return vocab->token_fim_pre();
}
llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
return vocab->token_fim_suf();
}
llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
return vocab->token_fim_mid();
}
llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
return vocab->token_fim_pad();
}
llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
return vocab->token_fim_rep();
}
llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
return vocab->token_fim_sep();
}
// deprecated
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
return llama_vocab_get_text(vocab, token);
}
// deprecated
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
return llama_vocab_get_score(vocab, token);
}
// deprecated
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
return llama_vocab_get_attr(vocab, token);
}
// deprecated
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
return llama_vocab_is_eog(vocab, token);
}
// deprecated
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
return llama_vocab_is_control(vocab, token);
}
// deprecated
llama_token llama_token_bos(const struct llama_vocab * vocab) {
return llama_vocab_bos(vocab);
}
// deprecated
llama_token llama_token_eos(const struct llama_vocab * vocab) {
return llama_vocab_eos(vocab);
}
// deprecated
llama_token llama_token_eot(const struct llama_vocab * vocab) {
return llama_vocab_eot(vocab);
}
// deprecated
llama_token llama_token_cls(const struct llama_vocab * vocab) {
//return llama_vocab_cls(vocab);
return llama_vocab_bos(vocab); // avoid deprecation warning
}
// deprecated
llama_token llama_token_sep(const struct llama_vocab * vocab) {
return llama_vocab_sep(vocab);
}
// deprecated
llama_token llama_token_nl (const struct llama_vocab * vocab) {
return llama_vocab_nl(vocab);
}
// deprecated
llama_token llama_token_pad(const struct llama_vocab * vocab) {
return llama_vocab_pad(vocab);
}
// deprecated
bool llama_add_bos_token(const struct llama_vocab * vocab) {
return llama_vocab_get_add_bos(vocab);
}
// deprecated
bool llama_add_eos_token(const struct llama_vocab * vocab) {
return llama_vocab_get_add_eos(vocab);
}
// deprecated
llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
return llama_vocab_fim_pre(vocab);
}
// deprecated
llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
return llama_vocab_fim_suf(vocab);
}
// deprecated
llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
return llama_vocab_fim_mid(vocab);
}
// deprecated
llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
return llama_vocab_fim_pad(vocab);
}
// deprecated
llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
return llama_vocab_fim_rep(vocab);
}
// deprecated
llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
return llama_vocab_fim_sep(vocab);
}
//
// tokenization
//
int32_t llama_tokenize(
const struct llama_vocab * vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) {
return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}
int32_t llama_token_to_piece(
const struct llama_vocab * vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special) {
return vocab->token_to_piece(token, buf, length, lstrip, special);
}
int32_t llama_detokenize(
const struct llama_vocab * vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) {
return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
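Putting the two C entry points above together, a minimal round-trip sketch (it assumes the buffers are large enough and that `vocab` came from llama_model_get_vocab() on a loaded model):

llama_token toks[64];
int32_t n_tok = llama_tokenize(vocab, "hello world", 11, toks, 64,
                               /*add_special=*/true, /*parse_special=*/false);
char buf[256];
int32_t n_chr = llama_detokenize(vocab, toks, n_tok, buf, (int32_t) sizeof(buf),
                                 /*remove_special=*/true, /*unparse_special=*/false);
// buf now holds the first n_chr bytes of the reconstructed text (no null terminator is written)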
......@@ -4,179 +4,122 @@
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
switch (type) {
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
default: return "unknown";
}
}
struct llm_tokenizer;
#include <memory>
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct LLM_KV;
struct llama_model_loader;
struct llama_vocab {
struct token_data {
token text;
float score;
tattr attr;
std::string text;
float score;
llama_token_attr attr;
};
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
llama_vocab();
~llama_vocab();
void load(llama_model_loader & ml, const LLM_KV & kv);
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
enum llama_vocab_type get_type() const;
enum llama_vocab_pre_type get_pre_type() const;
int max_token_len = 0; // used for optimizing longest token search
uint32_t n_tokens() const;
uint32_t n_token_types() const;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::string type_name() const;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
bool is_normal (llama_token id) const;
bool is_unknown (llama_token id) const;
bool is_control (llama_token id) const;
bool is_byte (llama_token id) const;
bool is_user_defined(llama_token id) const;
bool is_unused (llama_token id) const;
bool is_eog (llama_token id) const;
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
uint8_t token_to_byte(llama_token id) const;
llama_token byte_to_token(uint8_t ch) const;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL;
llama_token text_to_token(const std::string & text) const;
id linefeed_id = 13;
const token_data & get_token_data(llama_token id) const;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
const char * token_get_text (llama_token id) const;
float token_get_score(llama_token id) const;
llama_token_attr token_get_attr (llama_token id) const;
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
llama_token token_bos() const;
llama_token token_eos() const;
llama_token token_eot() const;
llama_token token_eom() const;
llama_token token_unk() const;
llama_token token_sep() const;
llama_token token_nl () const;
llama_token token_pad() const;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
llama_token token_prefix() const;
llama_token token_middle() const;
llama_token token_suffix() const;
std::vector<char> precompiled_charsmap;
llama_token token_fim_pre() const;
llama_token token_fim_suf() const;
llama_token token_fim_mid() const;
llama_token token_fim_pad() const;
llama_token token_fim_rep() const;
llama_token token_fim_sep() const;
llm_tokenizer * tokenizer = nullptr;
bool get_add_space_prefix () const;
bool get_add_bos () const;
bool get_add_eos () const;
bool get_ignore_merges () const;
bool get_clean_spaces () const;
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
llama_vocab() = default;
~llama_vocab();
int max_token_len() const;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
int32_t tokenize(
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special) const;
std::vector<llama_token> tokenize(
const std::string & raw_text,
bool add_special,
bool parse_special = false) const;
// does not write null-terminator to buf
int32_t token_to_piece(
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special) const;
// use cached data
const std::string & token_to_piece(llama_token token) const;
int32_t detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special) const;
std::string detokenize(
const std::vector<llama_token> & tokens,
bool special) const;
void print_info() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
};
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
std::string llama_detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens,
bool special);
......@@ -12,18 +12,17 @@
#include <algorithm>
#include <cassert>
#include <codecvt>
#include <cstddef>
#include <cstdint>
#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
#include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
......@@ -641,7 +640,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
try {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
}
catch (const std::invalid_argument & /*ex*/) {
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}
}
return result;
}
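A small sketch of the behavior this hunk introduces (assuming unicode_cpt_from_utf8 itself is unchanged): invalid bytes no longer propagate an exception out of the loop, they become U+FFFD.

// "\x80" is not a valid UTF-8 lead byte; previously std::invalid_argument escaped the caller
std::vector<uint32_t> cpts = unicode_cpts_from_utf8("a\x80" "b");
// expected contents under this patch: { 0x61 /*'a'*/, 0xFFFD, 0x62 /*'b'*/ }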
......@@ -724,7 +730,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
const auto cpts = unicode_cpts_from_utf8(text);
// generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
// ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
// ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
std::string text_collapsed;
if (need_collapse) {
// collapse all unicode categories
......
......@@ -14,6 +14,7 @@ package llama
#include "llama.h"
#include "clip.h"
#include "llava.h"
#include "gguf.h"
#include "mllama.h"
#include "sampling_ext.h"
......@@ -293,29 +294,29 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
}
func (m *Model) NumVocab() int {
return int(C.llama_n_vocab(m.c))
return int(C.llama_n_vocab(m.Vocab()))
}
func (m *Model) TokenIsEog(token int) bool {
return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
return bool(C.llama_token_is_eog(m.Vocab(), C.llama_token(token)))
}
func (m *Model) AddBOSToken() bool {
return bool(C.llama_add_bos_token(m.c))
return bool(C.llama_add_bos_token(m.Vocab()))
}
func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float32, threads int) error {
cLoraPath := C.CString(loraPath)
defer C.free(unsafe.Pointer(cLoraPath))
loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
loraAdapter := C.llama_adapter_lora_init(m.c, cLoraPath)
if loraAdapter == nil {
return errors.New("unable to load lora")
}
err := -1
if loraAdapter != nil {
err = int(C.llama_lora_adapter_set(context.c, loraAdapter, C.float(scale)))
err = int(C.llama_set_adapter_lora(context.c, loraAdapter, C.float(scale)))
}
if err != 0 {
return errors.New("error applying lora from file")
......@@ -324,6 +325,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
return nil
}
func (m *Model) Vocab() *C.struct_llama_vocab {
return C.llama_model_get_vocab(m.c)
}
type Batch struct {
c C.struct_llama_batch
batchSize int
......@@ -414,7 +419,7 @@ func (m *Model) TokenToPiece(token int) string {
tokenLen := 12
buf := make([]byte, tokenLen)
tokenLen = int(C.llama_token_to_piece(
m.c,
m.Vocab(),
C.int32_t(token),
(*C.char)(unsafe.Pointer(&buf[0])),
C.int32_t(tokenLen),
......@@ -426,7 +431,7 @@ func (m *Model) TokenToPiece(token int) string {
buf = make([]byte, tokenLen)
C.llama_token_to_piece(
m.c,
m.Vocab(),
C.int32_t(token),
(*C.char)(unsafe.Pointer(&buf[0])),
C.int32_t(tokenLen),
......@@ -444,7 +449,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
defer C.free(unsafe.Pointer(cText))
result := C.llama_tokenize(
m.c,
m.Vocab(),
cText,
C.int32_t(len(text)),
&cTokens[0],
......@@ -458,7 +463,7 @@ func (m *Model) Tokenize(text string, addSpecial bool, parseSpecial bool) ([]int
maxTokens = int(-result)
cTokens = make([]C.llama_token, maxTokens)
result = C.llama_tokenize(
m.c,
m.Vocab(),
cText,
C.int32_t(len(text)),
&cTokens[0],
......
......@@ -5,6 +5,7 @@
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
......
......@@ -10,7 +10,7 @@ Subject: [PATCH] cuda
3 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index e2d6c405..a12172dc 100644
index dba7be33..1ca40b2c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
......@@ -22,10 +22,10 @@ index e2d6c405..a12172dc 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 0b06be72..be29e979 100644
index ebb2ccae..b094929b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
......@@ -34,10 +34,10 @@ index 0b06be72..be29e979 100644
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index a85502ee..cd8ef741 100644
index c550142a..fd9a4e77 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
free(ctx);
......
......@@ -4,17 +4,17 @@ Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer
---
src/llama-model.cpp | 14 +++-----------
src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 405e0528..00b80c52 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1249,16 +1249,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66..a4eee9b8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
......@@ -23,19 +23,19 @@ index 405e0528..00b80c52 100644
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
+ if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1373,7 +1364,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (type == LLAMA_VOCAB_TYPE_SPM) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
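The net effect of this hunk is that an unrecognized tokenizer_pre value no longer aborts loading; the loader logs a warning and falls back to the default pre-tokenizer. A self-contained sketch of that lookup-with-fallback pattern (illustrative names only, not the vendored code):

#include <cstdio>
#include <map>
#include <string>

enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_QWEN2 }; // illustrative subset

static pre_type resolve_pre_type(const std::string & name) {
    static const std::map<std::string, pre_type> known = {
        { "default", PRE_TYPE_DEFAULT },
        { "qwen2",   PRE_TYPE_QWEN2   },
    };
    const auto it = known.find(name);
    if (it == known.end()) {
        // previously this case threw std::runtime_error and failed the load
        fprintf(stderr, "missing or unrecognized pre-tokenizer type '%s', using 'default'\n", name.c_str());
        return PRE_TYPE_DEFAULT;
    }
    return it->second;
}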
......@@ -9,10 +9,10 @@ Subject: [PATCH] embeddings
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..b9c4a5bf 100644
index 671d2a81..47e79ed4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
......@@ -22,10 +22,10 @@ index 38a55fb2..b9c4a5bf 100644
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index ea78ea48..4eb3f6b9 100644
index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -10876,7 +10876,6 @@ static int llama_decode_internal(
@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
......@@ -33,7 +33,7 @@ index ea78ea48..4eb3f6b9 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -10884,12 +10883,15 @@ static int llama_decode_internal(
@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break;
}
}
......
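The embeddings hunk above removes the pooling-type check and instead locates the pooled-embeddings tensor by walking the graph backwards until it finds a node named "result_embd_pooled". A small helper expressing that lookup with the public ggml graph accessors (a sketch):

#include <cstring>
#include "ggml.h"

// Return the last graph node with the given name, or nullptr if absent.
static struct ggml_tensor * find_last_node_by_name(struct ggml_cgraph * gf, const char * name) {
    for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        if (strcmp(node->name, name) == 0) {
            return node;
        }
    }
    return nullptr;
}

// usage, mirroring the hunk: embd = find_last_node_by_name(gf, "result_embd_pooled");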
......@@ -8,10 +8,10 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..b3c1829f 100644
index 76d4a785..205af1eb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@
@@ -58,6 +58,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
......@@ -31,7 +31,7 @@ index 3cd0d2fa..b3c1829f 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
......@@ -62,7 +62,7 @@ index 3cd0d2fa..b3c1829f 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
......
......@@ -11,21 +11,21 @@ tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
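Concretely, the two stored values act as blend weights: the hidden state saved at the skip-connection source is scaled by bskcn_tv and the current residual input by 1 - bskcn_tv, and the two are summed (the ggml_add/ggml_mul pair further down in this patch). A plain C++ sketch of that blend for a single token, with hypothetical buffer names:

// bskcn_tv[0] = bskcn_tv, bskcn_tv[1] = 1 - bskcn_tv, as described above.
static void blend_skip_connection(const float bskcn_tv[2],
                                  const float * saved_state,   // hidden state captured at the source layer
                                  const float * current_state, // residual input at the target layer
                                  float       * out,
                                  int           n_embd) {
    for (int i = 0; i < n_embd; ++i) {
        out[i] = bskcn_tv[0] * saved_state[i] + bskcn_tv[1] * current_state[i];
    }
}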
---
src/llama-arch.cpp | 53 +++++++----
src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 16 ++++
src/llama-model.cpp | 44 +++++++++++
src/llama-model.h | 3 +
src/llama.cpp | 185 +++++++++++++++++++++++++++++++++++++
8 files changed, 258 insertions(+), 16 deletions(-)
src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++-
8 files changed, 236 insertions(+), 1 deletion(-)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 007d79f8..5b376c5e 100644
index 97a1e7e5..a1e0ebcc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -59,6 +59,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
......@@ -33,48 +33,16 @@ index 007d79f8..5b376c5e 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -106,22 +107,23 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
- { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1240,6 +1242,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
......@@ -96,9 +64,9 @@ index 007d79f8..5b376c5e 100644
+ },
+ },
{
LLM_ARCH_UNKNOWN,
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1372,6 +1392,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -107,10 +75,10 @@ index 007d79f8..5b376c5e 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 45e458bb..eac7055b 100644
index 122fdceb..77919578 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -63,6 +63,7 @@ enum llm_arch {
@@ -65,6 +65,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
......@@ -118,7 +86,7 @@ index 45e458bb..eac7055b 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@@ -126,6 +127,7 @@ enum llm_kv {
@@ -129,6 +130,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
......@@ -126,7 +94,7 @@ index 45e458bb..eac7055b 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -305,6 +307,7 @@ enum llm_tensor {
@@ -311,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
......@@ -135,7 +103,7 @@ index 45e458bb..eac7055b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index c4053469..450738da 100644
index ea87b295..f3955de9 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
......@@ -152,10 +120,10 @@ index c4053469..450738da 100644
+}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a29f20ec..fd898e27 100644
index 1fe45410..1bdcdfd5 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -52,6 +52,8 @@ struct llama_hparams {
@@ -50,6 +50,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
......@@ -164,7 +132,7 @@ index a29f20ec..fd898e27 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -134,6 +136,9 @@ struct llama_hparams {
@@ -133,6 +135,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
......@@ -175,23 +143,23 @@ index a29f20ec..fd898e27 100644
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 7743b465..422524a8 100644
index 05d58ad9..1252aca1 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -364,6 +364,7 @@ namespace GGUFMeta {
@@ -439,6 +439,7 @@ namespace GGUFMeta {
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+ template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 00b80c52..306c557d 100644
index 36a0a009..ad1315c6 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1091,6 +1091,21 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
......@@ -200,52 +168,19 @@ index 00b80c52..306c557d 100644
+ for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr[i];
+ bskcn.fill(0);
+ auto kv = LLM_KV(model.arch);
+ auto kv = LLM_KV(arch);
+ ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ case 64: type = LLM_TYPE_22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2065,6 +2080,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index ce038932..c1b9c0a1 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -54,6 +54,7 @@ enum llm_type {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -275,6 +276,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4eb3f6b9..7dec50ae 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2206,6 +2206,35 @@ static bool llm_load_tensors(
@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
......@@ -256,16 +191,16 @@ index 4eb3f6b9..7dec50ae 100644
+ } break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
......@@ -277,16 +212,53 @@ index 4eb3f6b9..7dec50ae 100644
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -10226,6 +10255,158 @@ struct llm_build_context {
return gf;
}
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama.cpp b/src/llama.cpp
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
......@@ -333,7 +305,7 @@ index 4eb3f6b9..7dec50ae 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
......@@ -422,25 +394,18 @@ index 4eb3f6b9..7dec50ae 100644
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
return gf;
}
@@ -10660,6 +10841,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
......
......@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index be29e979..aaa79ea4 100644
index b094929b..36165840 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
......
......@@ -15,27 +15,27 @@ remaining is to implement the cross attention mask
examples/llava/llava.cpp | 5 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.cpp | 44 ++++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 19 ++-
src/llama-context.cpp | 28 ++--
src/llama-context.h | 2 +
src/llama-cparams.h | 1 +
src/llama-hparams.cpp | 8 +-
src/llama-hparams.h | 4 +
src/llama-kv-cache.cpp | 33 ++++
src/llama-hparams.cpp | 6 +
src/llama-hparams.h | 5 +
src/llama-kv-cache.cpp | 13 +-
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 59 ++-----
src/llama-model.h | 51 ++++++
src/llama-model.cpp | 65 ++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
src/llama.cpp | 307 +++++++++++++++++++++++++++++++++-
17 files changed, 508 insertions(+), 56 deletions(-)
src/llama.cpp | 262 +++++++++++++++++++++++++++++++++-
17 files changed, 452 insertions(+), 22 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 16f30c56..0f0f3f62 100644
index 518aad3f..f0e484a1 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -429,7 +429,7 @@ struct llava_embd_batch {
@@ -445,7 +445,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
......@@ -44,7 +44,7 @@ index 16f30c56..0f0f3f62 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -441,6 +441,7 @@ struct llava_embd_batch {
@@ -457,6 +457,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
......@@ -52,7 +52,7 @@ index 16f30c56..0f0f3f62 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
......@@ -62,7 +62,7 @@ index 16f30c56..0f0f3f62 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 7ddd178b..899d16f2 100644
index 955ed505..95036ef8 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry {
......@@ -79,10 +79,10 @@ index 7ddd178b..899d16f2 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index a0d5ba5d..9f411960 100644
index 47919602..cc948005 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -250,6 +250,7 @@ extern "C" {
@@ -249,6 +249,7 @@ extern "C" {
llama_token * token;
float * embd;
......@@ -90,7 +90,7 @@ index a0d5ba5d..9f411960 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -347,6 +348,7 @@ extern "C" {
@@ -343,6 +344,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
......@@ -98,9 +98,9 @@ index a0d5ba5d..9f411960 100644
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -426,6 +428,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
@@ -443,6 +445,10 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
+ // TODO (jmorganca): this should most likely be passed in as part of a batch
+ // and not set on the context for all batches.
......@@ -110,7 +110,7 @@ index a0d5ba5d..9f411960 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5b376c5e..b35aeb31 100644
index a1e0ebcc..b6f20286 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
......@@ -121,15 +121,15 @@ index 5b376c5e..b35aeb31 100644
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
@@ -124,6 +125,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -220,6 +222,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
......@@ -170,7 +170,7 @@ index 5b376c5e..b35aeb31 100644
{
LLM_ARCH_DECI,
{
@@ -1393,6 +1429,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
......@@ -186,7 +186,7 @@ index 5b376c5e..b35aeb31 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index eac7055b..e8235ae0 100644
index 77919578..ec742224 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -10,6 +10,7 @@
......@@ -197,7 +197,7 @@ index eac7055b..e8235ae0 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -128,6 +129,7 @@ enum llm_kv {
@@ -131,6 +132,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
......@@ -205,7 +205,7 @@ index eac7055b..e8235ae0 100644
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -308,6 +310,14 @@ enum llm_tensor {
@@ -314,6 +316,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
......@@ -249,10 +249,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b9c4a5bf..9d0e7ca3 100644
index 47e79ed4..7b22fe13 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,10 +71,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
@@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
}
if (ubatch.embd) {
......@@ -275,7 +275,30 @@ index b9c4a5bf..9d0e7ca3 100644
}
if (ubatch.pos && lctx.inp_pos) {
@@ -653,6 +662,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
- const auto & vocab = lctx.model.vocab;
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens();
+ const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
void llama_output_reorder(struct llama_context & ctx) {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.vocab.n_tokens();
+ const uint32_t n_vocab = ctx.model.hparams.n_vocab;
const uint32_t n_embd = ctx.model.hparams.n_embd;
const int32_t n_outputs = ctx.n_outputs;
@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
......@@ -286,8 +309,26 @@ index b9c4a5bf..9d0e7ca3 100644
void llama_synchronize(struct llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get());
@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}
- return ctx->logits + j*ctx->model.vocab.n_tokens();
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -886,7 +898,7 @@ struct llama_data_write {
}
void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
write(&logits_size, sizeof(logits_size));
diff --git a/src/llama-context.h b/src/llama-context.h
index 0d163c47..4980a60e 100644
index a9268b29..cf12c9d7 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context {
......@@ -312,7 +353,7 @@ index 252012f3..9681e5a0 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 450738da..42f8a58f 100644
index f3955de9..0b841028 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@
......@@ -328,18 +369,25 @@ index 450738da..42f8a58f 100644
}
GGML_ABORT("fatal error");
-}
\ No newline at end of file
+}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index fd898e27..f826cd9a 100644
index 1bdcdfd5..05383046 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -53,6 +53,7 @@ struct llama_hparams {
@@ -41,6 +41,7 @@ struct llama_hparams {
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;
// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -51,6 +52,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
......@@ -347,65 +395,45 @@ index fd898e27..f826cd9a 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -139,6 +140,9 @@ struct llama_hparams {
@@ -138,6 +140,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers
+ // cross attention layers
+ bool cross_attention_layers(uint32_t il) const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 53379253..cf814dbe 100644
index feffdf0d..b541c5a3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -72,6 +72,39 @@ bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
@@ -91,8 +91,17 @@ bool llama_kv_cache_init(
return false;
}
for (int i = 0; i < n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ ggml_tensor * k, *v;
+
+ // for cross attention layers
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+ const llama_model::buft_list_t * buft_list;
+ if (offload) {
+ buft_list = model.dev_layer.at(i).buft_list;
+ } else {
+ buft_list = &model.cpu_buft_list;
+ }
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
+ return k;
+ }
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
+ });
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+ return false;
+ }
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+ continue;
+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+ } else {
+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ }
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 422524a8..b12d6566 100644
index 1252aca1..45d08721 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -240,6 +240,8 @@ namespace GGUFMeta {
@@ -315,6 +315,8 @@ namespace GGUFMeta {
return true;
}
......@@ -415,80 +443,47 @@ index 422524a8..b12d6566 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 306c557d..4f9bbf90 100644
index ad1315c6..21819080 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -146,46 +146,6 @@ std::string llama_model_ftype_name(const llama_model & model) {
return llama_model_ftype_name(model.ftype);
}
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
- }
-
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
- return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
- return cur_buft;
- }
- }
-
- throw std::runtime_error(format("no suitable buffer type found"));
-}
-
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
return select_buft(
*model.dev_layer.at(il).buft_list,
@@ -312,9 +272,11 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
@@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -363,7 +325,7 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
@@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -405,6 +367,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
@@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
} break;
......@@ -497,145 +492,44 @@ index 306c557d..4f9bbf90 100644
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_11B; break;
+ case 100: model.type = e_model::MODEL_90B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ case 40: type = LLM_TYPE_11B; break;
+ case 100: type = LLM_TYPE_90B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2062,6 +2034,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index c1b9c0a1..5b23e2ba 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include "ggml-cpp.h"
#include <vector>
+#include <stdexcept>
// available models
// TODO: this enum does not follow the enum naming convention
@@ -62,6 +63,7 @@ enum llm_type {
MODEL_40B,
MODEL_65B,
MODEL_70B,
+ MODEL_90B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
@@ -278,6 +280,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
@@ -376,6 +388,45 @@ std::string llama_model_arch_name (const llama_model & model);
std::string llama_model_type_name (const llama_model & model);
std::string llama_model_ftype_name(const llama_model & model);
+template<typename F>
+bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context");
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error("no suitable buffer type found");
+}
+
// used by llama_adapter_cvec
ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 42974f8f..27def6fd 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -629,7 +629,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 7dec50ae..bac66c24 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -563,6 +563,52 @@ static bool llm_load_tensors(
@@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
+ case LLM_ARCH_MLLAMA:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
+
+ // output
+ {
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+ auto & layer = layers[i];
+
+ if (hparams.cross_attention_layers(i)) {
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
......@@ -667,17 +561,72 @@ index 7dec50ae..bac66c24 100644
+ } break;
case LLM_ARCH_DECI:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2514,7 +2560,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>
struct llama_model_loader;
@@ -63,6 +64,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -284,6 +286,16 @@ struct llama_layer {
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
+ LLAMA_LOG_WARN("%s: vocab mismatch %u !- %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size());
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
if (params.vocab_only) {
@@ -2598,6 +2644,21 @@ static struct ggml_tensor * llm_build_inp_embd(
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
......@@ -699,7 +648,7 @@ index 7dec50ae..bac66c24 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -3593,6 +3654,7 @@ struct llm_build_context {
@@ -1157,6 +1172,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
......@@ -707,12 +656,12 @@ index 7dec50ae..bac66c24 100644
}
void free() {
@@ -4074,6 +4136,240 @@ struct llm_build_context {
@@ -1639,6 +1655,240 @@ struct llm_build_context {
return gf;
}
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * build_mllama() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
......@@ -946,9 +895,9 @@ index 7dec50ae..bac66c24 100644
+ }
+
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
@@ -10646,6 +10942,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
......@@ -959,16 +908,33 @@ index 7dec50ae..bac66c24 100644
case LLM_ARCH_DECI:
{
result = llm.build_deci();
@@ -10971,7 +11271,7 @@ static int llama_decode_internal(
@@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch(
n_outputs = 1;
}
- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* simple_split */ !lctx.kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -11282,7 +11582,7 @@ static int llama_encode_internal(
@@ -8749,7 +9003,6 @@ static int llama_decode_impl(
const llama_batch & batch = batch_allocr.batch;
const auto & model = lctx.model;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
llama_kv_slot_restorer kv_slot_restorer(kv_self);
const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
@@ -9025,7 +9278,7 @@ static int llama_encode_impl(
const int64_t n_embd = hparams.n_embd;
......@@ -977,7 +943,7 @@ index 7dec50ae..bac66c24 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -11775,6 +12075,7 @@ struct llama_context_params llama_context_default_params() {
@@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
......
......@@ -15,10 +15,10 @@ Subject: [PATCH] add unpad operator
8 files changed, 220 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index c714fc8c..1bc50fca 100644
index dd0c6a96..8d269a9c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
@@ -487,6 +487,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
......@@ -26,7 +26,7 @@ index c714fc8c..1bc50fca 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1735,6 +1736,15 @@ extern "C" {
@@ -1743,6 +1744,15 @@ extern "C" {
int p0,
int p1);
......@@ -43,10 +43,10 @@ index c714fc8c..1bc50fca 100644
// timesteps: [N,]
// return: [N, dim]
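The declaration added to ggml.h is collapsed here, but together with the CPU, CUDA and Metal kernels later in this patch, GGML_OP_UNPAD is the inverse of ggml_pad: it crops a tensor back down instead of growing it. A hedged usage sketch, assuming a ggml_unpad(ctx, a, p0, p1) signature that mirrors ggml_pad and trims p0/p1 elements from the first two dimensions:

#include "ggml.h"

// Pad a tensor for a kernel that wants aligned sizes, then crop the result
// back to the original extents (ggml_unpad signature assumed, see above).
static struct ggml_tensor * pad_roundtrip(struct ggml_context * ctx, struct ggml_tensor * x,
                                          int p0, int p1) {
    struct ggml_tensor * padded   = ggml_pad  (ctx, x, p0, p1, 0, 0); // append p0/p1 zeros to dims 0 and 1
    struct ggml_tensor * restored = ggml_unpad(ctx, padded, p0, p1);  // assumed: trim them back off
    return restored; // same ne[0]/ne[1] as x once the graph is computed
}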
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b7fefb9d..b307d554 100644
index 72325349..2f606d82 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}
......@@ -106,7 +106,7 @@ index b7fefb9d..b307d554 100644
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
......@@ -117,7 +117,7 @@ index b7fefb9d..b307d554 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
......@@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index aaa79ea4..9286f866 100644
index 36165840..1adf08fa 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2198,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
......@@ -139,8 +139,8 @@ index aaa79ea4..9286f866 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
......@@ -148,7 +148,7 @@ index aaa79ea4..9286f866 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..39fd4b16 100644
index aba539e8..b4b87409 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
......@@ -201,6 +201,7 @@ index aba539e8..39fd4b16 100644
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
\ No newline at end of file
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
index 8fd386b0..e2ededc3 100644
--- a/ggml/src/ggml-cuda/pad.cuh
......@@ -211,10 +212,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cd8ef741..318addec 100644
index fd9a4e77..e4c093f9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type {
@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
......@@ -222,7 +223,7 @@ index cd8ef741..318addec 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
......@@ -230,7 +231,7 @@ index cd8ef741..318addec 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
......@@ -238,7 +239,7 @@ index cd8ef741..318addec 100644
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(
@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
......@@ -276,10 +277,10 @@ index cd8ef741..318addec 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8ba43904..204c93e6 100644
index d092a169..f38909d0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}
......@@ -332,10 +333,10 @@ index 8ba43904..204c93e6 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 2bbe5f48..7ffcd907 100644
index 7fc06724..635aa299 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
......@@ -343,16 +344,16 @@ index 2bbe5f48..7ffcd907 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
......@@ -360,16 +361,16 @@ index 2bbe5f48..7ffcd907 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4214,6 +4216,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
......
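Note: the extra op entry itself is folded in the hunks above; what the bumped asserts enforce is that the GGML_OP_NAME and GGML_OP_SYMBOL tables stay in lockstep with GGML_OP_COUNT whenever an op is added. A toy sketch of that invariant (names are illustrative, not the vendored enumerators):

// sketch only: mirrors how ggml.c guards its op tables
enum sketch_op { SKETCH_OP_PAD, SKETCH_OP_NEW, SKETCH_OP_COUNT };

static const char * SKETCH_OP_NAME[SKETCH_OP_COUNT] = { "PAD", "NEW" };

// the literal must be bumped by hand whenever an op is added, which forces the
// author to revisit every table declared with SKETCH_OP_COUNT
static_assert(SKETCH_OP_COUNT == 2, "SKETCH_OP_COUNT != 2");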
......@@ -11,10 +11,10 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 3fcfcaa3..8f44705a 100644
index a4eee9b8..1ca827eb 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -375,7 +375,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = {
"[\r\n]",
......@@ -24,7 +24,7 @@ index 3fcfcaa3..8f44705a 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 7aca6544..6155da80 100644
index e63bb4ab..9dd53b9a 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
......@@ -39,7 +39,7 @@ index 7aca6544..6155da80 100644
#include "unicode.h"
#include "unicode-data.h"
@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
......@@ -62,7 +62,7 @@ index 7aca6544..6155da80 100644
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif
return conv.from_bytes(s);
......
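Note: the block added ahead of the codecvt fallback is folded above. A sketch of the usual Win32 replacement for the deprecated std::codecvt_utf8 path, assuming MultiByteToWideChar is what the folded code reaches for (illustrative only, not the vendored change):

#if defined(_WIN32)
#include <windows.h>
#include <string>

// widen a UTF-8 string via the Win32 API instead of std::codecvt_utf8
static std::wstring wstring_from_utf8_sketch(const std::string & s) {
    const int n = MultiByteToWideChar(CP_UTF8, 0, s.data(), (int) s.size(), nullptr, 0);
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.data(), (int) s.size(), &w[0], n);
    return w;
}
#endif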
......@@ -8,11 +8,11 @@ Subject: [PATCH] Maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dadc18c8..2a8dbd22 100644
index 3ebcc3d9..30c28808 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -391,7 +391,7 @@ class SchemaConverter {
private:
@@ -346,7 +346,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
- std::map<std::string, std::string> _rules;
......
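Note: the replacement container is folded out of view above. A minimal sketch of one way to keep grammar rules in insertion order, assuming that is the intent of the patch (names and types below are illustrative, not what json-schema-to-grammar.cpp actually uses):

#include <string>
#include <utility>
#include <vector>

// rules kept in the order they were added, unlike std::map's key order
using ordered_rules_sketch = std::vector<std::pair<std::string, std::string>>;

static std::string * find_rule_sketch(ordered_rules_sketch & rules, const std::string & name) {
    for (auto & kv : rules) {
        if (kv.first == name) {
            return &kv.second;
        }
    }
    return nullptr; // not present
}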
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 14 Dec 2024 12:54:00 -0800
Subject: [PATCH] fix missing arg in static assert on windows
---
ggml/src/ggml-cuda/concat.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 2f42b8a9..5eb9f08d 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
- static_assert(dim >= 0 && dim <= 3);
+ static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
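Note on the fix above: the one-argument static_assert(cond) form is a C++17 feature, and the Windows CUDA build rejects it in the mode used here, so adding the message argument makes the assertion portable. A minimal sketch, separate from the vendored code:

// sketch: the two-argument form compiles in all language modes
template <int dim>
struct concat_dim_check {
    static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
};

concat_dim_check<2> ok_instance;    // compiles
// concat_dim_check<5> bad_instance; // would fail at compile time with the message above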
......@@ -19,10 +19,10 @@ multiple batches of processing until everything is complete.
1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index bac66c24..c95da45d 100644
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3536,6 +3536,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}
......@@ -36,13 +36,13 @@ index bac66c24..c95da45d 100644
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -3712,35 +3719,23 @@ struct llm_build_context {
@@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
......@@ -78,7 +78,7 @@ index bac66c24..c95da45d 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -3748,31 +3743,29 @@ struct llm_build_context {
@@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
......@@ -118,7 +118,7 @@ index bac66c24..c95da45d 100644
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -10856,7 +10849,7 @@ struct llm_build_context {
@@ -8508,7 +8501,7 @@ struct llm_build_context {
}
};
......@@ -127,7 +127,7 @@ index bac66c24..c95da45d 100644
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -10866,7 +10859,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
......@@ -136,21 +136,21 @@ index bac66c24..c95da45d 100644
llm.free();
@@ -11329,7 +11322,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0;
}
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -11735,8 +11733,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
......@@ -161,7 +161,7 @@ index bac66c24..c95da45d 100644
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -11800,19 +11798,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
......@@ -181,7 +181,7 @@ index bac66c24..c95da45d 100644
cont = false;
continue;
}
@@ -11828,8 +11818,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
......@@ -193,7 +193,7 @@ index bac66c24..c95da45d 100644
}
nf++;
@@ -11839,22 +11831,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
}
}
......@@ -218,7 +218,7 @@ index bac66c24..c95da45d 100644
#if 0
// CPU defrag
@@ -11929,11 +11915,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else
// ggml_graph defrag
......
......@@ -8,12 +8,12 @@ Subject: [PATCH] use dynamic backend loading for clip
1 file changed, 27 insertions(+), 47 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3c1829f..86b91d5c 100644
index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -8,25 +8,25 @@
#include "ggml-alloc.h"
@@ -9,25 +9,25 @@
#include "ggml-backend.h"
#include "gguf.h"
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
......@@ -56,7 +56,7 @@ index b3c1829f..86b91d5c 100644
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1235,35 +1235,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
......
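Note: the remaining hunks of this patch are folded; the visible part removes the per-backend #ifdef includes. A sketch of runtime backend selection with the ggml-backend device API, assuming that is what the folded clip_model_load hunk switches to (illustrative, not the vendored code):

#include "ggml-backend.h"

static ggml_backend_t clip_pick_backend_sketch(void) {
    ggml_backend_load_all(); // load whichever backend libraries are installed

    ggml_backend_dev_t cpu_dev = nullptr;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            return ggml_backend_dev_init(dev, nullptr); // first GPU wins
        }
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            cpu_dev = dev;
        }
    }
    return cpu_dev ? ggml_backend_dev_init(cpu_dev, nullptr) : nullptr; // CPU fallback
}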
......@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 899d16f2..135f7df0 100644
index 95036ef8..98d5e14d 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
......
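Note: everything but the hunk header is folded here. A sketch of the sorting idea, under the assumption that each registry entry carries a score and higher scores should be probed first (the entry layout is illustrative):

#include <algorithm>
#include <vector>

struct dev_entry_sketch {
    void * device; // stand-in for ggml_backend_dev_t
    int    score;  // higher = preferred (e.g. discrete GPU > integrated > CPU)
};

static void sort_devices_by_score_sketch(std::vector<dev_entry_sketch> & devices) {
    // stable sort keeps registration order among devices with equal scores
    std::stable_sort(devices.begin(), devices.end(),
                     [](const dev_entry_sketch & a, const dev_entry_sketch & b) {
                         return a.score > b.score;
                     });
}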