Unverified Commit 943464cc authored by Jeffrey Morgan's avatar Jeffrey Morgan Committed by GitHub
Browse files

llama: update to commit 71e90e88 (#10192)

parent 369de832
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -2,7 +2,9 @@ ...@@ -2,7 +2,9 @@
#include "llama.h" #include "llama.h"
#include "llama-arch.h" #include "llama-arch.h"
#include "llama-graph.h"
#include "llama-hparams.h" #include "llama-hparams.h"
#include "llama-memory.h"
#include "llama-vocab.h" #include "llama-vocab.h"
#include <memory> #include <memory>
...@@ -11,6 +13,8 @@ ...@@ -11,6 +13,8 @@
#include <vector> #include <vector>
#include <stdexcept> #include <stdexcept>
struct llama_cparams;
struct llama_ubatch;
struct llama_model_loader; struct llama_model_loader;
// available models // available models
...@@ -26,6 +30,7 @@ enum llm_type { ...@@ -26,6 +30,7 @@ enum llm_type {
LLM_TYPE_109M, LLM_TYPE_109M,
LLM_TYPE_137M, LLM_TYPE_137M,
LLM_TYPE_160M, LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M, LLM_TYPE_220M,
LLM_TYPE_250M, LLM_TYPE_250M,
LLM_TYPE_270M, LLM_TYPE_270M,
...@@ -40,8 +45,10 @@ enum llm_type { ...@@ -40,8 +45,10 @@ enum llm_type {
LLM_TYPE_1_4B, LLM_TYPE_1_4B,
LLM_TYPE_1_5B, LLM_TYPE_1_5B,
LLM_TYPE_1_6B, LLM_TYPE_1_6B,
LLM_TYPE_1_8B,
LLM_TYPE_2B, LLM_TYPE_2B,
LLM_TYPE_2_8B, LLM_TYPE_2_8B,
LLM_TYPE_2_9B,
LLM_TYPE_3B, LLM_TYPE_3B,
LLM_TYPE_4B, LLM_TYPE_4B,
LLM_TYPE_6B, LLM_TYPE_6B,
...@@ -81,6 +88,9 @@ enum llm_type { ...@@ -81,6 +88,9 @@ enum llm_type {
LLM_TYPE_10B_128x3_66B, LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B, LLM_TYPE_57B_A14B,
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_290B,
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick
}; };
struct llama_layer_posnet { struct llama_layer_posnet {
...@@ -259,6 +269,20 @@ struct llama_layer { ...@@ -259,6 +269,20 @@ struct llama_layer {
struct ggml_tensor * time_mix_receptance_b = nullptr; struct ggml_tensor * time_mix_receptance_b = nullptr;
struct ggml_tensor * time_mix_gate = nullptr; struct ggml_tensor * time_mix_gate = nullptr;
// rwkv7
struct ggml_tensor * time_mix_w0 = nullptr;
struct ggml_tensor * time_mix_a0 = nullptr;
struct ggml_tensor * time_mix_a1 = nullptr;
struct ggml_tensor * time_mix_a2 = nullptr;
struct ggml_tensor * time_mix_v0 = nullptr;
struct ggml_tensor * time_mix_v1 = nullptr;
struct ggml_tensor * time_mix_v2 = nullptr;
struct ggml_tensor * time_mix_g1 = nullptr;
struct ggml_tensor * time_mix_g2 = nullptr;
struct ggml_tensor * time_mix_k_k = nullptr;
struct ggml_tensor * time_mix_k_a = nullptr;
struct ggml_tensor * time_mix_r_k = nullptr;
struct ggml_tensor * time_mix_ln = nullptr; struct ggml_tensor * time_mix_ln = nullptr;
struct ggml_tensor * time_mix_ln_b = nullptr; struct ggml_tensor * time_mix_ln_b = nullptr;
struct ggml_tensor * time_mix_output = nullptr; struct ggml_tensor * time_mix_output = nullptr;
...@@ -362,7 +386,7 @@ struct llama_model { ...@@ -362,7 +386,7 @@ struct llama_model {
std::string desc() const; std::string desc() const;
size_t size() const; size_t size() const;
size_t max_nodes() const; size_t n_tensors() const;
size_t n_devices() const; size_t n_devices() const;
// total number of parameters in the model // total number of parameters in the model
...@@ -375,11 +399,26 @@ struct llama_model { ...@@ -375,11 +399,26 @@ struct llama_model {
ggml_backend_buffer_type_t select_buft(int il) const; ggml_backend_buffer_type_t select_buft(int il) const;
bool has_tensor_overrides() const;
const struct ggml_tensor * get_tensor(const char * name) const; const struct ggml_tensor * get_tensor(const char * name) const;
// TODO: move this to new llm_arch_model_i interface
llama_memory_i * create_memory() const; // TODO: params
// TODO: move this to new llm_arch_model_i interface
llm_graph_result_ptr build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const;
private: private:
struct impl; struct impl;
std::unique_ptr<impl> pimpl; std::unique_ptr<impl> pimpl;
}; };
const char * llm_type_name(llm_type type); const char * llm_type_name(llm_type type);
// For internal test use
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <cinttypes> #include <cinttypes>
#include <fstream> #include <fstream>
#include <mutex> #include <mutex>
#include <regex>
#include <thread> #include <thread>
#include <unordered_map> #include <unordered_map>
...@@ -47,8 +48,14 @@ struct quantize_state_impl { ...@@ -47,8 +48,14 @@ struct quantize_state_impl {
{} {}
}; };
// changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
std::string name;
ggml_type quant = GGML_TYPE_COUNT;
};
static void llama_tensor_dequantize_impl( static void llama_tensor_dequantize_impl(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers, ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread const size_t nelements, const int nthread
) { ) {
if (output.size() < nelements) { if (output.size() < nelements) {
...@@ -527,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -527,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} }
std::vector<std::string> splits = {}; std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params()); llama_model model(llama_model_default_params());
...@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
model.load_hparams(ml); model.load_hparams(ml);
model.load_stats (ml); model.load_stats (ml);
struct quantize_state_impl qs(model, params); quantize_state_impl qs(model, params);
if (params->only_copy) { if (params->only_copy) {
ftype = ml.ftype; ftype = ml.ftype;
...@@ -663,7 +670,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -663,7 +670,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// populate the original tensors so we get an initial meta data // populate the original tensors so we get an initial meta data
for (const auto * it : tensors) { for (const auto * it : tensors) {
uint16_t i_split = params->keep_split ? it->idx : 0; uint16_t i_split = params->keep_split ? it->idx : 0;
struct ggml_tensor * tensor = it->tensor; ggml_tensor * tensor = it->tensor;
if (!ctx_outs[i_split]) { if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty()); ctx_outs[i_split].reset(gguf_init_empty());
} }
...@@ -712,7 +719,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -712,7 +719,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_ofstream(0); new_ofstream(0);
for (const auto * it : tensors) { for (const auto * it : tensors) {
const auto & weight = *it; const auto & weight = *it;
struct ggml_tensor * tensor = weight.tensor; ggml_tensor * tensor = weight.tensor;
if (weight.idx != cur_split && params->keep_split) { if (weight.idx != cur_split && params->keep_split) {
close_ofstream(); close_ofstream();
new_ofstream(weight.idx); new_ofstream(weight.idx);
...@@ -762,10 +769,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -762,10 +769,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// NOTE: can't use LLM_TN here because the layer number is not known // NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ssm_conv1d.weight") == std::string::npos; quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
// do not quantize RWKV's time_mix_first tensors // do not quantize RWKV's small yet 2D weights
quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w0.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_v0.weight") == std::string::npos;
quantize &= name.find("time_mix_v1.weight") == std::string::npos;
quantize &= name.find("time_mix_v2.weight") == std::string::npos;
quantize &= name.find("time_mix_a0.weight") == std::string::npos;
quantize &= name.find("time_mix_a1.weight") == std::string::npos;
quantize &= name.find("time_mix_a2.weight") == std::string::npos;
quantize &= name.find("time_mix_g1.weight") == std::string::npos;
quantize &= name.find("time_mix_g2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
...@@ -773,7 +789,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -773,7 +789,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// do not quantize relative position bias (T5) // do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos; quantize &= name.find("attn_rel_b.weight") == std::string::npos;
enum ggml_type new_type; ggml_type new_type;
void * new_data; void * new_data;
size_t new_size; size_t new_size;
...@@ -783,6 +799,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -783,6 +799,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// get more optimal quantization type based on the tensor shape, layer, etc. // get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) { if (!params->pure && ggml_is_quantized(default_type)) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
// unless the user specifies a type
if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
}
new_type = qtype;
break;
}
}
}
} }
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type; new_type = params->token_embedding_type;
...@@ -907,8 +936,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ...@@ -907,8 +936,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// interface implementation // interface implementation
// //
struct llama_model_quantize_params llama_model_quantize_default_params() { llama_model_quantize_params llama_model_quantize_default_params() {
struct llama_model_quantize_params result = { llama_model_quantize_params result = {
/*.nthread =*/ 0, /*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.output_tensor_type =*/ GGML_TYPE_COUNT, /*.output_tensor_type =*/ GGML_TYPE_COUNT,
...@@ -920,6 +949,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { ...@@ -920,6 +949,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.keep_split =*/ false, /*.keep_split =*/ false,
/*.imatrix =*/ nullptr, /*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr, /*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
}; };
return result; return result;
......
...@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( ...@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
const char ** trigger_words, const char ** trigger_words,
size_t num_trigger_words, size_t num_trigger_words,
const llama_token * trigger_tokens, const llama_token * trigger_tokens,
size_t num_trigger_tokens); size_t num_trigger_tokens,
const char ** trigger_patterns,
size_t num_trigger_patterns);
static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_grammar *) smpl->ctx; auto * ctx = (llama_sampler_grammar *) smpl->ctx;
...@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { ...@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
return; return;
} }
std::vector<const char *> trigger_words; std::vector<const char *> trigger_patterns_c;
for (auto & word : ctx->grammar->trigger_words) { trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
trigger_words.push_back(word.c_str()); for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
} }
auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(), auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
ctx->grammar->lazy, trigger_words.data(), trigger_words.size(), ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
llama_grammar_free_impl(ctx->grammar); llama_grammar_free_impl(ctx->grammar);
...@@ -1472,7 +1476,8 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { ...@@ -1472,7 +1476,8 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0); auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
GGML_ASSERT(result);
// copy the state // copy the state
{ {
...@@ -1516,16 +1521,38 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( ...@@ -1516,16 +1521,38 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
const char ** trigger_words, const char ** trigger_words,
size_t num_trigger_words, size_t num_trigger_words,
const llama_token * trigger_tokens, const llama_token * trigger_tokens,
size_t num_trigger_tokens) { size_t num_trigger_tokens,
const char ** trigger_patterns,
size_t num_trigger_patterns) {
auto * ctx = new llama_sampler_grammar; auto * ctx = new llama_sampler_grammar;
if (grammar_str != nullptr && grammar_str[0] != '\0') { if (grammar_str != nullptr && grammar_str[0] != '\0') {
// TODO: remove trigger_words support.
if (trigger_words != nullptr && num_trigger_words > 0) {
GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
std::string trigger_pattern("[\\s\\S]*?(");
for (size_t i = 0; i < num_trigger_words; ++i) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
if (i > 0) {
trigger_pattern += "|";
}
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
}
trigger_pattern += ")[\\s\\S]*";
auto trigger_pattern_c = trigger_pattern.c_str();
trigger_patterns = &trigger_pattern_c;
num_trigger_patterns = 1;
}
*ctx = { *ctx = {
/* .vocab = */ vocab, /* .vocab = */ vocab,
/* .grammar_str = */ grammar_str, /* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root, /* .grammar_root = */ grammar_root,
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens), /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
}; };
if (!ctx->grammar) {
delete ctx;
return nullptr;
}
} else { } else {
*ctx = { *ctx = {
/* .vocab = */ vocab, /* .vocab = */ vocab,
...@@ -1545,7 +1572,7 @@ struct llama_sampler * llama_sampler_init_grammar( ...@@ -1545,7 +1572,7 @@ struct llama_sampler * llama_sampler_init_grammar(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
const char * grammar_str, const char * grammar_str,
const char * grammar_root) { const char * grammar_root) {
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0); return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
} }
struct llama_sampler * llama_sampler_init_grammar_lazy( struct llama_sampler * llama_sampler_init_grammar_lazy(
...@@ -1556,7 +1583,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy( ...@@ -1556,7 +1583,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
size_t num_trigger_words, size_t num_trigger_words,
const llama_token * trigger_tokens, const llama_token * trigger_tokens,
size_t num_trigger_tokens) { size_t num_trigger_tokens) {
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens); return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
}
struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
const char ** trigger_patterns,
size_t num_trigger_patterns,
const llama_token * trigger_tokens,
size_t num_trigger_tokens) {
return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
} }
// penalties // penalties
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <queue> #include <queue>
#include <set> #include <set>
#include <unordered_map> #include <unordered_map>
#include <cctype>
// //
// helpers // helpers
...@@ -341,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { ...@@ -341,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_MPT: case LLAMA_VOCAB_PRE_TYPE_MPT:
case LLAMA_VOCAB_PRE_TYPE_OLMO: case LLAMA_VOCAB_PRE_TYPE_OLMO:
case LLAMA_VOCAB_PRE_TYPE_JAIS: case LLAMA_VOCAB_PRE_TYPE_JAIS:
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
regex_exprs = { regex_exprs = {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
}; };
...@@ -393,10 +395,24 @@ struct llm_tokenizer_bpe : llm_tokenizer { ...@@ -393,10 +395,24 @@ struct llm_tokenizer_bpe : llm_tokenizer {
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_GPT4O: case LLAMA_VOCAB_PRE_TYPE_GPT4O:
// original regex from tokenizer.json
// [^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
regex_exprs = { regex_exprs = {
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" // original regex from tokenizer.json
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
regex_exprs = {
"\\p{N}+",
"(?=(\\d{3})+(?!\\d))",
};
break;
case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
regex_exprs = {
// original regex from tokenizer.json
// "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
// FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
}; };
break; break;
default: default:
...@@ -1547,6 +1563,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1547,6 +1563,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_PORO; pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
clean_spaces = false; clean_spaces = false;
} else if ( } else if (
tokenizer_pre == "glm4" ||
tokenizer_pre == "chatglm-bpe") { tokenizer_pre == "chatglm-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
special_bos_id = LLAMA_TOKEN_NULL; special_bos_id = LLAMA_TOKEN_NULL;
...@@ -1591,9 +1608,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1591,9 +1608,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") { tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if ( } else if (
tokenizer_pre == "gpt-4o") { tokenizer_pre == "gpt-4o" ||
tokenizer_pre == "llama4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "superbpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
clean_spaces = false;
} else if (
tokenizer_pre == "trillion") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
clean_spaces = false;
} else if (
tokenizer_pre == "bailingmoe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
...@@ -1772,6 +1802,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1772,6 +1802,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_turn>" || t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>" || t.first == "<|endoftext|>"
|| t.first == "<EOT>" || t.first == "<EOT>"
|| t.first == "_<EOT>"
|| t.first == "<|end▁of▁sentence|>" // DeepSeek || t.first == "<|end▁of▁sentence|>" // DeepSeek
) { ) {
special_eot_id = t.second; special_eot_id = t.second;
...@@ -1804,6 +1835,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1804,6 +1835,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<fim-prefix>" || t.first == "<fim-prefix>"
|| t.first == "<|fim▁begin|>" // DeepSeek || t.first == "<|fim▁begin|>" // DeepSeek
|| t.first == "<PRE>" || t.first == "<PRE>"
|| t.first == "▁<PRE>" // CodeLlama
) { ) {
special_fim_pre_id = t.second; special_fim_pre_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
...@@ -1821,6 +1853,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1821,6 +1853,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<fim-suffix>" || t.first == "<fim-suffix>"
|| t.first == "<|fim▁hole|>" // DeepSeek || t.first == "<|fim▁hole|>" // DeepSeek
|| t.first == "<SUF>" || t.first == "<SUF>"
|| t.first == "▁<SUF>" // CodeLlama
) { ) {
special_fim_suf_id = t.second; special_fim_suf_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
...@@ -1838,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1838,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<fim-middle>" || t.first == "<fim-middle>"
|| t.first == "<|fim▁end|>" // DeepSeek || t.first == "<|fim▁end|>" // DeepSeek
|| t.first == "<MID>" || t.first == "<MID>"
|| t.first == "▁<MID>" // CodeLlama
) { ) {
special_fim_mid_id = t.second; special_fim_mid_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
...@@ -1922,6 +1956,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { ...@@ -1922,6 +1956,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|endoftext|>" || t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>" || t.first == "<|eom_id|>"
|| t.first == "<EOT>" || t.first == "<EOT>"
|| t.first == "_<EOT>"
) { ) {
special_eog_ids.insert(t.second); special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
...@@ -2180,14 +2215,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer ...@@ -2180,14 +2215,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
// find the first occurrence of a given special token in this fragment // find the first occurrence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates // passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text // are still relative to the source full raw_text
auto match = raw_text.find(text, raw_text_base_offset); // string_view begins at pos 0 for the same reason
auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
// no occurrences found, stop processing this fragment for a given special token // no occurrences found, stop processing this fragment for a given special token
if (match == std::string::npos) break; if (match == std::string::npos) break;
// check if match is within bounds of offset <-> length
if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
#ifdef PRETOKENIZERDEBUG #ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif #endif
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -220,7 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { ...@@ -220,7 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
free(wbuf); free(wbuf);
return ret; return ret;
#else #else
#if defined(__clang__) #if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8 // disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push # pragma clang diagnostic push
......
...@@ -147,27 +147,27 @@ func (c *Context) Model() *Model { ...@@ -147,27 +147,27 @@ func (c *Context) Model() *Model {
} }
func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) { func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta)) C.llama_kv_self_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
} }
func (c *Context) KvCacheSeqRm(seqId int, p0 int, p1 int) bool { func (c *Context) KvCacheSeqRm(seqId int, p0 int, p1 int) bool {
return bool(C.llama_kv_cache_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1))) return bool(C.llama_kv_self_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1)))
} }
func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) { func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1)) C.llama_kv_self_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
} }
func (c *Context) KvCacheClear() { func (c *Context) KvCacheClear() {
C.llama_kv_cache_clear(c.c) C.llama_kv_self_clear(c.c)
} }
func (c *Context) KvCacheDefrag() { func (c *Context) KvCacheDefrag() {
C.llama_kv_cache_defrag(c.c) C.llama_kv_self_defrag(c.c)
} }
func (c *Context) KvCacheCanShift() bool { func (c *Context) KvCacheCanShift() bool {
return bool(C.llama_kv_cache_can_shift(c.c)) return bool(C.llama_kv_self_can_shift(c.c))
} }
// Get the embeddings for a sequence id // Get the embeddings for a sequence id
......
...@@ -24,10 +24,10 @@ problem. ...@@ -24,10 +24,10 @@ problem.
9 files changed, 21 insertions(+), 2 deletions(-) 9 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index dba7be33..65e150d6 100644 index 273075f4..dd11f304 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) { if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer); buffer->iface.free_buffer(buffer);
} }
...@@ -35,7 +35,7 @@ index dba7be33..65e150d6 100644 ...@@ -35,7 +35,7 @@ index dba7be33..65e150d6 100644
} }
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -542,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -544,6 +543,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
free(ctx->buffers); free(ctx->buffers);
free(ctx); free(ctx);
...@@ -43,7 +43,7 @@ index dba7be33..65e150d6 100644 ...@@ -43,7 +43,7 @@ index dba7be33..65e150d6 100644
} }
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1865,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_aligned_free(buffer->context, buffer->size); ggml_aligned_free(buffer->context, buffer->size);
...@@ -55,7 +55,7 @@ index dba7be33..65e150d6 100644 ...@@ -55,7 +55,7 @@ index dba7be33..65e150d6 100644
} }
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1912,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { @@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
}; };
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
...@@ -65,7 +65,7 @@ index dba7be33..65e150d6 100644 ...@@ -65,7 +65,7 @@ index dba7be33..65e150d6 100644
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index d410c024..a207ab1e 100644 index cec36b36..4b057973 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp --- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer( @@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
...@@ -76,7 +76,7 @@ index d410c024..a207ab1e 100644 ...@@ -76,7 +76,7 @@ index d410c024..a207ab1e 100644
} }
/** /**
@@ -1198,6 +1199,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf @@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/ */
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context)); ACL_CHECK(aclrtFreeHost(buffer->context));
...@@ -85,10 +85,10 @@ index d410c024..a207ab1e 100644 ...@@ -85,10 +85,10 @@ index d410c024..a207ab1e 100644
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ebb2ccae..dfff21a2 100644 index fafe9633..59a49560 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context { @@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -96,7 +96,7 @@ index ebb2ccae..dfff21a2 100644 ...@@ -96,7 +96,7 @@ index ebb2ccae..dfff21a2 100644
} }
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -783,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context { @@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -104,7 +104,7 @@ index ebb2ccae..dfff21a2 100644 ...@@ -104,7 +104,7 @@ index ebb2ccae..dfff21a2 100644
} }
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1055,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ @@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context)); CUDA_CHECK(cudaFreeHost(buffer->context));
...@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 ...@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index c550142a..fd9a4e77 100644 index 9f1c6c6c..310afe8a 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
} }
free(ctx); free(ctx);
...@@ -137,10 +137,10 @@ index c550142a..fd9a4e77 100644 ...@@ -137,10 +137,10 @@ index c550142a..fd9a4e77 100644
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index f5906246..062e93b8 100644 index b8b5cbd3..14d4561b 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1203,6 +1203,7 @@ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; @@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx; delete ctx;
...@@ -149,10 +149,10 @@ index f5906246..062e93b8 100644 ...@@ -149,10 +149,10 @@ index f5906246..062e93b8 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 97873acc..893ee0b9 100644 index 862b9b66..34536681 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -419,6 +419,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status); GGML_ASSERT(status);
delete ctx; delete ctx;
...@@ -161,10 +161,10 @@ index 97873acc..893ee0b9 100644 ...@@ -161,10 +161,10 @@ index 97873acc..893ee0b9 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 792e0569..5e233e8b 100644 index 3e48a924..a3d182fc 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -311,6 +311,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device); ggml_sycl_set_device(ctx->device);
delete ctx; delete ctx;
...@@ -172,7 +172,7 @@ index 792e0569..5e233e8b 100644 ...@@ -172,7 +172,7 @@ index 792e0569..5e233e8b 100644
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -720,6 +721,7 @@ struct ggml_backend_sycl_split_buffer_context { @@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx; delete ctx;
...@@ -180,7 +180,7 @@ index 792e0569..5e233e8b 100644 ...@@ -180,7 +180,7 @@ index 792e0569..5e233e8b 100644
} }
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1053,6 +1055,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ @@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context); ggml_sycl_host_free(buffer->context);
...@@ -189,10 +189,10 @@ index 792e0569..5e233e8b 100644 ...@@ -189,10 +189,10 @@ index 792e0569..5e233e8b 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index abe3e790..1dad714b 100644 index 783a0ff8..8ac1e07e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7914,6 +7914,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
...@@ -200,7 +200,7 @@ index abe3e790..1dad714b 100644 ...@@ -200,7 +200,7 @@ index abe3e790..1dad714b 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8056,6 +8057,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);
......
...@@ -3,15 +3,17 @@ From: Michael Yang <mxyng@pm.me> ...@@ -3,15 +3,17 @@ From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:13 -0700 Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] pretokenizer Subject: [PATCH] pretokenizer
allow for an unset pretokenizer with a warning in the
logs instead of throwing an error
--- ---
src/llama-vocab.cpp | 14 +++----------- src/llama-vocab.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66..a4eee9b8 100644 index 464ff01e..0125ee53 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) { if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false; add_space_prefix = false;
clean_spaces = true; clean_spaces = true;
...@@ -29,9 +31,9 @@ index ad9ffe66..a4eee9b8 100644 ...@@ -29,9 +31,9 @@ index ad9ffe66..a4eee9b8 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "megrez") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; clean_spaces = false;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me> From: jmorganca <jmorganca@gmail.com>
Date: Mon, 16 Sep 2024 15:53:14 -0700 Date: Tue, 8 Apr 2025 15:28:34 -0700
Subject: [PATCH] embeddings Subject: [PATCH] embeddings
allow a loaded model in llama.cpp to be used for
both embeddings and causal attention text generation
instead of forcing one or the other
--- ---
src/llama-context.cpp | 2 +- src/llama-context.cpp | 6 +++---
src/llama.cpp | 6 ++++-- 1 file changed, 3 insertions(+), 3 deletions(-)
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 671d2a81..47e79ed4 100644 index 4735e98e..65135172 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { @@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const auto n_embd = hparams.n_embd; int64_t n_outputs_all = 0;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; // count outputs
diff --git a/src/llama.cpp b/src/llama.cpp - if (batch.logits && !embd_pooled) {
index 607f2786..ac85bfed 100644 + if (batch.logits) {
--- a/src/llama.cpp for (uint32_t i = 0; i < n_tokens_all; ++i) {
+++ b/src/llama.cpp n_outputs_all += batch.logits[i] != 0;
@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
break;
}
}
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
} }
@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
- auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
+ auto * t_logits = cparams.causal_attn ? res->get_logits() : nullptr;
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
+ if (!cparams.causal_attn) { if (t_embd && res->get_embd_pooled()) {
+ res = nullptr; // do not extract logits when not needed @@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
+ } const auto n_embd = hparams.n_embd;
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // TODO: use a per-batch flag for logits presence instead
- bool has_logits = !cparams.embeddings;
+ bool has_logits = cparams.causal_attn;
bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); // TODO: hacky enc-dec support
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me> From: jmorganca <jmorganca@gmail.com>
Date: Mon, 16 Sep 2024 15:53:15 -0700 Date: Tue, 8 Apr 2025 15:34:37 -0700
Subject: [PATCH] clip-unicode Subject: [PATCH] clip-unicode
fixes loading vision models in llama.cpp on windows
filesystems for paths that include wide characters
--- ---
examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++- examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+), 1 deletion(-) 1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 76d4a785..205af1eb 100644 index 49c90b75..4b72ea9f 100644
--- a/examples/llava/clip.cpp --- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp
@@ -58,6 +58,19 @@ @@ -28,6 +28,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) #include <cinttypes>
#endif // defined(LLAVA_LOG_OFF) #include <limits>
+#if defined(_WIN32) +#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN
...@@ -28,49 +30,48 @@ index 76d4a785..205af1eb 100644 ...@@ -28,49 +30,48 @@ index 76d4a785..205af1eb 100644
+#endif +#endif
+#endif +#endif
+ +
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
@@ -1429,7 +1442,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
// RGB uint8 image
@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
-
+#ifdef _WIN32 +#ifdef _WIN32
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); + int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
+ if (!wlen) { + if (!wlen) {
+ return NULL; + throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+ } + }
+ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); + wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+ wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); + wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen);
+ if (!wlen) { + if (!wlen) {
+ free(wbuf); + free(wbuf);
+ return NULL; + throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+ } + }
+#if __GLIBCXX__ +#if __GLIBCXX__
+ int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); + int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in); + __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
+ std::istream fin(&buffer); + std::istream fin(&buffer);
+#else // MSVC +#else // MSVC
+ // unused in our current build + // unused in our current build
+ auto fin = std::ifstream(wbuf, std::ios::binary); + auto fin = std::ifstream(wbuf, std::ios::binary);
+#endif +#endif
+ free(wbuf); + free(wbuf);
+#else +#else
auto fin = std::ifstream(fname, std::ios::binary); auto fin = std::ifstream(fname, std::ios::binary);
+#endif +#endif
if (!fin) { if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n"); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
clip_free(new_clip); }
@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { @@ -1456,7 +1491,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
} }
}
+#if defined(_WIN32) && defined(__GLIBCXX__) +#if defined(_WIN32) && defined(__GLIBCXX__)
+ close(fd); + close(fd);
+#else +#else
fin.close(); fin.close();
+#endif +#endif
}
// vision model LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
}
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me> From: jmorganca <jmorganca@gmail.com>
Date: Mon, 16 Sep 2024 15:53:16 -0700 Date: Tue, 8 Apr 2025 16:03:51 -0700
Subject: [PATCH] solar-pro Subject: [PATCH] solar-pro
solar-pro introduces block skip connections where blocks are connected adds support for the Solar Pro architecture
to other, non-sequential blocks with a scale multiple
this change adds 4 new keys to store the skip connections and one new
tensor to store the scalar. the scalar is implemented as a 1-dimensional
tensor with 2 elements derived from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
--- ---
src/llama-arch.cpp | 21 +++++ src/llama-arch.cpp | 21 ++++
src/llama-arch.h | 3 + src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++ src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 ++ src/llama-hparams.h | 5 +
src/llama-model-loader.cpp | 1 + src/llama-model-loader.cpp | 1 +
src/llama-model.cpp | 44 +++++++++++ src/llama-model.cpp | 207 +++++++++++++++++++++++++++++++++++++
src/llama-model.h | 3 + src/llama-model.h | 3 +
src/llama.cpp | 152 ++++++++++++++++++++++++++++++++++++- 7 files changed, 248 insertions(+)
8 files changed, 236 insertions(+), 1 deletion(-)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 97a1e7e5..a1e0ebcc 100644 index a6fddc7f..0b0fedcd 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_SOLAR, "solar" }, + { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_PLM, "plm" },
}; { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, + { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
}, },
}, },
...@@ -66,7 +59,7 @@ index 97a1e7e5..a1e0ebcc 100644 ...@@ -66,7 +59,7 @@ index 97a1e7e5..a1e0ebcc 100644
{ {
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
{ {
@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
...@@ -75,18 +68,18 @@ index 97a1e7e5..a1e0ebcc 100644 ...@@ -75,18 +68,18 @@ index 97a1e7e5..a1e0ebcc 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 122fdceb..77919578 100644 index 2c2099b3..74aa3dd0 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -65,6 +65,7 @@ enum llm_arch { @@ -72,6 +72,7 @@ enum llm_arch {
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
+ LLM_ARCH_SOLAR, + LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN, LLM_ARCH_PLM,
}; LLM_ARCH_BAILINGMOE,
@@ -129,6 +130,7 @@ enum llm_kv { @@ -144,6 +145,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
...@@ -94,7 +87,7 @@ index 122fdceb..77919578 100644 ...@@ -94,7 +87,7 @@ index 122fdceb..77919578 100644
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -311,6 +313,7 @@ enum llm_tensor { @@ -340,6 +342,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
...@@ -103,14 +96,13 @@ index 122fdceb..77919578 100644 ...@@ -103,14 +96,13 @@ index 122fdceb..77919578 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index ea87b295..f3955de9 100644 index 90dfe7a7..8a667960 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const { @@ -70,6 +70,14 @@ uint32_t llama_hparams::n_embd_v_s() const {
// corresponds to Mamba's ssm_states size
return ssm_d_state * ssm_d_inner; return ssm_d_state * ssm_d_inner;
} }
+
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const { +bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
+ if (il < n_layer) { + if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0; + return n_bskcn_arr[n][il] > 0;
...@@ -118,12 +110,15 @@ index ea87b295..f3955de9 100644 ...@@ -118,12 +110,15 @@ index ea87b295..f3955de9 100644
+ +
+ GGML_ABORT("fatal error"); + GGML_ABORT("fatal error");
+} +}
\ No newline at end of file +
bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 1fe45410..1bdcdfd5 100644 index 4e0b5719..c3147cbc 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -50,6 +50,8 @@ struct llama_hparams { @@ -51,6 +51,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
...@@ -132,18 +127,18 @@ index 1fe45410..1bdcdfd5 100644 ...@@ -132,18 +127,18 @@ index 1fe45410..1bdcdfd5 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -133,6 +135,9 @@ struct llama_hparams { @@ -149,6 +151,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings // dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const; uint32_t n_embd_v_s() const;
+
+ // Block skip connection + // Block skip connection
+ bool n_bskcn(uint32_t n, uint32_t il) const; + bool n_bskcn(uint32_t n, uint32_t il) const;
+
bool is_swa(uint32_t il) const;
}; };
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 05d58ad9..1252aca1 100644 index ea73a8a7..a012aeae 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -439,6 +439,7 @@ namespace GGUFMeta { @@ -439,6 +439,7 @@ namespace GGUFMeta {
...@@ -155,10 +150,10 @@ index 05d58ad9..1252aca1 100644 ...@@ -155,10 +150,10 @@ index 05d58ad9..1252aca1 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 36a0a009..ad1315c6 100644 index b74dd72c..5fbd0055 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
...@@ -180,7 +175,7 @@ index 36a0a009..ad1315c6 100644 ...@@ -180,7 +175,7 @@ index 36a0a009..ad1315c6 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
...@@ -215,54 +210,12 @@ index 36a0a009..ad1315c6 100644 ...@@ -215,54 +210,12 @@ index 36a0a009..ad1315c6 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { @@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
case LLM_ARCH_GRANITE: }
case LLM_ARCH_GRANITE_MOE: };
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..1afb0024 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -55,6 +55,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -281,6 +282,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext; +struct llm_build_solar : public llm_graph_context {
diff --git a/src/llama.cpp b/src/llama.cpp + llm_build_solar(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7953,9 +7953,155 @@ struct llm_build_context {
cb(img_logits, "img_logits", -1);
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
cb(cur, "result_output", -1);
-
ggml_build_forward_expand(gf, cur);
+ return gf;
+ }
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_rot);
...@@ -270,13 +223,15 @@ index ac85bfed..6d320ea4 100644 ...@@ -270,13 +223,15 @@ index ac85bfed..6d320ea4 100644
+ struct ggml_tensor * cur; + struct ggml_tensor * cur;
+ struct ggml_tensor * inpL; + struct ggml_tensor * inpL;
+ +
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd);
+ +
+ // inp_pos - contains the positions + // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos(); + struct ggml_tensor * inp_pos = build_inp_pos();
+ +
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ +
+ struct ggml_tensor * bskcn_1; + struct ggml_tensor * bskcn_1;
+ struct ggml_tensor * bskcn_2; + struct ggml_tensor * bskcn_2;
...@@ -305,88 +260,94 @@ index ac85bfed..6d320ea4 100644 ...@@ -305,88 +260,94 @@ index ac85bfed..6d320ea4 100644
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)), + ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv)))); + ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ } + }
+
+ // norm + // norm
+ cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il); + cb(cur, "attn_norm", il);
+ +
+ // self-attention + // self-attention
+ { + {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models + // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il); + ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ +
+ // compute Q and K and RoPE them + // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) { + if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ } + }
+ +
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) { + if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ } + }
+ +
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) { + if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur", il);
+ } + }
+ +
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext( + Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow + ext_factor, attn_factor, beta_fast, beta_slow
+ ); + );
+ cb(Qcur, "Qcur", il);
+ +
+ Kcur = ggml_rope_ext( + Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow + ext_factor, attn_factor, beta_fast, beta_slow
+ ); + );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+ +
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ } + }
+ +
+ if (il == n_layer - 1) { + if (il == n_layer - 1) {
+ // skip computing output for unused tokens + // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ } + }
+ +
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il); + cb(ffn_inp, "ffn_inp", il);
+ +
+ // feed-forward network + // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il); + cb(cur, "ffn_norm", il);
+ +
+ cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, + NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il);
+ +
+ cur = ggml_add(ctx0, cur, ffn_inp); + cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il);
+ +
+ cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = build_cvec(cur, il);
+ cb(cur, "l_out", il); + cb(cur, "l_out", il);
+ +
+ // input for next layer + // input for next layer
...@@ -394,25 +355,64 @@ index ac85bfed..6d320ea4 100644 ...@@ -394,25 +355,64 @@ index ac85bfed..6d320ea4 100644
+ } + }
+ +
+ cur = inpL; + cur = inpL;
+ cur = llm_build_norm(ctx0, cur, hparams, +
+ cur = build_norm(cur,
+ model.output_norm, NULL, + model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1); + cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head + // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1); + cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur); + ggml_build_forward_expand(gf, cur);
return gf; + }
} +};
+
@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph( struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
result = llm.build_chameleon(); llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break; } break;
+ case LLM_ARCH_SOLAR: + case LLM_ARCH_SOLAR:
+ { + {
+ result = llm.build_solar(); + llm = std::make_unique<llm_build_solar>(*this, params, gf);
+ } break; + } break;
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
result = llm.build_wavtokenizer_dec(); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
index 0f18dac1..e08d4ae4 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -305,6 +306,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
...@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn ...@@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index dfff21a2..1b0d074b 100644 index 59a49560..b70c6a32 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2284,9 +2284,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2338,9 +2338,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst); ggml_cuda_op_argsort(ctx, dst);
break; break;
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com> From: jmorganca <jmorganca@gmail.com>
Date: Thu, 17 Oct 2024 15:18:22 -0700 Date: Tue, 8 Apr 2025 19:27:12 -0700
Subject: [PATCH] add mllama support Subject: [PATCH] add mllama support
mllama adds cross-attention layers to the standard llama architecture adds support for the llama 3.2 vision architecture
it also requires a way to input a new tensor: cross_attention_state
once per generation
cross-attention layers don't change and so they are cached in the
kv cache once per run
remaining is to implement the cross attention mask
--- ---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +- examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +- ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 + include/llama.h | 6 +
src/llama-arch.cpp | 44 ++++++ src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++ src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 + src/llama-batch.cpp | 3 +
src/llama-context.cpp | 28 ++-- src/llama-context.cpp | 25 ++-
src/llama-context.h | 2 + src/llama-context.h | 1 +
src/llama-cparams.h | 1 + src/llama-cparams.h | 1 +
src/llama-hparams.cpp | 6 + src/llama-graph.cpp | 25 +++
src/llama-hparams.h | 5 + src/llama-graph.h | 12 ++
src/llama-kv-cache.cpp | 13 +- src/llama-hparams.cpp | 4 +
src/llama-hparams.h | 7 +
src/llama-kv-cache.cpp | 12 +-
src/llama-model-loader.cpp | 2 + src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 65 ++++++++- src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++ src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +- src/llama-quant.cpp | 4 +-
src/llama.cpp | 262 +++++++++++++++++++++++++++++++++- 20 files changed, 475 insertions(+), 22 deletions(-)
17 files changed, 452 insertions(+), 22 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a..13127c7b 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 518aad3f..f0e484a1 100644 index 03a22cbb..5eb40bcd 100644
--- a/examples/llava/llava.cpp --- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp
@@ -445,7 +445,7 @@ struct llava_embd_batch { @@ -456,7 +456,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids; std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits; std::vector<int8_t> logits;
llama_batch batch; llama_batch batch;
...@@ -44,7 +61,7 @@ index 518aad3f..f0e484a1 100644 ...@@ -44,7 +61,7 @@ index 518aad3f..f0e484a1 100644
pos .resize(n_tokens); pos .resize(n_tokens);
n_seq_id.resize(n_tokens); n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1); seq_ids .resize(n_tokens + 1);
@@ -457,6 +457,7 @@ struct llava_embd_batch { @@ -468,6 +468,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens, /*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr, /*tokens =*/ nullptr,
/*embd =*/ embd, /*embd =*/ embd,
...@@ -52,7 +69,7 @@ index 518aad3f..f0e484a1 100644 ...@@ -52,7 +69,7 @@ index 518aad3f..f0e484a1 100644
/*pos =*/ pos.data(), /*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(), /*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(), /*seq_id =*/ seq_ids.data(),
@@ -480,7 +481,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ @@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch; n_eval = n_batch;
} }
float * embd = image_embed->embed+i*n_embd; float * embd = image_embed->embed+i*n_embd;
...@@ -61,11 +78,42 @@ index 518aad3f..f0e484a1 100644 ...@@ -61,11 +78,42 @@ index 518aad3f..f0e484a1 100644
if (llama_decode(ctx_llama, llava_batch.batch)) { if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274b..a0e649ad 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -213,7 +213,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -225,6 +225,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -291,7 +292,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t n_tokens = chunk.tokens_image->n_tokens();
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
if (ret != 0) {
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 955ed505..95036ef8 100644 index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -171,9 +171,9 @@ struct ggml_backend_registry { @@ -178,9 +178,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg()); register_backend(ggml_backend_cann_reg());
#endif #endif
...@@ -79,10 +127,10 @@ index 955ed505..95036ef8 100644 ...@@ -79,10 +127,10 @@ index 955ed505..95036ef8 100644
register_backend(ggml_backend_rpc_reg()); register_backend(ggml_backend_rpc_reg());
#endif #endif
diff --git a/include/llama.h b/include/llama.h diff --git a/include/llama.h b/include/llama.h
index 47919602..cc948005 100644 index 5657fbf0..f91896e4 100644
--- a/include/llama.h --- a/include/llama.h
+++ b/include/llama.h +++ b/include/llama.h
@@ -249,6 +249,7 @@ extern "C" { @@ -255,6 +255,7 @@ extern "C" {
llama_token * token; llama_token * token;
float * embd; float * embd;
...@@ -90,7 +138,7 @@ index 47919602..cc948005 100644 ...@@ -90,7 +138,7 @@ index 47919602..cc948005 100644
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
@@ -343,6 +344,7 @@ extern "C" { @@ -357,6 +358,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings bool no_perf; // whether to measure performance timings
...@@ -98,7 +146,7 @@ index 47919602..cc948005 100644 ...@@ -98,7 +146,7 @@ index 47919602..cc948005 100644
// Abort callback // Abort callback
// if it returns true, execution of llama_decode() will be aborted // if it returns true, execution of llama_decode() will be aborted
@@ -443,6 +445,10 @@ extern "C" { @@ -458,6 +460,10 @@ extern "C" {
struct llama_context_params params), struct llama_context_params params),
"use llama_init_from_model instead"); "use llama_init_from_model instead");
...@@ -110,7 +158,7 @@ index 47919602..cc948005 100644 ...@@ -110,7 +158,7 @@ index 47919602..cc948005 100644
LLAMA_API void llama_free(struct llama_context * ctx); LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a1e0ebcc..b6f20286 100644 index 0b0fedcd..c1f78618 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
...@@ -118,19 +166,19 @@ index a1e0ebcc..b6f20286 100644 ...@@ -118,19 +166,19 @@ index a1e0ebcc..b6f20286 100644
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_MLLAMA, "mllama" }, + { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" }, { LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" }, @@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, + { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -225,6 +227,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -269,6 +271,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
+ { + {
...@@ -170,7 +218,7 @@ index a1e0ebcc..b6f20286 100644 ...@@ -170,7 +218,7 @@ index a1e0ebcc..b6f20286 100644
{ {
LLM_ARCH_DECI, LLM_ARCH_DECI,
{ {
@@ -1450,6 +1486,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -1692,6 +1728,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
...@@ -186,18 +234,18 @@ index a1e0ebcc..b6f20286 100644 ...@@ -186,18 +234,18 @@ index a1e0ebcc..b6f20286 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 77919578..ec742224 100644 index 74aa3dd0..f987844d 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -10,6 +10,7 @@ @@ -11,6 +11,7 @@
enum llm_arch { enum llm_arch {
LLM_ARCH_LLAMA, LLM_ARCH_LLAMA,
LLM_ARCH_LLAMA4,
+ LLM_ARCH_MLLAMA, + LLM_ARCH_MLLAMA,
LLM_ARCH_DECI, LLM_ARCH_DECI,
LLM_ARCH_FALCON, LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
@@ -131,6 +132,7 @@ enum llm_kv { @@ -146,6 +147,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
...@@ -205,7 +253,7 @@ index 77919578..ec742224 100644 ...@@ -205,7 +253,7 @@ index 77919578..ec742224 100644
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -314,6 +316,14 @@ enum llm_tensor { @@ -343,6 +345,14 @@ enum llm_tensor {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV, LLM_TENSOR_BSKCN_TV,
...@@ -249,39 +297,66 @@ index 01d5ca57..8682b0e6 100644 ...@@ -249,39 +297,66 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
} }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 47e79ed4..7b22fe13 100644 index 65135172..afe6f552 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -74,10 +74,19 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { @@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) {
} throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
if (ubatch.embd) { - return logits + j*model.vocab.n_tokens();
- const int64_t n_embd = hparams.n_embd; + return logits + j*model.hparams.n_vocab;
- const int64_t n_tokens = ubatch.n_tokens; } catch (const std::exception & err) {
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state)); #ifndef NDEBUG
+ // zero out inp_embd since it's not used @@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) {
+ float * inp_embd_data = (float *)lctx.inp_embd->data; cparams.warmup = value;
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) { }
+ inp_embd_data[i] = 0.0f;
+ }
+ } else {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); +void llama_context::set_cross_attn(bool value) {
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + cparams.cross_attn = value;
+ } +}
} +
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) {
if (ubatch.pos && lctx.inp_pos) { const int64_t n_embd = hparams.n_embd;
@@ -470,12 +479,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
const auto & cparams = lctx.cparams; + sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
const auto & hparams = lctx.model.hparams;
- const auto & vocab = lctx.model.vocab; const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); const llama_batch & batch = batch_allocr.batch;
- const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
- const int32_t n_vocab = vocab.n_tokens();
+ const int32_t n_vocab = hparams.n_vocab;
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
- sbatch.from_batch(batch, n_embd,
+ sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
- const auto & vocab = model.vocab;
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
const auto n_batch = cparams.n_batch; const auto n_batch = cparams.n_batch;
- const auto n_vocab = vocab.n_tokens(); - const auto n_vocab = vocab.n_tokens();
...@@ -289,59 +364,57 @@ index 47e79ed4..7b22fe13 100644 ...@@ -289,59 +364,57 @@ index 47e79ed4..7b22fe13 100644
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead // TODO: use a per-batch flag for logits presence instead
@@ -542,7 +550,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { @@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_output_reorder(struct llama_context & ctx) { void llama_context::output_reorder() {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids; auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) { if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = ctx.model.hparams.n_vocab; + const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = ctx.model.hparams.n_embd; const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab);
const int32_t n_outputs = ctx.n_outputs; io.write(&logits_size, sizeof(logits_size));
@@ -657,6 +665,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn; @@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
} }
+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { +void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
+ ctx->cparams.cross_attn = cross_attention; + ctx->set_cross_attn(cross_attention);
+} +}
+ +
void llama_synchronize(struct llama_context * ctx) { void llama_synchronize(llama_context * ctx) {
ggml_backend_sched_synchronize(ctx->sched.get()); ctx->synchronize();
}
@@ -726,7 +738,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}
- return ctx->logits + j*ctx->model.vocab.n_tokens();
+ return ctx->logits + j*ctx->model.hparams.n_vocab;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -886,7 +898,7 @@ struct llama_data_write {
}
void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
write(&logits_size, sizeof(logits_size));
diff --git a/src/llama-context.h b/src/llama-context.h diff --git a/src/llama-context.h b/src/llama-context.h
index a9268b29..cf12c9d7 100644 index 04facb54..baa03276 100644
--- a/src/llama-context.h --- a/src/llama-context.h
+++ b/src/llama-context.h +++ b/src/llama-context.h
@@ -107,6 +107,8 @@ struct llama_context { @@ -65,6 +65,7 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] void set_embeddings (bool value);
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] void set_causal_attn(bool value);
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] void set_warmup(bool value);
+ + void set_cross_attn(bool value);
+ struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
};
// TODO: make these methods of llama_context void set_adapter_lora(
llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 252012f3..9681e5a0 100644 index 30e550f0..85ad91b9 100644
--- a/src/llama-cparams.h --- a/src/llama-cparams.h
+++ b/src/llama-cparams.h +++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams { @@ -29,6 +29,7 @@ struct llama_cparams {
...@@ -349,37 +422,115 @@ index 252012f3..9681e5a0 100644 ...@@ -349,37 +422,115 @@ index 252012f3..9681e5a0 100644
bool flash_attn; bool flash_attn;
bool no_perf; bool no_perf;
+ bool cross_attn; + bool cross_attn;
bool warmup;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cd955d63..83f3c5a8 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+ if (ubatch->embd) {
+ ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
+ }
+}
+
//
// llm_graph_context
//
@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+ const int64_t n_embd = hparams.n_embd;
+
+ auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+
+ ggml_tensor * cur = nullptr;
+
+ inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+ ggml_set_input(inp->cross_attn_state);
+
+ cur = inp->cross_attn_state;
+
+ cb(cur, "inp_cross_attn_state", -1);
+
+ res->add_input(std::move(inp));
+
+ return cur;
+}
+
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5b6618f9..51993998 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};
class llm_graph_input_pos : public llm_graph_input_i {
@@ -285,6 +286,16 @@ public:
const llama_cross * cross = nullptr;
};
+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+ llm_graph_input_cross_attn_state() = default;
+ virtual ~llm_graph_input_cross_attn_state() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
+};
+
//
// llm_graph_result
//
@@ -493,6 +504,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
+ ggml_tensor * build_inp_cross_attn_state() const;
ggml_tensor * build_inp_cross_embd() const;
ggml_tensor * build_inp_pos_bucket_enc() const;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index f3955de9..0b841028 100644 index 8a667960..6a02de03 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -2,6 +2,8 @@ @@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {
#include "ggml.h"
+#include <algorithm>
+
uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
return n_head_arr[il];
@@ -76,4 +78,8 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
}
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
+} }
+ +
+bool llama_hparams::cross_attention_layers(uint32_t il) const { +bool llama_hparams::cross_attention_layers(uint32_t il) const {
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); + return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
} +}
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 1bdcdfd5..05383046 100644 index c3147cbc..4567a0e9 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -41,6 +41,7 @@ struct llama_hparams { @@ -2,6 +2,8 @@
#include "llama.h"
+#include <algorithm>
+
#include <array>
// bump if necessary
@@ -42,6 +44,7 @@ struct llama_hparams {
uint32_t n_expert = 0; uint32_t n_expert = 0;
uint32_t n_expert_used = 0; uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0; uint32_t n_rel_attn_bkts = 0;
...@@ -387,7 +538,7 @@ index 1bdcdfd5..05383046 100644 ...@@ -387,7 +538,7 @@ index 1bdcdfd5..05383046 100644
// for WavTokenizer // for WavTokenizer
struct llama_hparams_posnet posnet; struct llama_hparams_posnet posnet;
@@ -51,6 +52,7 @@ struct llama_hparams { @@ -52,6 +55,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {}; std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
...@@ -395,21 +546,21 @@ index 1bdcdfd5..05383046 100644 ...@@ -395,21 +546,21 @@ index 1bdcdfd5..05383046 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
@@ -138,6 +140,9 @@ struct llama_hparams { @@ -154,6 +158,9 @@ struct llama_hparams {
// Block skip connection // Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const; bool n_bskcn(uint32_t n, uint32_t il) const;
+
+ // cross attention layers + // cross attention layers
+ bool cross_attention_layers(uint32_t il) const; + bool cross_attention_layers(uint32_t il) const;
+
bool is_swa(uint32_t il) const;
}; };
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index feffdf0d..b541c5a3 100644 index dbf5f118..9310f262 100644
--- a/src/llama-kv-cache.cpp --- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp
@@ -91,8 +91,17 @@ bool llama_kv_cache_init( @@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
return false; return false;
} }
...@@ -425,12 +576,11 @@ index feffdf0d..b541c5a3 100644 ...@@ -425,12 +576,11 @@ index feffdf0d..b541c5a3 100644
+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+ } + }
+
ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i); ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k); k_l.push_back(k);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 1252aca1..45d08721 100644 index a012aeae..2e11507d 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -315,6 +315,8 @@ namespace GGUFMeta { @@ -315,6 +315,8 @@ namespace GGUFMeta {
...@@ -443,10 +593,10 @@ index 1252aca1..45d08721 100644 ...@@ -443,10 +593,10 @@ index 1252aca1..45d08721 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) { bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str()); const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ad1315c6..21819080 100644 index 5fbd0055..d5ad466e 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -401,6 +401,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv // get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false); ml.get_key(LLM_KV_GENERAL_NAME, name, false);
...@@ -454,7 +604,7 @@ index ad1315c6..21819080 100644 ...@@ -454,7 +604,7 @@ index ad1315c6..21819080 100644
// everything past this point is not vocab-related // everything past this point is not vocab-related
if (hparams.vocab_only) { if (hparams.vocab_only) {
@@ -412,6 +413,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
...@@ -462,7 +612,7 @@ index ad1315c6..21819080 100644 ...@@ -462,7 +612,7 @@ index ad1315c6..21819080 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -435,9 +437,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
...@@ -474,7 +624,7 @@ index ad1315c6..21819080 100644 ...@@ -474,7 +624,7 @@ index ad1315c6..21819080 100644
// n_head_kv is optional, default to n_head // n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr; hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -486,7 +490,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
...@@ -483,8 +633,8 @@ index ad1315c6..21819080 100644 ...@@ -483,8 +633,8 @@ index ad1315c6..21819080 100644
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@@ -530,6 +534,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} hparams.use_kq_norm = false;
} }
} break; } break;
+ case LLM_ARCH_MLLAMA: + case LLM_ARCH_MLLAMA:
...@@ -500,7 +650,7 @@ index ad1315c6..21819080 100644 ...@@ -500,7 +650,7 @@ index ad1315c6..21819080 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1398,7 +1412,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff(); const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_embd_gqa = n_embd_v_gqa;
...@@ -509,7 +659,7 @@ index ad1315c6..21819080 100644 ...@@ -509,7 +659,7 @@ index ad1315c6..21819080 100644
const int64_t n_token_types = vocab.n_token_types(); const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot; const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert; const int64_t n_expert = hparams.n_expert;
@@ -1581,6 +1595,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} }
} break; } break;
...@@ -562,107 +712,12 @@ index ad1315c6..21819080 100644 ...@@ -562,107 +712,12 @@ index ad1315c6..21819080 100644
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3925,6 +3985,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { @@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_MLLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 1afb0024..7cf57587 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -9,6 +9,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>
struct llama_model_loader;
@@ -63,6 +64,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -284,6 +286,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+ // cross attention
+ struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb798265..6eb1da08 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -632,7 +632,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
size_t total_size_org = 0;
diff --git a/src/llama.cpp b/src/llama.cpp
index 6d320ea4..8f7902df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -154,6 +154,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
+static struct ggml_tensor * llm_build_inp_cross_attn_state(
+ struct ggml_context * ctx,
+ struct llama_context & lctx,
+ const llama_hparams & hparams,
+ const llm_build_cb & cb) {
+ const int64_t n_embd = hparams.n_embd;
+
+ struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
+ cb(inpCAS, "inp_cross_attn_state", -1);
+ ggml_set_input(inpCAS);
+ lctx.inp_cross_attn_state = inpCAS;
+
+ return inpCAS;
+}
+
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -1157,6 +1172,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
+ lctx.inp_cross_attn_state = nullptr;
}
void free() {
@@ -1639,6 +1655,240 @@ struct llm_build_context {
return gf;
} }
};
+ struct ggml_cgraph * build_mllama() { +struct llm_build_mllama: public llm_graph_context {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens + // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens; + int32_t n_tokens = this->n_tokens;
+ +
...@@ -670,26 +725,26 @@ index 6d320ea4..8f7902df 100644 ...@@ -670,26 +725,26 @@ index 6d320ea4..8f7902df 100644
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_rot);
+ +
+ struct ggml_tensor * cur; + ggml_tensor * cur;
+ struct ggml_tensor * inpL; + ggml_tensor * inpL;
+ struct ggml_tensor * inpCAS; + ggml_tensor * inpCAS;
+ +
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd);
+ inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb); + inpCAS = build_inp_cross_attn_state();
+ +
+ // inp_pos - contains the positions + // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos();
+ +
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + auto * inp_attn = build_attn_inp_kv_unified();
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+ +
+ for (int il = 0; il < n_layer; ++il) { + for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL;
+ +
+ // norm + // norm
+ cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il); + cb(cur, "attn_norm", il);
+ +
+ if (hparams.cross_attention_layers(il)) { + if (hparams.cross_attention_layers(il)) {
...@@ -698,7 +753,7 @@ index 6d320ea4..8f7902df 100644 ...@@ -698,7 +753,7 @@ index 6d320ea4..8f7902df 100644
+ } + }
+ +
+ // cross attention layer + // cross attention layer
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); + ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ +
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
...@@ -707,10 +762,10 @@ index 6d320ea4..8f7902df 100644 ...@@ -707,10 +762,10 @@ index 6d320ea4..8f7902df 100644
+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); + Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ +
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ +
+ struct ggml_tensor * Kcur, * Vcur; + ggml_tensor * Kcur, * Vcur;
+ if (ubatch.embd) { + if (ubatch.embd) {
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); + Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
...@@ -721,10 +776,10 @@ index 6d320ea4..8f7902df 100644 ...@@ -721,10 +776,10 @@ index 6d320ea4..8f7902df 100644
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ +
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ +
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il])); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
+ +
+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); + Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
+ cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur", il);
...@@ -735,12 +790,12 @@ index 6d320ea4..8f7902df 100644 ...@@ -735,12 +790,12 @@ index 6d320ea4..8f7902df 100644
+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); + Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
+ cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur", il);
+ +
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il])); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
+ } else { + } else {
+ Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]); + Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
+ cb(Kcur, "Kcur (view)", il); + cb(Kcur, "Kcur (view)", il);
+ +
+ Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]); + Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
+ cb(Vcur, "Vcur (view)", il); + cb(Vcur, "Vcur (view)", il);
+ } + }
+ +
...@@ -773,24 +828,24 @@ index 6d320ea4..8f7902df 100644 ...@@ -773,24 +828,24 @@ index 6d320ea4..8f7902df 100644
+ cb(ffn_inp, "ffn_inp", il); + cb(ffn_inp, "ffn_inp", il);
+ +
+ // feed-forward network + // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il); + cb(cur, "ffn_norm", il);
+ +
+ cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, + NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il);
+ +
+ // TODO: do this inplace once? + // TODO: do this inplace once?
+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); + cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
+ cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il);
+ +
+ cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = build_cvec(cur, il);
+ cb(cur, "l_out", il); + cb(cur, "l_out", il);
+ +
+ // input for next layer + // input for next layer
...@@ -799,48 +854,53 @@ index 6d320ea4..8f7902df 100644 ...@@ -799,48 +854,53 @@ index 6d320ea4..8f7902df 100644
+ // self attention layer + // self attention layer
+ +
+ // rope freq factors for llama3; may return nullptr for llama2 and other models + // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il); + ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ +
+ // compute Q and K and RoPE them + // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) { + if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur", il);
+ } + }
+ +
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) { + if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ } + }
+ +
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) { + if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur", il);
+ } + }
+ +
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext( + Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow + ext_factor, attn_factor, beta_fast, beta_slow
+ ); + );
+ cb(Qcur, "Qcur", il);
+ +
+ Kcur = ggml_rope_ext( + Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow + ext_factor, attn_factor, beta_fast, beta_slow
+ ); + );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+ +
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ +
+ if (il == n_layer - 1) { + if (il == n_layer - 1) {
+ // skip computing output for unused tokens + // skip computing output for unused tokens
...@@ -854,23 +914,23 @@ index 6d320ea4..8f7902df 100644 ...@@ -854,23 +914,23 @@ index 6d320ea4..8f7902df 100644
+ cb(ffn_inp, "ffn_inp", il); + cb(ffn_inp, "ffn_inp", il);
+ +
+ // feed-forward network + // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il); + cb(cur, "ffn_norm", il);
+ +
+ cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL, + NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il);
+ +
+ cur = ggml_add(ctx0, cur, ffn_inp); + cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il); + cb(cur, "ffn_out", il);
+ +
+ cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = build_cvec(cur, il);
+ cb(cur, "l_out", il); + cb(cur, "l_out", il);
+ +
+ // input for next layer + // input for next layer
...@@ -880,74 +940,93 @@ index 6d320ea4..8f7902df 100644 ...@@ -880,74 +940,93 @@ index 6d320ea4..8f7902df 100644
+ +
+ cur = inpL; + cur = inpL;
+ +
+ cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur,
+ model.output_norm, NULL, + model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1);
+ cb(cur, "result_norm", -1); + cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+ +
+ // lm_head + // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1); + cb(cur, "result_output", -1);
+ res->t_logits = cur;
+ +
+ ggml_build_forward_expand(gf, cur); + ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ } + }
+};
+ +
struct ggml_cgraph * build_deci() { struct llm_build_deci : public llm_graph_context {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -8344,6 +8594,10 @@ static struct ggml_cgraph * llama_build_graph( @@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
result = llm.build_llama(); llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break; } break;
+ case LLM_ARCH_MLLAMA: + case LLM_ARCH_MLLAMA:
+ { + {
+ result = llm.build_mllama(); + llm = std::make_unique<llm_build_mllama>(*this, params, gf);
+ } break; + } break;
case LLM_ARCH_DECI: case LLM_ARCH_DECI:
{ {
result = llm.build_deci(); llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -8634,7 +8888,7 @@ static int llama_prepare_sbatch( @@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
n_outputs = 1; // use what we call a normal RoPE, operating on pairs of consecutive head values
} case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
- lctx.sbatch.from_batch(batch, n_embd, + case LLM_ARCH_MLLAMA:
+ lctx.sbatch.from_batch(batch, batch.n_embd, case LLM_ARCH_DECI:
/* simple_split */ !lctx.kv_self.recurrent, case LLM_ARCH_BAICHUAN:
/* logits_all */ n_outputs == n_tokens_all); case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
@@ -8749,7 +9003,6 @@ static int llama_decode_impl( index e08d4ae4..21c4617b 100644
const llama_batch & batch = batch_allocr.batch; --- a/src/llama-model.h
+++ b/src/llama-model.h
const auto & model = lctx.model; @@ -11,6 +11,7 @@
- const auto & vocab = model.vocab; #include <string>
const auto & hparams = model.hparams; #include <unordered_map>
const auto & cparams = lctx.cparams; #include <vector>
+#include <stdexcept>
@@ -8760,7 +9013,7 @@ static int llama_decode_impl(
llama_kv_slot_restorer kv_slot_restorer(kv_self);
const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_vocab = hparams.n_vocab;
uint32_t n_outputs = 0; struct llama_cparams;
uint32_t n_outputs_prev = 0; struct llama_ubatch;
@@ -9025,7 +9278,7 @@ static int llama_encode_impl( @@ -70,6 +71,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -308,6 +310,16 @@ struct llama_layer {
const int64_t n_embd = hparams.n_embd; struct ggml_tensor * bskcn_tv = nullptr;
- lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + // cross attention
+ lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + struct ggml_tensor * cross_attn_k_norm = nullptr;
+ struct ggml_tensor * cross_attn_k_proj = nullptr;
+ struct ggml_tensor * cross_attn_o_proj = nullptr;
+ struct ggml_tensor * cross_attn_q_norm = nullptr;
+ struct ggml_tensor * cross_attn_q_proj = nullptr;
+ struct ggml_tensor * cross_attn_v_proj = nullptr;
+ struct ggml_tensor * cross_attn_attn_gate = nullptr;
+ struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
struct llama_layer_posnet posnet;
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc54227..223e1f3f 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+ if (qs.n_attention_wv != n_attn_layer) {
+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+ }
}
@@ -9511,6 +9764,7 @@ struct llama_context_params llama_context_default_params() { size_t total_size_org = 0;
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me> From: jmorganca <jmorganca@gmail.com>
Date: Thu, 17 Oct 2024 17:19:25 -0700 Date: Sun, 13 Apr 2025 22:10:06 -0400
Subject: [PATCH] add unpad operator Subject: [PATCH] add unpad operator
adds the unpad operator to GGML
--- ---
ggml/include/ggml.h | 10 +++++ ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu.c | 5 +++
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ops.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++ ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 + ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++ ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++- ggml/src/ggml.c | 25 ++++++++++++-
8 files changed, 220 insertions(+), 2 deletions(-) 10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index dd0c6a96..8d269a9c 100644 index 8fcc16df..d19fc167 100644
--- a/ggml/include/ggml.h --- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h +++ b/ggml/include/ggml.h
@@ -487,6 +487,7 @@ extern "C" { @@ -488,6 +488,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D, GGML_OP_PAD_REFLECT_1D,
...@@ -26,7 +29,7 @@ index dd0c6a96..8d269a9c 100644 ...@@ -26,7 +29,7 @@ index dd0c6a96..8d269a9c 100644
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@@ -1743,6 +1744,15 @@ extern "C" { @@ -1757,6 +1758,15 @@ extern "C" {
int p0, int p0,
int p1); int p1);
...@@ -43,13 +46,38 @@ index dd0c6a96..8d269a9c 100644 ...@@ -43,13 +46,38 @@ index dd0c6a96..8d269a9c 100644
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 72325349..2f606d82 100644 index 50400328..432942bf 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d( @@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6050147b..66b8da68 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
} }
} }
+// ggml_compute_forward_unpad
+
+static void ggml_compute_forward_unpad_f32( +static void ggml_compute_forward_unpad_f32(
+ const struct ggml_compute_params *params, + const struct ggml_compute_params *params,
+ struct ggml_tensor *dst) { + struct ggml_tensor *dst) {
...@@ -85,7 +113,7 @@ index 72325349..2f606d82 100644 ...@@ -85,7 +113,7 @@ index 72325349..2f606d82 100644
+ } + }
+} +}
+ +
+static void ggml_compute_forward_unpad( +void ggml_compute_forward_unpad(
+ const struct ggml_compute_params * params, + const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) { + struct ggml_tensor * dst) {
+ +
...@@ -106,30 +134,23 @@ index 72325349..2f606d82 100644 ...@@ -106,30 +134,23 @@ index 72325349..2f606d82 100644
// ggml_compute_forward_arange // ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32( static void ggml_compute_forward_arange_f32(
@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
{ index 410a3720..3eca1cf8 100644
ggml_compute_forward_pad_reflect_1d(params, tensor); --- a/ggml/src/ggml-cpu/ops.h
} break; +++ b/ggml/src/ggml-cpu/ops.h
+ case GGML_OP_UNPAD: @@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
+ { void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ ggml_compute_forward_unpad(params, tensor); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ } break; void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
case GGML_OP_ARANGE: +void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
{ void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
ggml_compute_forward_arange(params, tensor); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 1b0d074b..c7a957c8 100644 index b70c6a32..67208cba 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2200,6 +2200,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg @@ -2245,6 +2245,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst); ggml_cuda_op_pad(ctx, dst);
break; break;
...@@ -139,16 +160,16 @@ index 1b0d074b..c7a957c8 100644 ...@@ -139,16 +160,16 @@ index 1b0d074b..c7a957c8 100644
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst); ggml_cuda_op_arange(ctx, dst);
break; break;
@@ -3199,6 +3202,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g @@ -3223,6 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD: case GGML_OP_PAD:
+ case GGML_OP_UNPAD: + case GGML_OP_UNPAD:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8..b4b87409 100644 index 77432b04..7d45a7e1 100644
--- a/ggml/src/ggml-cuda/pad.cu --- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
...@@ -212,10 +233,10 @@ index 8fd386b0..e2ededc3 100644 ...@@ -212,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index fd9a4e77..e4c093f9 100644 index 310afe8a..b121ab9e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
...@@ -223,23 +244,23 @@ index fd9a4e77..e4c093f9 100644 ...@@ -223,23 +244,23 @@ index fd9a4e77..e4c093f9 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -946,6 +947,7 @@ @implementation GGMLMetalClass @@ -998,6 +999,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex @@ -1339,6 +1341,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE: case GGML_OP_POOL_2D:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D: case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD: + case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node( case GGML_OP_LEAKY_RELU:
@@ -3669,6 +3672,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
...@@ -277,10 +298,10 @@ index fd9a4e77..e4c093f9 100644 ...@@ -277,10 +298,10 @@ index fd9a4e77..e4c093f9 100644
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index d092a169..f38909d0 100644 index b08666e2..e3185e5b 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal --- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -2968,6 +2968,51 @@ kernel void kernel_pad_reflect_1d_f32(
} }
} }
...@@ -331,12 +352,12 @@ index d092a169..f38909d0 100644 ...@@ -331,12 +352,12 @@ index d092a169..f38909d0 100644
+ +
kernel void kernel_arange_f32( kernel void kernel_arange_f32(
device char * dst, device char * dst,
constant int64_t & ne0, constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7fc06724..635aa299 100644 index 950772c7..2276b631 100644
--- a/ggml/src/ggml.c --- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c +++ b/ggml/src/ggml.c
@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"PAD_REFLECT_1D", "PAD_REFLECT_1D",
...@@ -344,16 +365,16 @@ index 7fc06724..635aa299 100644 ...@@ -344,16 +365,16 @@ index 7fc06724..635aa299 100644
"ARANGE", "ARANGE",
"TIMESTEP_EMBEDDING", "TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"pad_reflect_1d(x)", "pad_reflect_1d(x)",
...@@ -361,16 +382,16 @@ index 7fc06724..635aa299 100644 ...@@ -361,16 +382,16 @@ index 7fc06724..635aa299 100644
"arange(start, stop, step)", "arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)", "timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { @@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( @@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result; return result;
} }
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com> From: jmorganca <jmorganca@gmail.com>
Date: Fri, 25 Oct 2024 16:25:18 -0700 Date: Tue, 8 Apr 2025 19:43:06 -0700
Subject: [PATCH] fix deepseek deseret regex Subject: [PATCH] fix deepseek deseret regex
On windows compiled with gcc the c++ regex library failed to handle on some systems, deepseek's regex would throw an error
the characters on windows due to the deseret characters in the matching
regex
--- ---
src/llama-vocab.cpp | 2 +- src/llama-vocab.cpp | 2 +-
src/unicode.cpp | 22 ++++++++++++++++++++++ src/unicode.cpp | 21 +++++++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a4eee9b8..1ca827eb 100644 index 0125ee53..d74919d2 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
regex_exprs = { regex_exprs = {
"[\r\n]", "[\r\n]",
...@@ -24,7 +25,7 @@ index a4eee9b8..1ca827eb 100644 ...@@ -24,7 +25,7 @@ index a4eee9b8..1ca827eb 100644
"\\s+$", "\\s+$",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp diff --git a/src/unicode.cpp b/src/unicode.cpp
index e63bb4ab..9dd53b9a 100644 index e63bb4ab..73cb2b1a 100644
--- a/src/unicode.cpp --- a/src/unicode.cpp
+++ b/src/unicode.cpp +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@ @@ -2,6 +2,11 @@
...@@ -39,7 +40,7 @@ index e63bb4ab..9dd53b9a 100644 ...@@ -39,7 +40,7 @@ index e63bb4ab..9dd53b9a 100644
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"
@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() { @@ -200,6 +205,21 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
} }
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
...@@ -58,11 +59,10 @@ index e63bb4ab..9dd53b9a 100644 ...@@ -58,11 +59,10 @@ index e63bb4ab..9dd53b9a 100644
+ free(wbuf); + free(wbuf);
+ return ret; + return ret;
+#else +#else
+
#if defined(__clang__) #if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8 // disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push # pragma clang diagnostic push
@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { @@ -213,6 +233,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif #endif
return conv.from_bytes(s); return conv.from_bytes(s);
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: ParthSareen <parth.sareen@ollama.com> From: jmorganca <jmorganca@gmail.com>
Date: Wed, 11 Dec 2024 15:37:32 -0800 Date: Tue, 8 Apr 2025 19:43:40 -0700
Subject: [PATCH] Maintain ordering for rules for grammar Subject: [PATCH] maintain ordering for rules for grammar
--- ---
common/json-schema-to-grammar.cpp | 2 +- common/json-schema-to-grammar.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 3ebcc3d9..30c28808 100644 index 90679822..56043678 100644
--- a/common/json-schema-to-grammar.cpp --- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp
@@ -346,7 +346,7 @@ private: @@ -346,7 +346,7 @@ private:
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 15 Apr 2025 14:27:40 -0400
Subject: [PATCH] ensure KV cache is fully defragmented
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama-context.cpp | 105 +++++++++++++----------------------------
src/llama-context.h | 4 +-
src/llama-kv-cache.cpp | 39 +++------------
src/llama-kv-cache.h | 9 +++-
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index afe6f552..d6e7b3af 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -590,13 +590,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const {
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const {
auto res = std::make_unique<llm_graph_result>();
const auto & hparams = model.hparams;
- const auto & ids = kv_self->defrag_info.ids;
-
#if 0
// CPU defrag
//
@@ -668,32 +667,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
+ for (const auto & move : moves) {
for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -701,34 +688,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, i));
+ ggml_row_size(kv_self->v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- ggml_row_size(kv_self->v_l[il]->type, id));
+ ggml_row_size(kv_self->v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif
return res;
@@ -737,8 +720,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
- bool need_reserve = false;
-
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -759,8 +740,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
-
- need_reserve = true;
}
{
@@ -775,49 +754,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ const uint32_t n_max_nodes = graph_max_nodes();
+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+ if (!kv->defrag_prepare(n_max_nodes)) {
+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+ return;
+ }
- if (kv->defrag_prepare(graph_max_nodes())) {
- ggml_backend_sched_reset(sched.get());
+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ ggml_backend_sched_reset(sched.get());
auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
ggml_backend_sched_alloc_graph(sched.get(), gf);
-
res->set_inputs(nullptr);
-
graph_compute(gf, false);
-
- need_reserve = true;
}
kv->do_defrag = false;
}
-
- // reserve a worst case graph if needed
- if (need_reserve) {
- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
-
- // build worst-case graph
- uint32_t n_seqs = 1; // TODO: worst-case number of sequences
- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- // simulate full KV cache
- kv_self->n = kv_self->size;
-
- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-
- auto * gf = graph_init();
- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
-
- // initialize scheduler with the worst-case graph
- ggml_backend_sched_reset(sched.get());
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
- }
- }
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1301,9 +1259,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag();
+ kv_self_update();
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index baa03276..a59ff8fd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
+#include "llama-kv-cache.h"
#include "ggml-cpp.h"
@@ -180,7 +181,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
- ggml_cgraph * gf) const;
+ ggml_cgraph * gf,
+ const std::vector<struct llama_kv_defrag_move> & moves) const;
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 9310f262..5c941e7c 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
assert(n_used <= n_kv);
- //const int64_t t_start = ggml_time_us();
-
- // number of cells moved
- uint32_t n_moves = 0;
-
- // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
- // - source view, destination view, copy operation
- // - x2 for keys and values
- //const uint32_t max_moves = max_nodes()/(6*n_layer);
- // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
+ defrag_info.moves.clear();
// determine which KV cells to move where
//
@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
//
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
//
- auto & ids = defrag_info.ids;
-
- ids.clear();
- ids.resize(n_kv, n_kv);
+ std::vector<uint32_t> ids(n_kv, n_kv);
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = cells[i0];
@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
// are we moving a continuous block of memory?
bool cont = false;
- // should we stop searching for the next move?
- bool stop = false;
-
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
cont = false;
continue;
}
@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
head = n_used;
if (!cont) {
- n_moves++;
+ defrag_info.moves.push_back({i1, i0 + nf, 1});
cont = true;
+ } else {
+ defrag_info.moves.back().len++;
}
nf++;
@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
}
}
- if (stop || n_moves == max_moves) {
- break;
- }
-
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
- if (n_moves == 0) {
+ if (defrag_info.moves.size() == 0) {
return false;
}
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
return true;
}
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 56c74035..25cbcb56 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -43,6 +43,13 @@ private:
llama_kv_cache * kv;
};
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
@@ -131,7 +138,7 @@ public:
// defrag
struct {
- std::vector<uint32_t> ids;
+ std::vector<llama_kv_defrag_move> moves;
} defrag_info;
// return true if cells have been moved
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
}
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -1230,35 +1237,23 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
+ for (const auto & move : moves) {
for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -1266,31 +1261,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, i));
+ ggml_row_size(kv_self.v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, id));
+ ggml_row_size(kv_self.v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -8508,7 +8501,7 @@ struct llm_build_context {
}
};
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
- struct ggml_cgraph * result = llm.build_defrag(ids);
+ struct ggml_cgraph * result = llm.build_defrag(moves);
llm.free();
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
kv_self.head = 0;
}
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
- // number of cells moved
- uint32_t n_moves = 0;
+ // groups of cells moved
+ std::vector<struct llama_kv_defrag_move> moves;
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
- // should we stop searching for the next move?
- bool stop = false;
-
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
cont = false;
continue;
}
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
- n_moves++;
+ moves.push_back({i1, i0 + nf, 1});
cont = true;
+ } else {
+ moves.back().len++;
}
nf++;
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
}
}
- if (stop || n_moves == max_moves) {
- break;
- }
-
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
- if (n_moves == 0) {
+ if (moves.size() == 0) {
return;
}
- //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
- //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
#if 0
// CPU defrag
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
#else
// ggml_graph defrag
- ggml_backend_sched_reset(lctx.sched.get());
+ for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, moves.size());
+ chunk.assign(moves.begin() + i, moves.begin() + end);
- ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+ ggml_backend_sched_reset(lctx.sched.get());
+
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ }
#endif
//const int64_t t_end = ggml_time_us();
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment