"docs/get_started/installation.md" did not exist on "823b62ed9478049c5e977101f373eaf4e60ac98c"
Unverified commit d7d7e996, authored by Jeffrey Morgan, committed by GitHub

llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)

parent 2db96c18
@@ -15,6 +15,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
@@ -30,6 +31,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_LLAMA_3,
     LLM_CHAT_TEMPLATE_CHATGML_3,
     LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
     LLM_CHAT_TEMPLATE_RWKV_WORLD,
......
#include "llama-context.h" #include "llama-context.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include <cassert> #include <cassert>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
...@@ -513,7 +516,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { ...@@ -513,7 +516,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
auto * buft = ggml_backend_cpu_buffer_type(); auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
auto * output_dev = lctx.model.dev_output.dev; auto * output_dev = lctx.model.dev_output();
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
if (output_dev_host_buft) { if (output_dev_host_buft) {
buft = output_dev_host_buft; buft = output_dev_host_buft;
......
@@ -22,12 +22,12 @@ struct llama_context {
     const struct llama_model & model;

     struct llama_cparams cparams;
     struct llama_sbatch  sbatch;  // TODO: revisit if needed
     struct llama_kv_cache kv_self;
-    struct llama_control_vector cvec;
+    struct llama_adapter_cvec cvec;

-    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+    std::unordered_map<struct llama_adapter_lora *, float> lora;

     std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
......
This diff is collapsed.
@@ -114,6 +114,15 @@ struct llama_grammar {
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
+
+    // lazy grammars wait for trigger words or tokens before constraining the sampling.
+    // we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
+    // (useful e.g. for tool_choice=required)
+    bool                     lazy             = false;
+    bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
+    std::string              trigger_buffer;           // Output buffered by lazy grammar. Will be cleared once trigger is found.
+    std::vector<llama_token> trigger_tokens;           // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
+    std::vector<std::string> trigger_words;
 };

 //
@@ -127,7 +136,15 @@ struct llama_grammar * llama_grammar_init_impl(
         size_t n_rules,
         size_t start_rule_index);

-struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const char * grammar_str,
+        const char * grammar_root,
+        bool lazy,
+        const char ** trigger_words,
+        size_t num_trigger_words,
+        const llama_token * trigger_tokens,
+        size_t num_trigger_tokens);

 void llama_grammar_free_impl(struct llama_grammar * grammar);
@@ -141,3 +158,7 @@ void llama_grammar_apply_impl(
 void llama_grammar_accept_impl(
         struct llama_grammar & grammar,
         llama_token token);
+
+void llama_grammar_accept_str(
+        struct llama_grammar & grammar,
+        const std::string & piece);
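For orientation, a minimal usage sketch of the extended constructor follows. The vocab pointer, grammar text, and trigger word are placeholders, and the sketch assumes access to this internal header rather than the public llama.h API.

```cpp
#include "llama-grammar.h"

// Sketch only: build a lazy grammar that stays dormant until "<tool_call>" appears
// in the output, then constrains sampling to the GBNF rules rooted at "root".
static llama_grammar * make_lazy_grammar(const llama_vocab * vocab /* assumed available */) {
    const char * gbnf       = "root ::= \"yes\" | \"no\"";  // placeholder grammar text
    const char * triggers[] = { "<tool_call>" };             // placeholder trigger word

    return llama_grammar_init_impl(
        vocab,
        gbnf,
        "root",
        /*lazy*/ true,
        triggers, /*num_trigger_words*/ 1,
        /*trigger_tokens*/ nullptr, /*num_trigger_tokens*/ 0);
}

// Decoded text pieces would then be fed with llama_grammar_accept_str(*grammar, piece);
// a lazy grammar buffers them in trigger_buffer until a trigger is seen.
```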
@@ -54,7 +54,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
 uint32_t llama_hparams::n_embd_k_s() const {
     if (wkv_head_size != 0) {
         // for RWKV models
-        return 2 * n_embd;
+        return token_shift_count * n_embd;
     }

     // TODO: maybe support other convolution strides than 1
@@ -82,4 +82,4 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
 bool llama_hparams::cross_attention_layers(uint32_t il) const {
     return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
-}
\ No newline at end of file
+}
@@ -30,7 +30,6 @@ struct llama_hparams {
     bool use_par_res;
     bool swin_norm;

-    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
@@ -41,8 +40,8 @@ struct llama_hparams {
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
-    uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
+    uint32_t n_vocab = 0;

     // for WavTokenizer
     struct llama_hparams_posnet posnet;
@@ -79,6 +78,7 @@ struct llama_hparams {
     uint32_t time_mix_extra_dim = 0;
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
+    uint32_t token_shift_count = 2;

     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
@@ -141,7 +141,7 @@ struct llama_hparams {
     // Block skip connection
     bool n_bskcn(uint32_t n, uint32_t il) const;

     // cross attention layers
     bool cross_attention_layers(uint32_t il) const;
 };
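A rough sketch of what the new token_shift_count field feeds into, mirroring n_embd_k_s() above; the helper name is invented for illustration.

```cpp
#include "llama-hparams.h"

// Sketch: the RWKV token-shift state per layer is token_shift_count rows of n_embd
// values (previously a hardcoded factor of 2), so the per-sequence element count is:
static uint32_t rwkv_token_shift_elems(const llama_hparams & hparams) {
    return hparams.token_shift_count * hparams.n_embd;  // e.g. 2 * n_embd by default
}
```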
......
#include "llama-impl.h" #include "llama-impl.h"
#include "gguf.h"
#include "llama.h" #include "llama.h"
#include <cinttypes> #include <cinttypes>
...@@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { ...@@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
{ {
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i); int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i); const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss; std::stringstream ss;
ss << "["; ss << "[";
for (int j = 0; j < arr_n; j++) { for (int j = 0; j < arr_n; j++) {
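The nullptr guard exists because string arrays are not stored as a flat data blob, so gguf_get_arr_data() is only meaningful for numeric element types. A hedged sketch using the public gguf accessors (exact types depend on the gguf.h vintage):

```cpp
#include "gguf.h"
#include <cstdio>

// Sketch: numeric arrays can be read through gguf_get_arr_data(), but string arrays
// must be walked element by element with gguf_get_arr_str().
static void print_gguf_string_array(const struct gguf_context * ctx, int64_t key_id) {
    if (gguf_get_arr_type(ctx, key_id) != GGUF_TYPE_STRING) {
        return; // not a string array
    }
    const size_t n = gguf_get_arr_n(ctx, key_id);
    for (size_t i = 0; i < n; ++i) {
        printf("%s\n", gguf_get_arr_str(ctx, key_id, i));
    }
}
```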
......
@@ -6,13 +6,13 @@
 #include <vector>

 #ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    if defined(__MINGW32__) && !defined(__clang__)
+#        define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#    else
+#        define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#    endif
 #else
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
+#    define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

 //
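For context, the macro lets GCC/Clang type-check printf-style variadic arguments on internal helpers; a minimal sketch of how it is typically applied (the declaration below roughly mirrors the logging helper in llama-impl.h):

```cpp
// Sketch: argument 2 is the format string, arguments from position 3 onward are
// checked against it at compile time when the attribute is available.
LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal(ggml_log_level level, const char * format, ...);
```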
......
@@ -72,39 +72,6 @@ bool llama_kv_cache_init(
     cache.v_l.reserve(n_layer);

     for (int i = 0; i < n_layer; i++) {
-        // for cross attention layers
-        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
-            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
-            const llama_model::buft_list_t * buft_list;
-            if (offload) {
-                buft_list = model.dev_layer.at(i).buft_list;
-            } else {
-                buft_list = &model.cpu_buft_list;
-            }
-            ggml_backend_buffer_type_t buft = select_buft(*buft_list,
-                [&](ggml_context * ctx) {
-                    ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-                    if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
-                        return k;
-                    }
-                    ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-                    return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
-                });
-            ggml_context * ctx = ctx_for_buft(buft);
-            if (!ctx) {
-                LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
-                return false;
-            }
-            ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
-            ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
-            ggml_format_name(k, "cache_k_l%d", i);
-            ggml_format_name(v, "cache_v_l%d", i);
-            cache.k_l.push_back(k);
-            cache.v_l.push_back(v);
-            continue;
-        }
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -112,7 +79,7 @@ bool llama_kv_cache_init(
         ggml_backend_buffer_type_t buft;
         if (offload) {
-            auto * dev = model.dev_layer.at(i).dev;
+            auto * dev = model.dev_layer(i);
             buft = ggml_backend_dev_buffer_type(dev);
         } else {
             buft = ggml_backend_cpu_buffer_type();
@@ -124,8 +91,17 @@ bool llama_kv_cache_init(
             return false;
         }

-        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_tensor * k, *v;
+
+        // for cross attention layers
+        if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
+            k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
+            v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
+        } else {
+            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+            v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        }
+
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -152,10 +128,10 @@ bool llama_kv_cache_init(
 struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
-       const struct llama_ubatch & batch) {
-    const uint32_t n_tokens = batch.n_tokens;
-    const uint32_t n_seqs   = batch.n_seqs;
-    const uint32_t n_seq_tokens = batch.n_seq_tokens;
+       const struct llama_ubatch & ubatch) {
+    const uint32_t n_tokens = ubatch.n_tokens;
+    const uint32_t n_seqs   = ubatch.n_seqs;
+    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

     if (cache.recurrent) {
         // For recurrent state architectures (like Mamba or RWKV),
@@ -163,16 +139,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
         // A slot should be always be contiguous.

         // can only process batches with an equal number of new tokens in each sequence
-        GGML_ASSERT(batch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs);

         int32_t min = cache.size - 1;
         int32_t max = 0;

         // everything should fit if all seq_ids are smaller than the max
         for (uint32_t s = 0; s < n_seqs; ++s) {
-            const uint32_t n_seq_id = batch.n_seq_id[s];
+            const uint32_t n_seq_id = ubatch.n_seq_id[s];
             for (uint32_t j = 0; j < n_seq_id; ++j) {
-                const llama_seq_id seq_id = batch.seq_id[s][j];
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];

                 if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
                     // too big seq_id
@@ -231,7 +207,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
         // find usable cell range
         for (uint32_t s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
             llama_kv_cell & seq_meta = cache.cells[seq_id];
             bool has_cell = false;
             if (seq_meta.tail >= 0) {
@@ -270,7 +246,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
         // gather and re-order
         for (uint32_t s = 0; s < n_seqs; ++s) {
             int32_t dst_id = s + min;
-            int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
+            int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
             if (dst_id != src_id) {
                 llama_kv_cell & dst_cell = cache.cells[dst_id];
                 llama_kv_cell & src_cell = cache.cells[src_id];
@@ -291,7 +267,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
         // update the pos of the used seqs
         for (uint32_t s = 0; s < n_seqs; ++s) {
-            const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+            const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
             int32_t cell_id = s + min;
             llama_kv_cell & cell = cache.cells[cell_id];
@@ -299,12 +275,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
                 // What should happen when the pos backtracks or skips a value?
                 // Clearing the state mid-batch would require special-casing which isn't done.
                 LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
-                    __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
+                    __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
             }
             cell.pos = last_pos;
             cell.seq_id.clear();
-            for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
-                const llama_seq_id seq_id = batch.seq_id[s][j];
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];
                 cell.seq_id.insert(seq_id);
                 cache.cells[seq_id].tail = cell_id;
             }
@@ -358,10 +334,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
     for (uint32_t s = 0; s < n_seqs; s++) {
         for (uint32_t i = 0; i < n_seq_tokens; ++i) {
             uint32_t k = s*n_seq_tokens + i;
-            cache.cells[cache.head + k].pos = batch.pos[k];
+            cache.cells[cache.head + k].pos = ubatch.pos[k];

-            for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
-                cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
+                cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
             }
         }
     }
......
@@ -37,7 +37,7 @@ struct llama_kv_cache {
     bool can_shift = false;

     // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_internal also uses it, so it
+    // for a free KV slot. llama_decode_impl also uses it, so it
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
......
@@ -7,6 +7,7 @@
 #include <cstring>
 #include <climits>
 #include <stdexcept>
+#include <cerrno>

 #ifdef __has_include
     #if __has_include(<unistd.h>)
@@ -35,7 +36,7 @@
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
@@ -241,12 +242,16 @@ llama_file::~llama_file() = default;
 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }

-int llama_file::fileno() const {
+int llama_file::file_id() const {
 #ifdef _WIN32
     return _fileno(pimpl->fp);
+#else
+#if defined(fileno)
+    return fileno(pimpl->fp);
 #else
     return ::fileno(pimpl->fp);
 #endif
+#endif
 }

 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
@@ -265,7 +270,7 @@ struct llama_mmap::impl {
     impl(struct llama_file * file, size_t prefetch, bool numa) {
         size = file->size();
-        int fd = file->fileno();
+        int fd = file->file_id();
         int flags = MAP_SHARED;
         if (numa) { prefetch = 0; }
 #ifdef __linux__
@@ -357,7 +362,7 @@ struct llama_mmap::impl {
         size = file->size();

-        HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno());
+        HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());

         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
......
 #pragma once

+#include <cstdint>
 #include <memory>
 #include <vector>
@@ -18,7 +19,7 @@ struct llama_file {
     size_t tell() const;
     size_t size() const;

-    int fileno() const;
+    int file_id() const; // fileno overload

     void seek(size_t offset, int whence) const;
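The rename sidesteps the fact that some C libraries define fileno as a macro in <stdio.h>, which would mangle a member function of the same name. A hedged, POSIX-style illustration (the wrapper type is invented; the real code additionally special-cases _WIN32):

```cpp
#include <cstdio>

// Sketch: with the rename, the implementation can still call whichever form of
// fileno the platform provides (macro or function), as the patched llama-mmap.cpp does.
struct file_wrapper {
    FILE * fp = nullptr;

    int file_id() const {
#if defined(fileno)
        return fileno(fp);   // fileno provided as a macro
#else
        return ::fileno(fp); // fileno provided as a function
#endif
    }
};
```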
......
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>

+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,8 +21,78 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }

+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:        return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:     return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:    return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:    return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:    return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:    return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:    return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:    return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:  return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:  return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:  return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:   return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:   return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
+// return a list of splits for a given path
+// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
+static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
+    std::vector<std::string> paths;
+    std::string split_prefix;
+    std::vector<char> buf(llama_path_max(), 0);
+
+    {
+        int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
+        if (!ret) {
+            throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
+        }
+        split_prefix = std::string(buf.data(), ret);
+    }
+
+    if (split_prefix.empty()) {
+        throw std::runtime_error(format("invalid split file: %s", path.c_str()));
+    }
+
+    for (int idx = 0; idx < n_split; ++idx) {
+        int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
+        paths.push_back(std::string(buf.data(), ret));
+    }
+
+    return paths;
+}
+
 namespace GGUFMeta {
-    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
         static constexpr gguf_type gt = gt_;
@@ -60,10 +134,11 @@ namespace GGUFMeta {
     public:
         static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
         static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
             return ArrayInfo {
-                gguf_get_arr_type(ctx, k),
+                arr_type,
                 size_t(gguf_get_arr_n(ctx, k)),
-                gguf_get_arr_data(ctx, k),
+                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
             };
         }
     };
@@ -368,7 +443,12 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<uint32_t>(const std::string & key, std::array<uint32_t, 512> & result, uint32_t n, bool required);

-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
+llama_model_loader::llama_model_loader(
+        const std::string & fname,
+        std::vector<std::string> & splits,
+        bool use_mmap,
+        bool check_tensors,
+        const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -380,6 +460,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
         }
     }

+    // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {
         /*.no_alloc = */ true,
@@ -415,35 +496,54 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
     // Load additional GGML contexts
     if (n_split > 1) {
+        // make sure the main file is loaded first
         uint16_t idx = 0;
-        get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+        const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+        get_key(kv_split_no, idx);
         if (idx != 0) {
-            throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
         }

-        std::vector<char> split_prefix(llama_path_max(), 0);
-        if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) {
-            throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+        // generate list of splits if needed
+        if (splits.empty()) {
+            splits = llama_get_list_splits(fname, idx, n_split);
+        }
+
+        // in case user give a custom list of splits, check if it matches the expected number
+        if (n_split != (uint16_t)splits.size()) {
+            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
         }

         if (trace > 0) {
             LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
         }

-        std::vector<char> split_path(llama_path_max(), 0);
+        // load other splits
         for (idx = 1; idx < n_split; idx++) {
-            llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split);
+            const char * fname_split = splits[idx].c_str();

             struct gguf_init_params split_params = {
                 /*.no_alloc = */ true,
                 /*.ctx      = */ &ctx,
             };
-            gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) };
+            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
             if (!ctx_gguf) {
-                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data()));
+                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
             }

+            // check idx
+            {
+                const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+                if (kid < 0) {
+                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+                }
+                int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+                if (idx_gguf != idx) {
+                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+                }
+            }
+
-            files.emplace_back(new llama_file(split_path.data(), "rb"));
+            files.emplace_back(new llama_file(fname_split, "rb"));
             contexts.emplace_back(ctx);

             // Save tensors data offset info of the shard.
@@ -556,7 +656,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
             const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
             const std::string type_name =
                 type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]",  gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                 : gguf_type_name(type);

             std::string value = gguf_kv_to_str(meta.get(), i);
@@ -722,7 +822,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     for (const auto & file : files) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
         auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -1011,3 +1111,17 @@ bool llama_model_loader::load_all_data(

     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size   = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0,        n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size   = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
@@ -90,7 +90,12 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;

-    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
+    llama_model_loader(
+        const std::string & fname,
+        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
+        bool use_mmap,
+        bool check_tensors,
+        const struct llama_model_kv_override * param_overrides_p);

     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
@@ -155,4 +160,8 @@ struct llama_model_loader {
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
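A hypothetical caller sketch for the new constructor (file names invented for illustration): an empty splits vector lets the loader derive the remaining shard paths from the standard "-%05d-of-%05d.gguf" naming scheme, while a non-empty vector supplies explicit paths. This mirrors how llama-quant.cpp now passes an empty list.

```cpp
#include "llama-model-loader.h"

// Sketch: load a 2-shard model whose splits follow the default naming scheme.
std::vector<std::string> splits = {};  // empty => paths derived via llama_split_prefix/llama_split_path
llama_model_loader ml(
    "my-model-00001-of-00002.gguf",    // hypothetical first shard
    splits,
    /*use_mmap*/          true,
    /*check_tensors*/     false,
    /*param_overrides_p*/ nullptr);
```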
This diff is collapsed.
@@ -4,81 +4,83 @@
 #include "llama-arch.h"
 #include "llama-hparams.h"
 #include "llama-vocab.h"
-#include "llama-mmap.h"
-
-#include "ggml-cpp.h"

+#include <memory>
+#include <string>
+#include <unordered_map>
 #include <vector>
 #include <stdexcept>

+struct llama_model_loader;
+
 // available models
+// TODO: this enum does not follow the enum naming convention
 enum llm_type {
-    MODEL_UNKNOWN,
-    MODEL_14M,
-    MODEL_17M,
-    MODEL_22M,
-    MODEL_33M,
-    MODEL_60M,
-    MODEL_70M,
-    MODEL_80M,
-    MODEL_109M,
-    MODEL_137M,
-    MODEL_160M,
-    MODEL_220M,
-    MODEL_250M,
-    MODEL_270M,
-    MODEL_335M,
-    MODEL_410M,
-    MODEL_450M,
-    MODEL_770M,
-    MODEL_780M,
-    MODEL_0_5B,
-    MODEL_1B,
-    MODEL_1_3B,
-    MODEL_1_4B,
-    MODEL_1_5B,
-    MODEL_1_6B,
-    MODEL_2B,
-    MODEL_2_8B,
-    MODEL_3B,
-    MODEL_4B,
-    MODEL_6B,
-    MODEL_6_9B,
-    MODEL_7B,
-    MODEL_8B,
-    MODEL_9B,
-    MODEL_11B,
-    MODEL_12B,
-    MODEL_13B,
-    MODEL_14B,
-    MODEL_15B,
-    MODEL_16B,
-    MODEL_20B,
-    MODEL_22B,
-    MODEL_30B,
-    MODEL_32B,
-    MODEL_34B,
-    MODEL_35B,
-    MODEL_40B,
-    MODEL_65B,
-    MODEL_70B,
-    MODEL_90B,
-    MODEL_236B,
-    MODEL_314B,
-    MODEL_671B,
-    MODEL_SMALL,
-    MODEL_MEDIUM,
-    MODEL_LARGE,
-    MODEL_XL,
-    MODEL_A1_7B,
-    MODEL_A2_7B,
-    MODEL_8x7B,
-    MODEL_8x22B,
-    MODEL_16x12B,
-    MODEL_10B_128x3_66B,
-    MODEL_57B_A14B,
-    MODEL_27B,
+    LLM_TYPE_UNKNOWN,
+    LLM_TYPE_14M,
+    LLM_TYPE_17M,
+    LLM_TYPE_22M,
+    LLM_TYPE_33M,
+    LLM_TYPE_60M,
+    LLM_TYPE_70M,
+    LLM_TYPE_80M,
+    LLM_TYPE_109M,
+    LLM_TYPE_137M,
+    LLM_TYPE_160M,
+    LLM_TYPE_220M,
+    LLM_TYPE_250M,
+    LLM_TYPE_270M,
+    LLM_TYPE_335M,
+    LLM_TYPE_410M,
+    LLM_TYPE_450M,
+    LLM_TYPE_770M,
+    LLM_TYPE_780M,
+    LLM_TYPE_0_5B,
+    LLM_TYPE_1B,
+    LLM_TYPE_1_3B,
+    LLM_TYPE_1_4B,
+    LLM_TYPE_1_5B,
+    LLM_TYPE_1_6B,
+    LLM_TYPE_2B,
+    LLM_TYPE_2_8B,
+    LLM_TYPE_3B,
+    LLM_TYPE_4B,
+    LLM_TYPE_6B,
+    LLM_TYPE_6_9B,
+    LLM_TYPE_7B,
+    LLM_TYPE_8B,
+    LLM_TYPE_9B,
+    LLM_TYPE_11B,
+    LLM_TYPE_12B,
+    LLM_TYPE_13B,
+    LLM_TYPE_14B,
+    LLM_TYPE_15B,
+    LLM_TYPE_16B,
+    LLM_TYPE_20B,
+    LLM_TYPE_22B,
+    LLM_TYPE_30B,
+    LLM_TYPE_32B,
+    LLM_TYPE_34B,
+    LLM_TYPE_35B,
+    LLM_TYPE_40B,
+    LLM_TYPE_65B,
+    LLM_TYPE_70B,
+    LLM_TYPE_90B,
+    LLM_TYPE_236B,
+    LLM_TYPE_314B,
+    LLM_TYPE_671B,
+    LLM_TYPE_SMALL,
+    LLM_TYPE_MEDIUM,
+    LLM_TYPE_LARGE,
+    LLM_TYPE_XL,
+    LLM_TYPE_A1_7B,
+    LLM_TYPE_A2_7B,
+    LLM_TYPE_8x7B,
+    LLM_TYPE_8x22B,
+    LLM_TYPE_16x12B,
+    LLM_TYPE_16x3_8B,
+    LLM_TYPE_10B_128x3_66B,
+    LLM_TYPE_57B_A14B,
+    LLM_TYPE_27B,
 };

 struct llama_layer_posnet {
@@ -243,15 +245,19 @@ struct llama_layer {
     struct ggml_tensor * time_mix_lerp_v = nullptr;
     struct ggml_tensor * time_mix_lerp_r = nullptr;
     struct ggml_tensor * time_mix_lerp_g = nullptr;
+    struct ggml_tensor * time_mix_lerp_fused = nullptr;

     struct ggml_tensor * time_mix_first = nullptr;
     struct ggml_tensor * time_mix_decay = nullptr;
     struct ggml_tensor * time_mix_decay_w1 = nullptr;
     struct ggml_tensor * time_mix_decay_w2 = nullptr;
     struct ggml_tensor * time_mix_key = nullptr;
+    struct ggml_tensor * time_mix_key_b = nullptr;
     struct ggml_tensor * time_mix_value = nullptr;
+    struct ggml_tensor * time_mix_value_b = nullptr;
     struct ggml_tensor * time_mix_receptance = nullptr;
+    struct ggml_tensor * time_mix_receptance_b = nullptr;
     struct ggml_tensor * time_mix_gate = nullptr;

     struct ggml_tensor * time_mix_ln = nullptr;
     struct ggml_tensor * time_mix_ln_b = nullptr;
@@ -280,7 +286,7 @@ struct llama_layer {
     struct ggml_tensor * bskcn_tv = nullptr;

     // cross attention
     struct ggml_tensor * cross_attn_k_norm = nullptr;
     struct ggml_tensor * cross_attn_k_proj = nullptr;
     struct ggml_tensor * cross_attn_o_proj = nullptr;
@@ -296,11 +302,9 @@ struct llama_layer {
 };

 struct llama_model {
-    llm_type type = MODEL_UNKNOWN;
+    llm_type type = LLM_TYPE_UNKNOWN;
     llm_arch arch = LLM_ARCH_UNKNOWN;

-    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
     std::string name = "n/a";

     llama_hparams hparams = {};
@@ -329,117 +333,53 @@ struct llama_model {
     std::vector<llama_layer> layers;

+    llama_model_params params;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;

-    llama_split_mode split_mode;
-    int main_gpu;
-    int n_gpu_layers;
-
-    std::vector<std::string> rpc_servers;
-
     // list of devices used in this model
     std::vector<ggml_backend_dev_t> devices;

-    // lists of buffer types used for each layer
-    using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-    buft_list_t cpu_buft_list;
-    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
-
-    struct layer_dev {
-        ggml_backend_dev_t dev;
-        buft_list_t * buft_list;
-    };
-    layer_dev dev_input = {};
-    layer_dev dev_output = {};
-    std::vector<layer_dev> dev_layer;
-
-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr> ctxs;
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    // model memory mapped files
-    llama_mmaps mappings;
-
-    // objects representing data potentially being locked in memory
-    llama_mlocks mlock_bufs;
-    llama_mlocks mlock_mmaps;
-
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

     int64_t t_load_us = 0;
     int64_t t_start_us = 0;

-    // total number of parameters in the model
-    uint64_t n_elements = 0;
-
-    // total size of all the tensors in the model in bytes
-    size_t n_bytes = 0;
+    explicit llama_model(const struct llama_model_params & params);
+    ~llama_model();
+
+    void load_stats  (llama_model_loader & ml);
+    void load_arch   (llama_model_loader & ml);
+    void load_hparams(llama_model_loader & ml);
+    void load_vocab  (llama_model_loader & ml);
+    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+
+    std::string arch_name() const;
+    std::string type_name() const;
+
+    std::string desc() const;
+
+    size_t size() const;
+    size_t max_nodes() const;
+    size_t n_devices() const;
+
+    // total number of parameters in the model
+    uint64_t n_elements() const;
+
+    void print_info() const;
+
+    ggml_backend_dev_t dev_layer(int il) const;
+    ggml_backend_dev_t dev_output() const;
+
+    ggml_backend_buffer_type_t select_buft(int il) const;
+
+    const struct ggml_tensor * get_tensor(const char * name) const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };

 const char * llm_type_name(llm_type type);

-std::string llama_model_arch_name (const llama_model & model);
-std::string llama_model_type_name (const llama_model & model);
-std::string llama_model_ftype_name(const llama_model & model);
-
-template<typename F>
-bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-    ggml_init_params params = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-    ggml_context_ptr ctx { ggml_init(params) };
-    if (!ctx) {
-        throw std::runtime_error("failed to create ggml context");
-    }
-
-    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-    ggml_tensor * op_tensor = fn(ctx.get());
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op_tensor->src[i] != nullptr) {
-            op_tensor->src[i]->buffer = buf.get();
-        }
-    }
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
-    return op_supported;
-}
-
-template<typename F>
-ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (buft_supported(cur_buft, cur_dev, fn)) {
-            return cur_buft;
-        }
-    }
-    throw std::runtime_error("no suitable buffer type found");
-}
-
-size_t llama_model_max_nodes(const llama_model & model);
-
-struct llama_model_loader;
-
-// TODO: become llama_model methods
-void llm_load_stats     (llama_model_loader & ml, llama_model & model);
-void llm_load_arch      (llama_model_loader & ml, llama_model & model);
-void llm_load_hparams   (llama_model_loader & ml, llama_model & model);
-void llm_load_vocab     (llama_model_loader & ml, llama_model & model);
-void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
+// used by llama_adapter_cvec
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
+
+// used by llama_adapter_lora
+struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
@@ -7,14 +7,12 @@
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <cinttypes>
 #include <fstream>
 #include <mutex>
 #include <thread>
 #include <unordered_map>

-// TODO: replace with ggml API call
-#define QK_K 256
-
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -22,7 +20,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }

-struct quantize_state_internal {
+struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
@@ -43,13 +41,13 @@ struct quantize_state_internal {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;

-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };

-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -121,7 +119,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }

-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);

     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
             new_type = qs.params->output_tensor_type;
         } else {
-            int nx = tensor->ne[0];
-            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            const int64_t nx = tensor->ne[0];
+            const int64_t qk_k = ggml_blck_size(new_type);
+
+            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
             use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-    if (qs.model.type == MODEL_70B) {
+    if (qs.model.type == LLM_TYPE_70B) {
         // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
         // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
         // nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     //if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
     //}
     bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   ||
-        new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K    || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS  || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S  ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S   || new_type == GGML_TYPE_IQ3_S  ||
-        new_type == GGML_TYPE_IQ1_M) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+    {
+        const int64_t nx = tensor->ne[0];
+        const int64_t ny = tensor->ne[1];
+        const int64_t qk_k = ggml_blck_size(new_type);
+
+        if (nx % qk_k != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
         } else {
             ++qs.n_k_quantized;
         }
     }
+
     if (convert_incompatible_tensor) {
         switch (new_type) {
             case GGML_TYPE_TQ1_0:
@@ -410,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }

-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
         size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
@@ -464,7 +463,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }

-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
@@ -526,18 +525,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
+
+    std::vector<std::string> splits = {};
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching

-    llama_model model;
-    llm_load_arch   (ml, model);
-    llm_load_hparams(ml, model);
-    llm_load_stats  (ml, model);
+    llama_model model(llama_model_default_params());
+
+    model.load_arch   (ml);
+    model.load_hparams(ml);
+    model.load_stats  (ml);

-    struct quantize_state_internal qs(model, params);
+    struct quantize_state_impl qs(model, params);

     if (params->only_copy) {
-        ftype = model.ftype;
+        ftype = ml.ftype;
     }
     const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
     if (params->imatrix) {
@@ -621,7 +623,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

-    // sanity checks
+    // sanity checks for models that have attention layers
+    if (qs.n_attention_wv != 0)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -761,6 +764,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;

         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -839,7 +843,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+            llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }
@@ -868,7 +872,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

-                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
@@ -877,7 +881,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // update the gguf meta data as we go
         gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
@@ -921,7 +926,7 @@ uint32_t llama_model_quantize(
         const char * fname_out,
         const llama_model_quantize_params * params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
+        llama_model_quantize_impl(fname_inp, fname_out, params);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
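The switch from the hardcoded QK_K = 256 to ggml_blck_size() makes the divisibility rule follow the actual block size of the chosen type. A small sketch of that check (the helper name is illustrative, not from the patch):

```cpp
#include "ggml.h"

// Sketch: a row can only use a block-wise quantization type if its width is a
// multiple of that type's block size, queried at runtime from ggml.
static bool row_divisible_by_block(const struct ggml_tensor * t, enum ggml_type type) {
    const int64_t nx   = t->ne[0];
    const int64_t blck = ggml_blck_size(type);
    return nx % blck == 0;
}
```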
......
This diff is collapsed.
@@ -2,7 +2,9 @@
 // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?

-#include "llama-grammar.h"
+#include "llama.h"
+
+#include <vector>

 struct llama_vocab;
 struct llama_grammar;
@@ -21,24 +23,6 @@ struct llama_sampler_chain {
     mutable int32_t n_sample;
 };

-struct llama_sampler * llama_sampler_init_grammar_impl(
-        const struct llama_vocab & vocab,
-        const char * grammar_str,
-        const char * grammar_root);
-
-struct llama_sampler * llama_sampler_init_infill_impl(
-        const struct llama_vocab & vocab);
-
-struct llama_sampler * llama_sampler_init_dry_impl(
-        const struct llama_vocab & vocab,
-        int32_t context_size,
-        float dry_multiplier,
-        float dry_base,
-        int32_t dry_allowed_length,
-        int32_t dry_penalty_last_n,
-        const char ** seq_breakers,
-        size_t num_breakers);
-
 struct llama_sampler * llama_sampler_init_dry_testing(
         int32_t context_size,
         float dry_multiplier,
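With these internal _impl declarations removed, callers are expected to go through the public samplers declared in llama.h. A hedged sketch of a typical chain (exact availability of each constructor depends on the llama.h revision being vendored):

```cpp
#include "llama.h"

// Sketch: build a top-k -> temperature -> distribution sampler chain via the public API.
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

// ... during decoding: llama_sampler_sample(chain, ctx, -1); ...

llama_sampler_free(chain);
```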
......