Unverified commit c68f367e authored by Daniel Hiltgen, committed by GitHub

Update GGML to b6646 (#12245)

Notable EOLs with this change:
- MacOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
parent fdb10946
......@@ -89,9 +89,9 @@ if(CMAKE_CUDA_COMPILER)
)
endif()
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a|1200|1201):xnack[+-]$"
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(908|90a|1200|1201):xnack[+-]$"
CACHE STRING
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a|1200|1201):xnack[+-]$\"."
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(908|90a|1200|1201):xnack[+-]$\"."
)
check_language(HIP)
......@@ -100,7 +100,7 @@ if(CMAKE_HIP_COMPILER)
if(NOT AMDGPU_TARGETS)
find_package(hip REQUIRED)
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$")
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(94[012]|101[02]|1030|110[012]|120[01])$")
endif()
if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
......
......@@ -68,7 +68,7 @@
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_FLAGS": "-parallel-jobs=4",
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
"AMDGPU_TARGETS": "gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
}
}
],
......
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=e54d41befcc1575f4c898c5ff4ef43970cead75f
FETCH_HEAD=364a7a6d4a786e98947c8a90430ea581213c0ba9
.PHONY: help
help:
......
......@@ -51,14 +51,14 @@ sudo modprobe nvidia_uvm`
Ollama supports the following AMD GPUs:
### Linux Support
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` |
### Windows Support
With ROCm v6.1, the following GPUs are supported on Windows.
With ROCm v6.2, the following GPUs are supported on Windows.
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
......@@ -88,8 +88,6 @@ At this time, the known supported GPU types on linux are the following LLVM Targ
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|
| gfx900 | Radeon RX Vega 56 |
| gfx906 | Radeon Instinct MI50 |
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210 |
| gfx940 | Radeon Instinct MI300 |
......
......@@ -2,7 +2,7 @@
## System Requirements
* MacOS Monterey (v12) or newer
* MacOS Sonoma (v14) or newer
* Apple M series (CPU and GPU support) or x86 (CPU only)
......
......@@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
DoGenerate(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {
......
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "e54d41befcc1575f4c898c5ff4ef43970cead75f";
char const *LLAMA_COMMIT = "364a7a6d4a786e98947c8a90430ea581213c0ba9";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
......@@ -14,6 +14,7 @@
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
#include <ctime>
......@@ -41,6 +42,7 @@
#endif
#include <locale>
#include <windows.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#else
......@@ -49,6 +51,11 @@
#include <unistd.h>
#endif
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
......@@ -557,13 +564,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
auto detokenized = common_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
......@@ -588,13 +588,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "\n" << std::to_string(i)
<< ", token '" << detokenized << "'"
<< ", pos " << std::to_string(batch.pos[i])
......@@ -877,8 +870,20 @@ std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
} else if (std::getenv("HOME")) {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
} else {
#if defined(__linux__)
/* no $HOME is defined, fallback to getpwuid */
struct passwd *pw = getpwuid(getuid());
if ((!pw) || (!pw->pw_dir)) {
throw std::runtime_error("Failed to find $HOME directory");
}
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
......@@ -914,7 +919,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
return iparams;
}
......@@ -924,7 +930,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
llama_model_free(model);
return iparams;
}
......@@ -971,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
if (!has_eos && !has_sep && !has_rerank_prompt) {
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
......@@ -1001,7 +1006,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}
......@@ -1165,11 +1175,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold;
cparams.flash_attn_type = params.flash_attn_type;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
......@@ -1565,3 +1574,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
return result;
}
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
const lr_opt & d = *(lr_opt *) userdata;
result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
result.sgd.wd = result.adamw.wd = d.wd;
return result;
}
// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
return !
#if defined(_MSC_VER)
_stricmp
#else
strcasecmp
#endif // defined(_MSC_VER)
(a, b);
}
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
if (eq_case_insensitive("adamw", n)) {
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
}
if (eq_case_insensitive("sgd", n)) {
return GGML_OPT_OPTIMIZER_TYPE_SGD;
}
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
}
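A minimal caller-side sketch for the helper above; `optimizer_name` is a hypothetical CLI value, and `GGML_OPT_OPTIMIZER_TYPE_COUNT` is treated as the "unrecognized" sentinel exactly as the function returns it:

```cpp
// Sketch: map a user-supplied string to an optimizer type.
const char * optimizer_name = "sgd"; // hypothetical command-line value
enum ggml_opt_optimizer_type opt = common_opt_get_optimizer(optimizer_name);
if (opt == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
    LOG_ERR("unknown optimizer '%s', expected \"adamw\" or \"sgd\"\n", optimizer_name);
}
```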
// TODO simplify to use just log and exp
static float const k_log_2 = std::log(2.f);
void lr_opt::init() {
if (lr_min > 0 && lr_min < lr0) {
float nhalf = std::log(lr0 / lr_min) / k_log_2;
float e = epochs;
if (decay_epochs > 0 && decay_epochs < e) {
e = decay_epochs;
} else {
decay_epochs = e;
}
scale_epoch = nhalf / e;
}
}
float lr_opt::get_lr(float epoch) const {
float r = lr_min <= 0 ? lr0 :
epoch >= decay_epochs ? lr_min :
lr0 * std::pow(0.5f, epoch * scale_epoch);
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
return r;
}
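For reference, the schedule implemented by `lr_opt::init()` and `lr_opt::get_lr()` above can be restated in closed form (this is only a rewrite of the code, not an addition). For `lr_min > 0` and `0 <= e < decay_epochs`:

```latex
n_{\text{half}} = \log_2\!\frac{\text{lr}_0}{\text{lr}_{\min}}, \qquad
\text{scale\_epoch} = \frac{n_{\text{half}}}{\text{decay\_epochs}}, \qquad
\text{lr}(e) = \text{lr}_0 \cdot 0.5^{\,e \cdot \text{scale\_epoch}}
```

so the rate halves `n_half` times over `decay_epochs` epochs and lands exactly on `lr_min` at `e = decay_epochs`; otherwise `get_lr()` returns `lr0` (when `lr_min <= 0`) or clamps to `lr_min` (when `e >= decay_epochs`).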
......@@ -2,14 +2,17 @@
#pragma once
#include "llama-cpp.h"
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#include <cmath>
#include "ggml-opt.h"
#include "llama-cpp.h"
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
......@@ -31,6 +34,9 @@ struct common_adapter_lora_info {
std::string path;
float scale;
std::string task_name;
std::string prompt_prefix;
struct llama_adapter_lora * ptr;
};
......@@ -82,6 +88,7 @@ enum llama_example {
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_COUNT,
};
......@@ -186,10 +193,11 @@ struct common_params_sampling {
};
struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
};
struct common_params_speculative {
......@@ -202,6 +210,7 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
......@@ -234,14 +243,36 @@ struct common_params_diffusion {
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
// reasoning API response format (not to be confused as chat template's reasoning format)
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO,
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
// do not extend this enum unless you absolutely have to
// in most cases, use COMMON_REASONING_FORMAT_AUTO
// see: https://github.com/ggml-org/llama.cpp/pull/15408
};
struct lr_opt {
float lr0 = 1e-5; // learning rate at first epoch
float lr_min = -1;
float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
float scale_epoch = 0;
float wd = 0;
unsigned epochs = 2;
unsigned epoch; // set by optimizer outer (epochs) loop
// learning rate decay - constant LR per epoch only for now
float get_lr(float e) const;
float get_lr() const { return get_lr(epoch); }
// must call after arg parse, before get_lr
void init();
};
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size
......@@ -257,11 +288,10 @@ struct common_params {
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
......@@ -283,6 +313,7 @@ struct common_params {
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
struct common_params_sampling sampling;
struct common_params_speculative speculative;
......@@ -346,9 +377,8 @@ struct common_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
......@@ -376,6 +406,11 @@ struct common_params {
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// finetune
struct lr_opt lr;
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
float val_split = 0.05f; // fraction of the data used for the validation set
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
......@@ -384,11 +419,12 @@ struct common_params {
std::string cls_sep = "\t"; // separator of classification sequences
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
......@@ -409,7 +445,7 @@ struct common_params {
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
......@@ -417,7 +453,7 @@ struct common_params {
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
float slot_prompt_similarity = 0.1f;
// batched-bench params
bool is_pp_shared = false;
......@@ -698,8 +734,25 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
}
//
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
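A hedged usage sketch for the two MoE helpers above; the `{ pattern, buft }` layout of `llama_model_tensor_buft_override` and the null-entry terminator are assumptions based on how override lists are passed elsewhere, and `n_cpu_moe_blocks` is a hypothetical caller-chosen count:

```cpp
// Sketch: keep the FFN expert tensors of the first n_cpu_moe_blocks layers on the CPU
// by building per-block overrides from llm_ffn_exps_block_regex().
const int n_cpu_moe_blocks = 4;      // hypothetical
std::vector<std::string> patterns;
patterns.reserve(n_cpu_moe_blocks);  // keep c_str() pointers stable
std::vector<llama_model_tensor_buft_override> overrides;
for (int i = 0; i < n_cpu_moe_blocks; ++i) {
    patterns.push_back(llm_ffn_exps_block_regex(i));
    overrides.push_back({ patterns.back().c_str(), ggml_backend_cpu_buffer_type() });
}
overrides.push_back({ nullptr, nullptr }); // end-of-list marker (assumed convention)
// overrides.data() would then be passed via the model params' tensor_buft_overrides field
```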
//
// training utils
//
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
......@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
};
static bool is_reserved_name(const std::string & name) {
static std::unordered_set<std::string> RESERVED_NAMES;
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
std::unordered_set<std::string> s;
s.insert("root");
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
return s;
}();
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
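The rewrite above swaps the lazily-filled static set for a `static const` local initialized by an immediately-invoked lambda; since C++11 such initialization runs exactly once and is thread-safe. A small standalone illustration of the same idiom (names are hypothetical):

```cpp
#include <string>
#include <unordered_set>

static bool is_keyword(const std::string & name) {
    // Built once, on first call, under the compiler-provided guard ("magic statics").
    static const std::unordered_set<std::string> KEYWORDS = [] {
        std::unordered_set<std::string> s;
        for (const char * k : { "root", "object", "string" }) s.insert(k);
        return s;
    }();
    return KEYWORDS.count(name) > 0;
}
```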
......@@ -843,9 +844,10 @@ public:
_build_object_rule(
properties, required, name,
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
std::unordered_set<std::string> required;
std::vector<std::pair<std::string, json>> properties;
std::map<std::string, size_t> enum_values;
std::string hybrid_name = name;
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
if (comp_schema.contains("$ref")) {
......@@ -857,6 +859,14 @@ public:
required.insert(prop.key());
}
}
} else if (comp_schema.contains("enum")) {
for (const auto & v : comp_schema["enum"]) {
const auto rule = _generate_constant_rule(v);
if (enum_values.find(rule) == enum_values.end()) {
enum_values[rule] = 0;
}
enum_values[rule] += 1;
}
} else {
// todo warning
}
......@@ -870,6 +880,17 @@ public:
add_component(t, true);
}
}
if (!enum_values.empty()) {
std::vector<std::string> enum_intersection;
for (const auto & p : enum_values) {
if (p.second == schema["allOf"].size()) {
enum_intersection.push_back(p.first);
}
}
if (!enum_intersection.empty()) {
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
......
......@@ -4,17 +4,52 @@
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
#if defined(_WIN32)
# include <io.h>
# include <windows.h>
# define isatty _isatty
# define fileno _fileno
#else
# include <unistd.h>
#endif // defined(_WIN32)
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
// Auto-detect if colors should be enabled based on terminal and environment
static bool common_log_should_use_colors_auto() {
// Check NO_COLOR environment variable (https://no-color.org/)
if (const char * no_color = std::getenv("NO_COLOR")) {
if (no_color[0] != '\0') {
return false;
}
}
// Check TERM environment variable
if (const char * term = std::getenv("TERM")) {
if (std::strcmp(term, "dumb") == 0) {
return false;
}
}
// Check if stdout and stderr are connected to a terminal
// We check both because log messages can go to either
bool stdout_is_tty = isatty(fileno(stdout));
bool stderr_is_tty = isatty(fileno(stderr));
return stdout_is_tty || stderr_is_tty;
}
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
......@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
struct common_log * common_log_main() {
static struct common_log log;
static std::once_flag init_flag;
std::call_once(init_flag, [&]() {
// Set default to auto-detect colors
log.set_colors(common_log_should_use_colors_auto());
});
return &log;
}
......@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
void common_log_set_colors(struct common_log * log, log_colors colors) {
if (colors == LOG_COLORS_AUTO) {
log->set_colors(common_log_should_use_colors_auto());
return;
}
if (colors == LOG_COLORS_DISABLED) {
log->set_colors(false);
return;
}
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
log->set_colors(true);
}
void common_log_set_prefix(struct common_log * log, bool prefix) {
......
......@@ -24,6 +24,12 @@
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
enum log_colors {
LOG_COLORS_AUTO = -1,
LOG_COLORS_DISABLED = 0,
LOG_COLORS_ENABLED = 1,
};
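A short usage sketch for the new enum, assuming the `common_log_main()` / `common_log_set_colors()` entry points shown elsewhere in this change; `LOG_COLORS_AUTO` defers to the NO_COLOR / TERM / isatty heuristic added in log.cpp:

```cpp
// Sketch: honor an (assumed) --color=auto|on|off choice.
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);        // auto-detect (default behavior)
// common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);  // force colors on
// common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED); // force colors off
```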
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity()
extern int common_log_verbosity_thold;
......@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
......
......@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
if (ctx) {
llama_perf_context_print(ctx);
llama_memory_breakdown_print(ctx);
}
}
......@@ -426,8 +427,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
// helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p;
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {
// remember the selected token before sorting
const llama_token id = res->data[res->selected].id;
std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.p > b.p;
});
// restore the selected token after sorting
for (size_t i = 0; i < res->size; ++i) {
if (res->data[i].id == id) {
res->selected = i;
break;
}
}
res->sorted = true;
}
return res;
}
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
......
......@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
// the .sorted flag of the result indicates whether the returned candidates are sorted
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
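A consumer-side sketch based only on the comment above and the `llama_token_data_array` fields visible in this change (`data`, `size`, `selected`, `sorted`); `gsmpl` is assumed to be an initialized `common_sampler`:

```cpp
// Sketch: inspect the top candidates after sampling; do_sort=true guarantees descending probability order.
llama_token_data_array * cur_p = common_sampler_get_candidates(gsmpl, /*do_sort=*/true);
for (size_t i = 0; i < cur_p->size && i < 5; ++i) {
    LOG_INF("cand %zu: token=%d p=%.4f%s\n",
            i, cur_p->data[i].id, cur_p->data[i].p,
            (int64_t) i == cur_p->selected ? " (selected)" : "");
}
```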
// get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl);
......
......@@ -64,8 +64,6 @@ extern "C" {
typedef struct llama_memory_i * llama_memory_t;
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
typedef int32_t llama_pos;
typedef int32_t llama_token;
typedef int32_t llama_seq_id;
......@@ -181,6 +179,14 @@ extern "C" {
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
enum llama_flash_attn_type {
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
};
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
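A hedged sketch of how a caller might use the new enum and name helper; `llama_context_default_params()` is the existing default-initializer from this header:

```cpp
// Sketch: request automatic Flash Attention selection when creating a context.
llama_context_params cparams = llama_context_default_params();
cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // let the implementation decide per model/device
printf("flash attention: %s\n", llama_flash_attn_type_name(cparams.flash_attn_type));
```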
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
......@@ -200,7 +206,7 @@ extern "C" {
llama_token_data * data;
size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
bool sorted;
bool sorted; // note: do not assume the data is sorted - always check this flag
} llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void * user_data);
......@@ -305,6 +311,7 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
......@@ -314,7 +321,7 @@ extern "C" {
float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
......@@ -331,7 +338,6 @@ extern "C" {
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // use flash attention [EXPERIMENTAL]
bool no_perf; // measure performance timings
bool op_offload; // offload host tensor operations to device
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
......@@ -469,8 +475,6 @@ extern "C" {
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
......@@ -557,10 +561,32 @@ extern "C" {
struct llama_model * model,
const char * path_lora);
// Functions to access the adapter's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
// Get the number of metadata key/value pairs
LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
// Get metadata key name by index
LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Get metadata value as a string by index
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
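A hedged walk-through of the new metadata and aLoRA accessors declared above, assuming `adapter` is a valid `llama_adapter_lora *` returned by `llama_adapter_lora_init()`:

```cpp
// Sketch: dump all adapter metadata key/value pairs, then query the aLoRA invocation tokens.
char key[128];
char val[256];
const int32_t n_kv = llama_adapter_meta_count(adapter);
for (int32_t i = 0; i < n_kv; ++i) {
    if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) >= 0 &&
        llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) >= 0) {
        printf("%s = %s\n", key, val);
    }
}

const uint64_t      n_inv = llama_adapter_get_alora_n_invocation_tokens(adapter);
const llama_token * inv   = llama_adapter_get_alora_invocation_tokens(adapter);
for (uint64_t i = 0; i < n_inv; ++i) {
    printf("invocation token[%llu] = %d\n", (unsigned long long) i, inv[i]);
}
```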
// The following functions operate on a llama_context, hence the naming: llama_verb_...
// Add a loaded LoRA adapter to given context
......@@ -667,111 +693,6 @@ extern "C" {
// Check if the memory supports shifting
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
//
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
//
// Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
// Clear the KV cache - both cell info is erased and KV data is zeroed
DEPRECATED(LLAMA_API void llama_kv_self_clear(
struct llama_context * ctx),
"Use llama_memory_clear() instead");
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
// seq_id < 0 : match any sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1),
"Use llama_memory_seq_rm() instead");
// Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1),
"Use llama_memory_seq_cp() instead");
// Removes all tokens that do not belong to the specified sequence
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_keep() instead");
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta),
"Use llama_memory_seq_add() instead");
// Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d),
"Use llama_memory_seq_div() instead");
// Returns the smallest position present in the KV cache for the specified sequence
// This is typically non-zero only for SWA caches
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
// Return -1 if the sequence is empty
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_pos_min() instead");
// Returns the largest position present in the KV cache for the specified sequence
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
// Return -1 if the sequence is empty
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_pos_max() instead");
// Defragment the KV cache
// This will be applied:
// - lazily on next llama_decode()
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
// Check if the context supports KV cache shifting
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
"use llama_memory_can_shift() instead");
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
"simply remove this call, updates are applied lazily on the next llama_decode()");
//
// State / sessions
//
......@@ -870,6 +791,29 @@ extern "C" {
size_t n_token_capacity,
size_t * n_token_count_out);
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
typedef uint32_t llama_state_seq_flags;
LLAMA_API size_t llama_state_seq_get_size_ext(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_state_seq_flags flags);
LLAMA_API size_t llama_state_seq_get_data_ext(
struct llama_context * ctx,
uint8_t * dst,
size_t size,
llama_seq_id seq_id,
llama_state_seq_flags flags);
LLAMA_API size_t llama_state_seq_set_data_ext(
struct llama_context * ctx,
const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id,
llama_state_seq_flags flags);
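A hedged sketch of the new `_ext` variants, restricting the snapshot to the SWA portion of the sequence state via the flag defined above; `ctx`, `seq_id`, and `dest_seq_id` are assumed to already exist, and `<vector>` is assumed to be included:

```cpp
// Sketch: snapshot one sequence's (SWA-only) state and restore it into another sequence.
const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

const size_t n_bytes = llama_state_seq_get_size_ext(ctx, seq_id, flags);
std::vector<uint8_t> buf(n_bytes);

const size_t n_written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
const size_t n_read    = llama_state_seq_set_data_ext(ctx, buf.data(), n_written, dest_seq_id, flags);
if (n_read == 0) {
    // restore failed (e.g. incompatible context); handle as needed
}
```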
//
// Decoding
//
......@@ -1216,11 +1160,6 @@ extern "C" {
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
......@@ -1390,24 +1329,25 @@ extern "C" {
//
// Performance utils
//
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
// NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
//
struct llama_perf_context_data {
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_p_eval;
int32_t n_eval;
int32_t n_reused; // number of times a ggml compute graph had been reused
// ms == milliseconds
double t_start_ms; // absolute start time
double t_load_ms; // time needed for loading the model
double t_p_eval_ms; // time needed for processing the prompt
double t_eval_ms; // time needed for generating tokens
int32_t n_p_eval; // number of prompt tokens
int32_t n_eval; // number of generated tokens
int32_t n_reused; // number of times a ggml compute graph had been reused
};
struct llama_perf_sampler_data {
double t_sample_ms;
double t_sample_ms; // time needed for sampling in ms
int32_t n_sample;
int32_t n_sample; // number of sampled tokens
};
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
......@@ -1419,6 +1359,9 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
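Given the field comments added above, a hedged example of turning the raw counters into throughput numbers (assuming `ctx` is a valid context):

```cpp
// Sketch: derive prompt-processing and generation speed from llama_perf_context_data.
const llama_perf_context_data pd = llama_perf_context(ctx);
const double pp_tps = pd.t_p_eval_ms > 0.0 ? 1e3 * pd.n_p_eval / pd.t_p_eval_ms : 0.0; // prompt tokens/s
const double tg_tps = pd.t_eval_ms   > 0.0 ? 1e3 * pd.n_eval   / pd.t_eval_ms   : 0.0; // generated tokens/s
printf("prompt: %d tokens (%.2f t/s), generation: %d tokens (%.2f t/s), graph reuses: %d\n",
       pd.n_p_eval, pp_tps, pd.n_eval, tg_tps, pd.n_reused);
llama_memory_breakdown_print(ctx); // new in this change: per-device memory breakdown via LLAMA_LOG
```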
//
// training
//
......@@ -1437,6 +1380,8 @@ extern "C" {
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
enum ggml_opt_optimizer_type optimizer_type;
};
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
......
......@@ -6,6 +6,7 @@
#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>
// vec
......@@ -163,13 +164,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
// check metadata
{
const gguf_context * gguf_ctx = ctx_gguf.get();
LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
gguf_type type = gguf_get_kv_type(gguf_ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
: gguf_type_name(type);
const char * name = gguf_get_key(gguf_ctx, i);
const std::string value = gguf_kv_to_str(gguf_ctx, i);
if (type != GGUF_TYPE_ARRAY) {
adapter.gguf_kv.emplace(name, value);
}
const size_t MAX_VALUE_LEN = 40;
std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
replace_all(print_value, "\n", "\\n");
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
}
auto get_kv_str = [&](const std::string & key) -> std::string {
int id = gguf_find_key(ctx_gguf.get(), key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
};
auto get_kv_f32 = [&](const std::string & key) -> float {
int id = gguf_find_key(ctx_gguf.get(), key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
};
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
......@@ -190,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}
adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
// parse alora invocation sequence vector
const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
if (kid >= 0) {
if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
throw std::runtime_error("invalid gguf type for " + key);
}
const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
if (arr_type != GGUF_TYPE_UINT32) {
throw std::runtime_error("invalid gguf element type for " + key);
}
const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
adapter.alora_invocation_tokens.resize(seq_len);
std::copy(
(const llama_token *)data,
(const llama_token *)data + seq_len,
adapter.alora_invocation_tokens.begin());
}
}
int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
......@@ -383,6 +429,57 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
return nullptr;
}
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
const auto & it = adapter->gguf_kv.find(key);
if (it == adapter->gguf_kv.end()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
return (int)adapter->gguf_kv.size();
}
int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->first.c_str());
}
int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter;
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
if (!adapter) {
return 0;
}
return adapter->alora_invocation_tokens.size();
}
const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
GGML_ASSERT(adapter);
return adapter->alora_invocation_tokens.data();
}
......@@ -67,6 +67,12 @@ struct llama_adapter_lora {
float alpha;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora() = default;
~llama_adapter_lora() = default;
......
......@@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
......@@ -44,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_GEMMA3N, "gemma3n" },
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_MAMBA2, "mamba2" },
......@@ -68,6 +70,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" },
{ LLM_ARCH_RWKV6, "rwkv6" },
......@@ -94,6 +97,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_LLADA_MOE, "llada-moe" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_GROVEMOE, "grovemoe" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
......@@ -121,6 +127,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
{ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
{ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
......@@ -129,12 +136,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
......@@ -165,20 +176,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
{ LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
......@@ -235,8 +252,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
{ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
{ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
......@@ -392,12 +412,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
},
......@@ -576,6 +600,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_CLS, "cls" },
},
},
{
LLM_ARCH_JINA_BERT_V3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
},
},
{
LLM_ARCH_BLOOM,
{
......@@ -689,6 +727,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_CLS_OUT, "cls.output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
......@@ -1021,6 +1060,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
},
},
{
LLM_ARCH_GEMMA_EMBEDDING,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_STARCODER2,
{
......@@ -1534,6 +1594,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NEMOTRON_H,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// mamba(2) ssm layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
// attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
// dense FFN
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_EXAONE,
{
......@@ -2030,6 +2115,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
}
},
{
......@@ -2087,6 +2173,66 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LLADA_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_SEED_OSS,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GROVEMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
{ LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
},
},
{
LLM_ARCH_UNKNOWN,
{
......@@ -2219,6 +2365,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// altup / laurel (gemma 3n)
{LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
......@@ -2340,6 +2489,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
case LLM_ARCH_NEMOTRON_H:
return true;
default:
return false;
......@@ -2350,6 +2500,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
return true;
default:
return false;
......
......@@ -26,6 +26,7 @@ enum llm_arch {
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_JINA_BERT_V3,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
......@@ -48,6 +49,7 @@ enum llm_arch {
LLM_ARCH_GEMMA2,
LLM_ARCH_GEMMA3,
LLM_ARCH_GEMMA3N,
LLM_ARCH_GEMMA_EMBEDDING,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_MAMBA2,
......@@ -72,6 +74,7 @@ enum llm_arch {
LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
LLM_ARCH_RWKV6,
......@@ -98,6 +101,9 @@ enum llm_arch {
LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_LLADA_MOE,
LLM_ARCH_SEED_OSS,
LLM_ARCH_GROVEMOE,
LLM_ARCH_UNKNOWN,
};
......@@ -125,6 +131,7 @@ enum llm_kv {
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
......@@ -133,12 +140,16 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_DECODER_BLOCK_COUNT,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_SWIN_NORM,
LLM_KV_RESCALE_EVERY_N_LAYERS,
......@@ -169,6 +180,8 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
......@@ -183,6 +196,10 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
......@@ -231,6 +248,9 @@ enum llm_kv {
LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
LLM_KV_ADAPTER_LORA_TASK_NAME,
LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
LLM_KV_POSNET_EMBEDDING_LENGTH,
LLM_KV_POSNET_BLOCK_COUNT,
......@@ -287,6 +307,9 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_DOWN_CHEXPS,
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
......
......@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
if (sequential && has_cpl) {
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
return {};
}
......