* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* The tag is optional and defaults to "latest" (meaning it checks for Q4_K_M first, then Q4, and if neither is found, returns the first GGUF file in the repo)
*
* Returns a pair of <repo, file> (with the tag already stripped from "repo")
*
* Note: we use the Ollama-compatible HF API, but we do not use the blobId. Instead, we use the special "ggufFile" field, which returns the value for "hf_file". This keeps backward compatibility with existing cache files.
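As a minimal sketch of the repo:tag split described above (the helper name split_hf_repo_tag is hypothetical, and resolving "latest" to an actual file is left to the HF API call rather than done locally):

#include <string>
#include <utility>

// Hypothetical helper, shown only to illustrate the repo:tag convention above;
// the real code also resolves "latest" against the repo's file list via the HF API.
static std::pair<std::string, std::string> split_hf_repo_tag(const std::string & hf_repo_with_tag) {
    const size_t colon = hf_repo_with_tag.find(':');
    if (colon == std::string::npos) {
        return { hf_repo_with_tag, "latest" };
    }
    return { hf_repo_with_tag.substr(0, colon), hf_repo_with_tag.substr(colon + 1) };
}

// split_hf_repo_tag("bartowski/Llama-3.2-3B-Instruct-GGUF:q4_k_m")
//   -> { "bartowski/Llama-3.2-3B-Instruct-GGUF", "q4_k_m" }
// split_hf_repo_tag("bartowski/Llama-3.2-3B-Instruct-GGUF")
//   -> { "bartowski/Llama-3.2-3B-Instruct-GGUF", "latest" }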
    std::string model_url = ""; // model url to download // NOLINT
    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
};
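To illustrate what COMMON_REASONING_FORMAT_DEEPSEEK implies, here is a rough sketch of pulling a DeepSeek-style <think>...</think> block out of the message content; the helper and the exact tag handling are assumptions for illustration, not the parser used by the server:

#include <string>

// Illustrative only: extract the contents of a <think>...</think> block from
// `content` and return it; `content` is left with the block removed.
static std::string extract_reasoning_content(std::string & content) {
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const size_t beg = content.find(open);
    if (beg == std::string::npos) {
        return "";
    }
    const size_t end = content.find(close, beg + open.size());
    if (end == std::string::npos) {
        return "";
    }
    std::string reasoning = content.substr(beg + open.size(), end - (beg + open.size()));
    content.erase(beg, end + close.size() - beg);
    return reasoning; // would be returned as `message.reasoning_content`
}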
struct common_params {
...
...
@@ -240,14 +268,13 @@ struct common_params {
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits // NOLINT
    std::string rpc_servers          = ""; // comma separated list of RPC servers // NOLINT
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
-   bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-   std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+   bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+   std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
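As a rough illustration of the RGBRGBRGB... layout required above (the receiving clip/llava function is not shown here, so no particular entry point is assumed):

#include <cstdint>
#include <vector>

// Pack already-decoded pixels into an interleaved RGB buffer of exactly 3*nx*ny bytes.
// get_pixel(x, y, c) is a stand-in for however the decoding library exposes
// channel c (0 = R, 1 = G, 2 = B) of the pixel at (x, y).
std::vector<uint8_t> pack_rgb(int nx, int ny, uint8_t (*get_pixel)(int x, int y, int c)) {
    std::vector<uint8_t> buf(3 * (size_t) nx * ny);
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const size_t i = 3 * ((size_t) y * nx + x);
            buf[i + 0] = get_pixel(x, y, 0); // R
            buf[i + 1] = get_pixel(x, y, 1); // G
            buf[i + 2] = get_pixel(x, y, 2); // B
        }
    }
    return buf;
}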
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n",__func__,n_image_embd,n_llama_embd);
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize(
-       const struct llama_model * model,
+       const struct llama_vocab * vocab,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
...
...
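A hedged usage sketch of the vocab-based llama_tokenize; the elided trailing parameters are assumed to be the maximum token count plus the add_special/parse_special flags, and a negative return value is assumed to report the required buffer size:

#include <string>
#include <vector>
#include "llama.h"

// Sketch: tokenize `text` using the vocab obtained from an already-loaded model.
std::vector<llama_token> tokenize_text(const llama_model * model, const std::string & text) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // upper bound: every byte can become a token, plus room for BOS/EOS
    std::vector<llama_token> tokens(text.size() + 2);
    int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_special=*/true, /*parse_special=*/false);
    if (n < 0) {
        // buffer was too small: -n is the required size, retry once
        tokens.resize(-n);
        n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_special=*/true, /*parse_special=*/false);
    }
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}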
@@ -972,7 +1019,7 @@ extern "C" {
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece(
-       const struct llama_model * model,
+       const struct llama_vocab * vocab,
        llama_token token,
        char * buf,
        int32_t length,
...
...
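A similar sketch for llama_token_to_piece, assuming the elided trailing parameters are the lstrip count and the special flag described above:

#include <string>
#include "llama.h"

// Sketch: convert one token id to its text piece. A negative return value is
// assumed to mean the buffer was too small (its magnitude being the required size).
std::string token_to_string(const llama_vocab * vocab, llama_token token) {
    std::string piece(32, '\0');
    int32_t n = llama_token_to_piece(vocab, token, piece.data(), (int32_t) piece.size(),
                                     /*lstrip=*/0, /*special=*/true);
    if (n < 0) {
        piece.resize(-n);
        n = llama_token_to_piece(vocab, token, piece.data(), (int32_t) piece.size(),
                                 /*lstrip=*/0, /*special=*/true);
    }
    piece.resize(n > 0 ? n : 0);
    return piece;
}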
@@ -986,7 +1033,7 @@ extern "C" {
/// @param remove_special Allows removing BOS and EOS tokens if the model is configured to do so.
/// @param unparse_special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_detokenize(
-       const struct llama_model * model,
+       const struct llama_vocab * vocab,
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
...
...
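And a companion sketch for llama_detokenize, assuming the elided trailing parameters are the output capacity plus the remove_special/unparse_special flags:

#include <string>
#include <vector>
#include "llama.h"

// Sketch: detokenize a token sequence into a std::string, growing the buffer
// once if the first attempt reports (as a negative value) that it was too small.
std::string detokenize_all(const llama_vocab * vocab, const std::vector<llama_token> & tokens) {
    std::string text(tokens.size() * 4, '\0'); // rough initial guess
    int32_t n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                                 text.data(), (int32_t) text.size(),
                                 /*remove_special=*/false, /*unparse_special=*/false);
    if (n < 0) {
        text.resize(-n);
        n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                             text.data(), (int32_t) text.size(),
                             /*remove_special=*/false, /*unparse_special=*/false);
    }
    text.resize(n > 0 ? n : 0);
    return text;
}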
@@ -1000,7 +1047,7 @@ extern "C" {
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
...
...
@@ -1009,7 +1056,6 @@ extern "C" {
/// @param length The size of the allocated buffer
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
LLAMA_API int32_t llama_chat_apply_template(
-       const struct llama_model * model,
        const char * tmpl,
        const struct llama_chat_message * chat,
        size_t n_msg,
...
...
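A hedged sketch of the model-free llama_chat_apply_template call; the elided trailing parameters are assumed to be add_ass plus the output buffer and its length, and the re-alloc-and-retry pattern follows the @return note above. The "chatml" template name is only an example value:

#include <string>
#include <vector>
#include "llama.h"

// Sketch: format a short chat with a named built-in template and grow the
// buffer once if the formatted prompt did not fit.
std::string format_chat_prompt() {
    const std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };

    std::vector<char> buf(1024);
    int32_t n = llama_chat_apply_template("chatml", chat.data(), chat.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {
        buf.resize(n);
        n = llama_chat_apply_template("chatml", chat.data(), chat.size(),
                                      /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    }
    return n > 0 ? std::string(buf.data(), n) : std::string();
}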
@@ -1057,7 +1103,6 @@ extern "C" {
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
...
...
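Tying the sampler notes together, a hedged sketch of a small sampler chain ending in Mirostat 1.0; the (n_vocab, seed, tau, eta, m) parameter order of llama_sampler_init_mirostat is an assumption to verify against the header:

#include "llama.h"

// Sketch: a sampler chain with a temperature stage followed by Mirostat 1.0
// (tau = target surprise, eta = learning rate).
struct llama_sampler * make_mirostat_chain(const llama_vocab * vocab) {
    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    struct llama_sampler * chain = llama_sampler_chain_init(sparams);

    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_mirostat(
            llama_vocab_n_tokens(vocab), /*seed=*/1234, /*tau=*/5.0f, /*eta=*/0.1f, /*m=*/100));

    // sample with:  llama_token id = llama_sampler_sample(chain, ctx, -1);
    // release with: llama_sampler_free(chain);
    return chain;
}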
@@ -1170,7 +1231,8 @@ extern "C" {
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982