* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
 * Tag is optional; it defaults to "latest" (meaning it checks for Q4_K_M first, then Q4, then, if neither is found, returns the first GGUF file in the repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
// Only load the LoRA adapter(s) into memory without applying them to the context;
// the user can apply them later via llama_adapter_lora_apply().
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
...
...
@@ -325,13 +333,15 @@ struct common_params {
boolwarmup=true;// warmup run
boolcheck_tensors=false;// validate tensor data
boolsingle_turn=false;// single turn chat conversation
ggml_typecache_type_k=GGML_TYPE_F16;// KV cache data type for the K
ggml_typecache_type_v=GGML_TYPE_F16;// KV cache data type for the V
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "clip-impl.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
...
...
@@ -46,18 +28,6 @@
#include <cinttypes>
#include <limits>
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
@@ -1859,14 +1759,15 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
}
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
@@ -2189,88 +2089,64 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
std::vector<clip_image_u8*>patches=divide_to_patches_u8(*temp,params.image_size);// prepare spatial sorted main patches of image_size each (336 in llava-1.6)
std::vector<clip_image_u8_ptr>patches=divide_to_patches_u8(*temp,params.image_size);// prepare spatial sorted main patches of image_size each (336 in llava-1.6)
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
// use for accessing underlay data of clip_image_f32_batch
CLIP_APIsize_tclip_image_f32_batch_n_images(conststructclip_image_f32_batch*batch);// equivalent to batch->size()
CLIP_APIsize_tclip_image_f32_batch_nx(conststructclip_image_f32_batch*batch,intidx);// equivalent to batch[idx]->nx
CLIP_APIsize_tclip_image_f32_batch_ny(conststructclip_image_f32_batch*batch,intidx);// equivalent to batch[idx]->ny
CLIP_APIstructclip_image_f32*clip_image_f32_get_img(conststructclip_image_f32_batch*batch,intidx);// equivalent to batch[idx]->data
/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
constboolencoded=clip_image_encode(ctx_clip,n_threads,&img_res_v.data[i],image_embd_v[i]);// image data is in 3x336x336 format and will be converted to 336x336x3 inside
constboolencoded=clip_image_encode(ctx_clip,n_threads,img_res,image_embd_v[i]);// image data is in 3x336x336 format and will be converted to 336x336x3 inside
if(!encoded){
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n",(int)i+1,(int)img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n",(int)i+1,(int)n_imgs);
returnfalse;
}
}
constint64_tt_img_enc_batch_us=ggml_time_us();
LOG_INF("%s: %d segments encoded in %8.2f ms\n",__func__,(int)img_res_v.size,(t_img_enc_batch_us-t_img_enc_start_us)/1000.0);
LOG_INF("%s: %d segments encoded in %8.2f ms\n",__func__,(int)n_imgs,(t_img_enc_batch_us-t_img_enc_start_us)/1000.0);
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
/// @param vocab The vocabulary that this grammar will be used with.
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
/// @param grammar_root The name of the start symbol for the grammar.
/// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
/// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
/// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.