LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
// implementation of 2D RoPE without adding a new op in ggml
// this is not efficient (it uses double the memory), but it works on all backends
// TODO: there was a more efficient implementation relying on ggml_view and ggml_rope_ext_inplace,
//       but the in-place rope does not work well with non-contiguous tensors; we should fix that
//       and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
static ggml_tensor * build_rope_2d(
    ggml_context * ctx0,
    ggml_tensor  * cur,
    ggml_tensor  * pos_h,
    ggml_tensor  * pos_w,
    const float    freq_base
) {
    const int64_t n_dim  = cur->ne[0];
    const int64_t n_head = cur->ne[1];
    const int64_t n_pos  = cur->ne[2];
    // for example, if cur has shape (n_dim=8, n_head, n_pos) and freq_base = 10000,
    // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
    // the first half of cur will use 1e-0, 1e-2 (the even-indexed inv_freq)
    // the second half of cur will use 1e-1, 1e-3 (the odd-indexed inv_freq)
    // the trick here is to rotate just half of n_dim, so the resulting inv_freq list is automatically the even-indexed one
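
    // what follows is a hedged sketch of the rest of the body, reconstructed from
    // the comments above; the rope mode and the beta/ext constants, and which half
    // pairs with pos_h vs pos_w, are assumptions rather than verified upstream code
    // (requires <cmath> for std::pow)

    // rotating only n_dim/2 dims turns the exponent -2i/(n_dim/2) into -2(2i)/n_dim,
    // i.e. exactly the even-indexed inv_freq; scaling the base for the second half
    // by freq_base^(-2/n_dim) then shifts those to the odd-indexed ones
    const float freq_scale_odd = std::pow(freq_base, (float) -2 / n_dim);

    // first half of n_dim: view rows [0, n_dim/2), rotate with pos_h
    ggml_tensor * first = ggml_view_3d(ctx0, cur,
        n_dim/2, n_head, n_pos,
        ggml_row_size(cur->type, n_dim),
        ggml_row_size(cur->type, n_dim * n_head),
        0);
    first = ggml_rope_ext(
        ctx0, first,
        pos_h,    // positions
        nullptr,  // freq factors
        n_dim/2,  // n_dims
        0, 0, freq_base,
        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

    // second half of n_dim: view rows [n_dim/2, n_dim), rotate with pos_w
    ggml_tensor * second = ggml_view_3d(ctx0, cur,
        n_dim/2, n_head, n_pos,
        ggml_row_size(cur->type, n_dim),
        ggml_row_size(cur->type, n_dim * n_head),
        n_dim/2 * ggml_element_size(cur));
    // ggml_rope_ext does not play well with non-contiguous views (see the TODO above),
    // so copy first -- this is where the "double the memory" cost comes from
    second = ggml_cont(ctx0, second);
    second = ggml_rope_ext(
        ctx0, second,
        pos_w,           // positions
        nullptr,         // freq factors
        n_dim/2,         // n_dims
        0, 0, freq_base,
        freq_scale_odd,  // shift inv_freq to the odd-indexed values
        0.0f, 1.0f, 0.0f, 0.0f);

    return ggml_concat(ctx0, first, second, 0);
}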
// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
// res_imgs memory is allocated here; any previous allocation will be freed first
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32,
    clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
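
// a hedged sketch of how the per-image slices could be packed into the batched
// tensor above; image_embd_v (per-image float buffers of matching size) is a
// hypothetical name for illustration (requires <cstring> for memcpy)
const size_t n_embd   = clip_n_mmproj_embd(ctx_clip);
const size_t n_tokens = clip_n_output_tokens(ctx_clip, img_input);
const size_t slice_sz = n_embd * n_tokens * sizeof(float); // bytes per image slice
for (int i = 0; i < num_images - 1; i++) {
    // F32 tensor data is contiguous, so slice i starts at byte offset i * slice_sz
    memcpy((char *) image_features->data + i * slice_sz, image_embd_v[i], slice_sz);
}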