Unverified Commit 0cf7794b authored by Daniel Hiltgen, committed by GitHub

ggml update to b7108 (#12992)

* Revert "vulkan: temporary cary of vulkan fixes (#12971)"

This reverts commit 3a9e8e9f.

* ggml update to b7087

* fix argsort on metal

* update to b7108

* fix bakllava regression

This model lacks the metadata for the projector type.

* update to b7209

* fix TopK perf

* only build arm code on arm
parent 854d40ed
......@@ -39,6 +39,7 @@
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
......@@ -63,6 +64,7 @@
#define TN_PATCH_EMBD "v.patch_embd.weight" // tensor not renamed with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
......@@ -93,6 +95,9 @@
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack
// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
......@@ -116,6 +121,14 @@
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
// cogvlm
#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s"
#define TN_MM_H_TO_4H "mm.up.%s"
#define TN_MM_GATE "mm.gate.%s"
#define TN_MM_4H_TO_H "mm.down.%s"
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
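// e.g. CLIP_ALIGN(900, 14) == 910 and CLIP_ALIGN(910, 14) == 910 (illustrative values)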
......@@ -127,6 +140,7 @@ enum projector_type {
PROJECTOR_TYPE_MINICPMV,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
......@@ -139,6 +153,9 @@ enum projector_type {
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_UNKNOWN,
};
......@@ -150,6 +167,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
......@@ -161,6 +179,9 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
......@@ -203,7 +224,6 @@ static void clip_log_callback_default(enum ggml_log_level level, const char * te
}
struct clip_logger_state {
ggml_log_level verbosity_thold;
ggml_log_callback log_callback;
void * log_callback_user_data;
};
......@@ -237,17 +257,11 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
va_end(args);
}
#define LOG_TMPL(level, ...) \
do { \
if ((level) >= g_logger_state.verbosity_thold) { \
clip_log_internal((level), __VA_ARGS__); \
} \
} while (0)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
//
// cpp wrappers
......
......@@ -6,7 +6,6 @@
#include "clip-impl.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
......@@ -17,15 +16,12 @@
#include <cstring>
#include <fstream>
#include <map>
#include <regex>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <sstream>
#include <cinttypes>
#include <limits>
#include <array>
#include <numeric>
#include <functional>
#if defined(_WIN32)
......@@ -41,7 +37,7 @@
#endif
#endif
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
enum ffn_op_type {
FFN_GELU,
......@@ -176,16 +172,18 @@ enum patch_merge_type {
};
struct clip_hparams {
int32_t image_size;
int32_t patch_size;
int32_t n_embd;
int32_t n_ff;
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
int32_t image_size = 0;
int32_t patch_size = 0;
int32_t n_embd = 0;
int32_t n_ff = 0;
int32_t projection_dim = 0;
int32_t n_head = 0;
int32_t n_layer = 0;
// idefics3
int32_t preproc_image_size = 0;
int32_t proj_scale_factor = 0;
int32_t image_longest_edge = 0;
int32_t image_min_pixels = -1;
int32_t image_max_pixels = -1;
int32_t n_merge = 0; // number of patch merges **per-side**
float image_mean[3];
float image_std[3];
......@@ -207,7 +205,6 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
int32_t spatial_merge_size = 0;
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
......@@ -217,6 +214,26 @@ struct clip_hparams {
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
// custom values provided by the user; remain -1 if not set
int32_t custom_image_min_tokens = -1;
int32_t custom_image_max_tokens = -1;
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
const int cur_merge = n_merge == 0 ? 1 : n_merge;
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
}
void set_warmup_n_tokens(int n_tokens) {
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
const int cur_merge = n_merge == 0 ? 1 : n_merge;
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
// TODO: support warmup size for custom token numbers
}
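// worked example (illustrative values, not from any specific model): with
// patch_size = 14 and n_merge = 2, one token covers patch_area = 14*14*2*2 = 784
// pixels, so set_limit_image_tokens(8, 1024) yields image_min_pixels = 8*784 = 6272
// and image_max_pixels = 1024*784 = 802816, and set_warmup_n_tokens(256) gives
// warmup_image_size = 16 * 14 * 2 = 448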
};
struct clip_layer {
......@@ -227,6 +244,8 @@ struct clip_layer {
ggml_tensor * q_b = nullptr;
ggml_tensor * v_w = nullptr;
ggml_tensor * v_b = nullptr;
ggml_tensor * qkv_w = nullptr;
ggml_tensor * qkv_b = nullptr;
ggml_tensor * o_w = nullptr;
ggml_tensor * o_b = nullptr;
......@@ -252,6 +271,18 @@ struct clip_layer {
// layer scale (no bias)
ggml_tensor * ls_1_w = nullptr;
ggml_tensor * ls_2_w = nullptr;
// qwen3vl deepstack merger
ggml_tensor * deepstack_norm_w = nullptr;
ggml_tensor * deepstack_norm_b = nullptr;
ggml_tensor * deepstack_fc1_w = nullptr;
ggml_tensor * deepstack_fc1_b = nullptr;
ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr;
bool has_deepstack() const {
return deepstack_fc1_w != nullptr;
}
};
struct clip_model {
......@@ -271,6 +302,8 @@ struct clip_model {
std::vector<clip_layer> layers;
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
ggml_tensor * post_ln_w;
ggml_tensor * post_ln_b;
......@@ -299,8 +332,6 @@ struct clip_model {
// GLMV-Edge projection
ggml_tensor * mm_model_adapter_conv_w = nullptr;
ggml_tensor * mm_model_adapter_conv_b = nullptr;
ggml_tensor * mm_glm_tok_boi = nullptr;
ggml_tensor * mm_glm_tok_eoi = nullptr;
// MobileVLM projection
ggml_tensor * mm_model_mlp_1_w = nullptr;
......@@ -372,6 +403,15 @@ struct clip_model {
ggml_tensor * mm_norm_pre_w = nullptr;
ggml_tensor * mm_norm_mid_w = nullptr;
// cogvlm
ggml_tensor * mm_post_fc_norm_w = nullptr;
ggml_tensor * mm_post_fc_norm_b = nullptr;
ggml_tensor * mm_h_to_4h_w = nullptr;
ggml_tensor * mm_gate_w = nullptr;
ggml_tensor * mm_4h_to_h_w = nullptr;
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
......@@ -400,12 +440,14 @@ struct clip_ctx {
int max_nodes = 8192;
ggml_backend_sched_ptr sched;
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
// for debugging
bool debug_graph = false;
std::vector<ggml_tensor *> debug_print_tensors;
clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (!backend_cpu) {
......@@ -434,6 +476,13 @@ struct clip_ctx {
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
if (ctx_params.image_min_tokens > 0) {
model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
}
if (ctx_params.image_max_tokens > 0) {
model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
}
backend_ptrs.push_back(backend_cpu);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
......@@ -522,7 +571,7 @@ struct clip_graph {
const int batch_size = 1;
GGML_ASSERT(n_patches_x == n_patches_y);
const int patches_per_image = n_patches_x;
const int kernel_size = hparams.proj_scale_factor;
const int kernel_size = hparams.n_merge;
cur = ggml_transpose(ctx0, cur);
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
......@@ -544,13 +593,13 @@ struct clip_graph {
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
// pixel_shuffle
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
const int scale_factor = model.hparams.proj_scale_factor;
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
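// (pixel shuffle trades spatial resolution for channels: [n_embd, W, H] becomes
// [n_embd * s^2, W/s, H/s], cutting the token count by s^2 before the projection)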
cur = ggml_mul_mat(ctx0, model.projection, cur);
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
// pixel unshuffle block
const int scale_factor = model.hparams.proj_scale_factor;
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection
......@@ -563,6 +612,15 @@ struct clip_graph {
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
cur = build_ffn(cur,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
hparams.ffn_op,
-1);
} else {
GGML_ABORT("SigLIP: Unsupported projector type");
}
......@@ -574,7 +632,7 @@ struct clip_graph {
}
ggml_cgraph * build_pixtral() {
const int n_merge = hparams.spatial_merge_size;
const int n_merge = hparams.n_merge;
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
......@@ -600,7 +658,7 @@ struct clip_graph {
// mistral small 3.1 patch merger
// ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
if (model.mm_patch_merger_w) {
GGML_ASSERT(hparams.spatial_merge_size > 0);
GGML_ASSERT(hparams.n_merge > 0);
cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
......@@ -634,7 +692,7 @@ struct clip_graph {
}
// arrangement of the [IMG_BREAK] token
{
if (model.token_embd_img_break) {
// not efficient, but works
// the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
// and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
......@@ -727,6 +785,15 @@ struct clip_graph {
ggml_set_name(window_mask, "window_mask");
ggml_set_input(window_mask);
// if flash attn is used, we need to pad the mask and cast to f16
if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
if (n_pad > 0) {
window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
}
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
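// (GGML_PAD rounds the mask rows up to a multiple of GGML_KQ_MASK_PAD, which the
// flash-attention kernel expects; the cast matches the F16 mask it operates on)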
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
GGML_ASSERT(batch_size == 1);
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
......@@ -844,17 +911,216 @@ struct clip_graph {
return gf;
}
ggml_cgraph * build_minicpmv() {
// Qwen3VL
ggml_cgraph * build_qwen3vl() {
GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1;
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
norm_type norm_t = NORM_TYPE_NORMAL;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
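// M-RoPE splits each head's rotary dims into 4 equal sections, one position
// stream per section; hence num_position_ids = n_pos * 4 above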
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
// add patch bias
if (model.patch_bias != nullptr) {
inp = ggml_add(ctx0, inp, model.patch_bias);
cb(inp, "patch_bias", -1);
}
// calculate absolute position embedding and apply
ggml_tensor * learned_pos_embd = resize_position_embeddings();
learned_pos_embd = ggml_cont_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
learned_pos_embd = ggml_reshape_4d(
ctx0, learned_pos_embd,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
learned_pos_embd = ggml_cont_3d(
ctx0, learned_pos_embd,
n_embd, n_patches_x * n_patches_y, batch_size);
inp = ggml_add(ctx0, inp, learned_pos_embd);
cb(inp, "inp_pos_emb", -1);
ggml_tensor * inpL = inp;
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// pre-layernorm
if (model.pre_ln_w) {
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
}
// deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
ggml_tensor * deepstack_features = nullptr;
const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
// loop over layers
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
cb(cur, "ln1", il);
// self-attention
{
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
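// cur now holds the fused QKV activation, shape [3*n_embd, n_pos]: rows
// [0, n_embd) are Q, [n_embd, 2*n_embd) are K, [2*n_embd, 3*n_embd) are V;
// the views below slice it in place, each as [d_head, n_head, n_pos]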
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// apply M-RoPE
Qcur = ggml_rope_multi(
ctx0, Qcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
Kcur = ggml_rope_multi(
ctx0, Kcur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
// re-add the layer input, i.e., the residual
cur = ggml_add(ctx0, cur, inpL);
inpL = cur; // inpL = residual, cur = hidden_states
cb(cur, "ffn_inp", il);
// layernorm2
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
cb(cur, "ffn_inp_normed", il);
// ffn
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
// residual 2
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);
if (layer.has_deepstack()) {
ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
feat = build_ffn(feat,
layer.deepstack_fc1_w, layer.deepstack_fc1_b,
nullptr, nullptr,
layer.deepstack_fc2_w, layer.deepstack_fc2_b,
ffn_op_type::FFN_GELU, il);
if (!deepstack_features) {
deepstack_features = feat;
} else {
// concat along the feature dimension
deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
}
}
inpL = cur;
}
// post-layernorm
if (model.post_ln_w) {
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
}
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
ffn_op_type::FFN_GELU, -1);
embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
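// resulting width: projector output dim * (1 + n_deepstack_layers), i.e. the main
// path plus one projected block per deepstack layer (each fc2 maps to the same dim)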
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
ggml_cgraph * build_minicpmv() {
GGML_ASSERT(model.class_embedding == nullptr);
const int n_pos = n_patches;
const int n_embd_proj = clip_n_mmproj_embd(ctx);
// position embeddings for the projector (not for ViT)
int n_output_dim = clip_n_mmproj_embd(ctx);
ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
// see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
// base frequency omega
ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
ggml_set_name(omega, "omega");
ggml_set_input(omega);
// 2D input positions (using float for sinusoidal embeddings)
ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
// for selecting learned pos embd, used by ViT
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
......@@ -865,7 +1131,7 @@ struct clip_graph {
ggml_tensor * inp = build_inp();
ggml_tensor * embeddings = build_vit(
inp, n_patches,
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
......@@ -880,14 +1146,36 @@ struct clip_graph {
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
// calculate sinusoidal pos embd
ggml_tensor * pos_embed = nullptr;
{
// outer product
ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
// sin and cos
ggml_tensor * pos_embd_x = ggml_concat(
ctx0,
ggml_sin(ctx0, theta_x),
ggml_cos(ctx0, theta_x),
0 // concat on first dim
);
ggml_tensor * pos_embd_y = ggml_concat(
ctx0,
ggml_sin(ctx0, theta_y),
ggml_cos(ctx0, theta_y),
0 // concat on first dim
);
pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
}
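// each row of pos_embed is now [sin(w*omega), cos(w*omega), sin(h*omega), cos(h*omega)],
// four blocks of n_embd_proj/4 values, i.e. a 2D sincos embedding of width n_embd_proj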
// k = v + pos_embed
ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
// attention
{
int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = n_embd/d_head;
int n_head = n_embd_proj/d_head;
// Use actual config value if available, otherwise fall back to hardcoded values
int num_query = ctx->model.hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
......@@ -908,10 +1196,11 @@ struct clip_graph {
cb(K, "resampler_K", -1);
cb(V, "resampler_V", -1);
float resampler_kq_scale = 1.0f / sqrtf(float(d_head));
embeddings = build_attn(
model.mm_model_attn_o_w,
model.mm_model_attn_o_b,
Q, K, V, nullptr, kq_scale, -1);
Q, K, V, nullptr, resampler_kq_scale, -1);
cb(embeddings, "resampler_attn_out", -1);
}
// layernorm
......@@ -956,7 +1245,7 @@ struct clip_graph {
// pixel shuffle
{
const int scale_factor = model.hparams.proj_scale_factor;
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = n_patches_y;
const int width = n_patches_x;
......@@ -1046,7 +1335,7 @@ struct clip_graph {
// based on Llama4VisionPixelShuffleMLP
// https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
{
const int scale_factor = model.hparams.proj_scale_factor;
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
GGML_ASSERT(scale_factor > 0);
GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
......@@ -1118,7 +1407,7 @@ struct clip_graph {
{
// patch_merger
const int scale_factor = model.hparams.proj_scale_factor;
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
// projection norm
......@@ -1507,8 +1796,8 @@ struct clip_graph {
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
// see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
{
embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
}
}
......@@ -1521,7 +1810,6 @@ struct clip_graph {
return gf;
}
// whisper encoder with custom projector
ggml_cgraph * build_whisper_enc() {
const int n_frames = img.nx;
......@@ -1626,6 +1914,104 @@ struct clip_graph {
return gf;
}
// cogvlm vision encoder
ggml_cgraph * build_cogvlm() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_pos = n_patches + 1; // +1 for [CLS]
// build input and concatenate class embedding
ggml_tensor * inp = build_inp();
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
inp = ggml_add(ctx0, inp, model.position_embeddings);
cb(inp, "inp_pos", -1);
ggml_tensor * inpL = inp;
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL;
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 0);
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], n_embd * sizeof(float));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
cur->nb[1], 2 * n_embd * sizeof(float));
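// note: these strides assume the fused QKV activation is F32 (offsets use
// sizeof(float)); the Q | K | V split along dim 0 mirrors build_qwen3vl above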
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "attn_post_norm", il);
cur = ggml_add(ctx0, cur, inpL);
inpL = cur;
cur = build_ffn(cur,
layer.ff_up_w, layer.ff_up_b,
layer.ff_gate_w, layer.ff_gate_b,
layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
cb(cur, "ffn_out", il);
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
cb(cur, "ffn_post_norm", il);
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "layer_out", il);
inpL = cur;
}
// remove CLS token (like build_llama4 does)
ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
n_embd, n_patches,
ggml_row_size(inpL->type, n_embd), 0);
// Multiply with mm_model_proj
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
// Apply layernorm, weight, bias
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
// Apply GELU
cur = ggml_gelu_inplace(ctx0, cur);
// Branch 1: multiply with mm_h_to_4h_w
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
// Branch 2: multiply with mm_gate_w
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
// Apply SwiGLU: silu(gate) * h_to_4h
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
// Apply mm_4h_to_h_w
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
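// net effect of the projector MLP: cur = W_down * (silu(W_gate * x) * (W_up * x)),
// with mm_h_to_4h_w as W_up and mm_4h_to_h_w as W_down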
// Concatenate with boi and eoi
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
private:
//
// utility functions
......@@ -1953,17 +2339,25 @@ private:
ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
//cb(k, "k", il);
ggml_tensor * cur;
if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
k = ggml_cast(ctx0, k, GGML_TYPE_F16);
v = ggml_cast(ctx0, v, GGML_TYPE_F16);
cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
} else {
ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
v = ggml_cont(ctx0, v);
//cb(k, "v", il);
ggml_tensor * cur;
// TODO @ngxson : support flash attention
{
const auto n_tokens = q->ne[1];
const auto n_head = q->ne[2];
// const auto n_kv = k->ne[1]; // for flash attention
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
// F32 may not be needed for vision encoders?
......@@ -2108,6 +2502,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
res = graph.build_siglip();
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
res = graph.build_pixtral();
} break;
......@@ -2116,6 +2511,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
res = graph.build_qwen2vl();
} break;
case PROJECTOR_TYPE_QWEN3VL:
{
res = graph.build_qwen3vl();
} break;
case PROJECTOR_TYPE_MINICPMV:
{
res = graph.build_minicpmv();
......@@ -2138,6 +2537,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
res = graph.build_kimivl();
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
res = graph.build_siglip();
} break;
case PROJECTOR_TYPE_COGVLM:
{
res = graph.build_cogvlm();
} break;
default:
{
res = graph.build_llava();
......@@ -2241,6 +2648,10 @@ struct clip_model_loader {
if (proj_type.empty()) {
if (modality == CLIP_MODALITY_VISION) {
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
if (proj_type.empty()) {
// Assume MLP if no projector type listed
proj_type = "mlp";
}
} else if (modality == CLIP_MODALITY_AUDIO) {
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
} else {
......@@ -2277,7 +2688,6 @@ struct clip_model_loader {
if (is_vision) {
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
......@@ -2298,6 +2708,9 @@ struct clip_model_loader {
}
} else if (is_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
// some hparams are unused, but still need to be set to avoid issues
hparams.image_size = 0;
hparams.patch_size = 1;
} else {
GGML_ASSERT(false && "unknown modality");
......@@ -2386,58 +2799,69 @@ struct clip_model_loader {
hparams.minicpmv_version = 2; // default to 2 if not set
}
} break;
case PROJECTOR_TYPE_INTERNVL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_INTERNVL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
// ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
hparams.set_limit_image_tokens(64, 256);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
// TODO: verify the image_min_tokens
hparams.n_merge = 1; // the original pixtral does not use patch merging
hparams.rope_theta = 10000.0f;
hparams.warmup_image_size = hparams.patch_size * 8;
// Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
// ref: https://github.com/ggml-org/llama.cpp/issues/14310
hparams.image_size = 1024;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(8, 1024);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_KIMIVL:
{
hparams.rope_theta = 10000.0f;
hparams.warmup_image_size = hparams.patch_size * 8;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
// TODO: check kimivl preprocessor for exact values
hparams.set_limit_image_tokens(8, 1024);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_GEMMA3:
{
// default value (used by all model sizes in gemma 3 family)
// number of patches for each **side** is reduced by a factor of 4
hparams.proj_scale_factor = 4;
hparams.n_merge = 4;
// test model (tinygemma3) has a different value, we optionally read it
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
case PROJECTOR_TYPE_QWEN2VL:
{
// max image size = sqrt(max_pixels) = 3584
// ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
// however, the model uses an unreasonable amount of memory past 1024, so we force it to 1024; otherwise it's unusable
// ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
hparams.image_size = 1024;
hparams.warmup_image_size = hparams.patch_size * 8;
} break;
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
{
// max image size = sqrt(max_pixels)
// https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
// however, the model uses an unreasonable amount of memory past 1024, so we force it to 1024; otherwise it's unusable
// ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
hparams.image_size = 1024;
hparams.warmup_image_size = hparams.patch_size * 8;
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
hparams.n_merge = 2; // default value for Qwen 2 and 2.5
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
// ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
hparams.set_limit_image_tokens(8, 4096);
hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
if (hparams.image_min_pixels < warn_min_pixels) {
LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
}
} break;
case PROJECTOR_TYPE_LLAMA4:
{
hparams.rope_theta = 10000.0f;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
set_llava_uhd_res_candidates(model, 3);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
......@@ -2457,6 +2881,13 @@ struct clip_model_loader {
break;
}
// sanity check
{
if (hparams.image_max_pixels < hparams.image_min_pixels) {
throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
}
}
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
......@@ -2470,8 +2901,14 @@ struct clip_model_loader {
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
}
if (hparams.image_max_pixels > 0) {
LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
}
} else if (is_audio) {
LOG_INF("\n--- audio hparams ---\n");
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
......@@ -2543,10 +2980,11 @@ struct clip_model_loader {
model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = model.layers[il];
layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
......@@ -2558,6 +2996,7 @@ struct clip_model_loader {
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
......@@ -2569,6 +3008,18 @@ struct clip_model_loader {
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
// qwen3vl deepstack layer
layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
if (layer.has_deepstack()) {
model.n_deepstack_layers++;
}
// some models were already exported with legacy (incorrect) naming, which is quite messy; we fix it here
// note: Qwen models converted with the old surgery script have n_ff = 0, so we cannot use n_ff to check!
bool is_ffn_swapped = (
......@@ -2693,8 +3144,8 @@ struct clip_model_loader {
model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
......@@ -2704,6 +3155,13 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_QWEN3VL:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GEMMA3:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
......@@ -2735,6 +3193,15 @@ struct clip_model_loader {
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
} break;
case PROJECTOR_TYPE_LIGHTONOCR:
{
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
......@@ -2779,6 +3246,24 @@ struct clip_model_loader {
model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
} break;
case PROJECTOR_TYPE_COGVLM:
{
model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight"));
model.mm_boi = get_tensor(TN_TOK_BOI);
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
} break;
default:
GGML_ASSERT(false && "unknown projector type");
}
......@@ -2846,7 +3331,87 @@ struct clip_model_loader {
}
}
void alloc_compute_meta(clip_ctx & ctx_clip) {
struct support_info_op {
ggml_tensor * op;
// true if the op runs on the accelerated ctx_clip.backend
bool is_accel = true;
};
struct support_info_graph {
// whether the clip_ctx.backend supports flash attention
bool fattn = true;
ggml_tensor * fattn_op = nullptr; // for debugging
std::vector<support_info_op> ops;
};
static void warmup(clip_ctx & ctx_clip) {
support_info_graph info;
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
// try to enable flash attention to see if it's supported
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
info = alloc_compute_meta(ctx_clip);
if (!info.fattn && info.fattn_op) {
auto op = info.fattn_op;
LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
LOG_WRN("%s: op params: \n", __func__);
static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
LOG_WRN("%s: %s: type = %s, ne = [%d %d %d %d], nb = [%d %d %d %d]\n", fn,
name, ggml_type_name(t->type),
t->ne[0], t->ne[1], t->ne[2], t->ne[3],
t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
};
print_shape(__func__, " dst", op);
print_shape(__func__, "src0", op->src[0]);
print_shape(__func__, "src1", op->src[1]);
print_shape(__func__, "src2", op->src[2]);
LOG_WRN("%s: please report this on github as an issue\n", __func__);
LOG_WRN("%s: *****************************************************************\n", __func__);
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
alloc_compute_meta(ctx_clip);
}
} else {
info = alloc_compute_meta(ctx_clip);
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
}
}
LOG_INF("%s: flash attention is %s\n", __func__,
(ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
// print ops that are not supported by the GPU backend (if there is one)
if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
std::vector<support_info_op> unsupported_ops;
for (const auto & op : info.ops) {
if (!op.is_accel) {
unsupported_ops.push_back(op);
}
}
if (!unsupported_ops.empty()) {
LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__);
LOG_WRN("%s: the performance will be suboptimal \n", __func__);
LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
for (const auto & op : unsupported_ops) {
LOG_WRN("%s: %16s: type = %s, ne = [%d %d %d %d]\n", __func__,
ggml_op_name(op.op->op),
ggml_type_name(op.op->type),
op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
}
LOG_WRN("%s: flash attention is %s\n", __func__,
(ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
LOG_WRN("%s: please report this on github as an issue\n", __func__);
LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
LOG_WRN("%s: *****************************************************************\n", __func__);
}
}
}
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
const auto & hparams = ctx_clip.model.hparams;
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
......@@ -2856,9 +3421,11 @@ struct clip_model_loader {
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
img->nx = hparams.warmup_image_size;
img->ny = hparams.warmup_image_size;
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
} else {
img->nx = hparams.warmup_audio_size;
img->ny = hparams.n_mel_bins;
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
}
batch.entries.push_back(std::move(img));
......@@ -2875,57 +3442,95 @@ struct clip_model_loader {
size / 1024.0 / 1024.0);
}
}
const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
const int n_nodes = ggml_graph_n_nodes(gf);
LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
support_info_graph res {
/*.fattn = */ true,
/*.fattn_op = */ nullptr,
/*.ops = */ {},
};
// check op support
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * node = ggml_graph_node(gf, i);
res.ops.push_back({node, true});
if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
res.ops.back().is_accel = false;
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
res.fattn = false;
res.fattn_op = node;
}
}
}
return res;
}
void get_bool(const std::string & key, bool & output, bool required = true) {
void get_bool(const std::string & key, bool & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_bool(ctx_gguf.get(), i);
}
void get_i32(const std::string & key, int & output, bool required = true) {
void get_i32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_i32(ctx_gguf.get(), i);
}
void get_u32(const std::string & key, int & output, bool required = true) {
void get_u32(const std::string & key, int & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_u32(ctx_gguf.get(), i);
}
void get_f32(const std::string & key, float & output, bool required = true) {
void get_f32(const std::string & key, float & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = gguf_get_val_f32(ctx_gguf.get(), i);
}
void get_string(const std::string & key, std::string & output, bool required = true) {
void get_string(const std::string & key, std::string & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
}
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
if (required) {
throw std::runtime_error("Key not found: " + key);
}
return;
}
int n = gguf_get_arr_n(ctx_gguf.get(), i);
......@@ -2936,7 +3541,7 @@ struct clip_model_loader {
}
}
void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
auto & hparams = model.hparams;
for (int x = 1; x <= max_patches_per_side; x++) {
for (int y = 1; y <= max_patches_per_side; y++) {
......@@ -2953,7 +3558,6 @@ struct clip_model_loader {
};
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
g_logger_state.verbosity_thold = ctx_params.verbosity;
clip_ctx * ctx_vision = nullptr;
clip_ctx * ctx_audio = nullptr;
......@@ -2964,24 +3568,22 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_vision = new clip_ctx(ctx_params);
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
loader.load_tensors(*ctx_vision);
loader.alloc_compute_meta(*ctx_vision);
loader.warmup(*ctx_vision);
}
if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
loader.alloc_compute_meta(*ctx_audio);
loader.warmup(*ctx_audio);
}
} catch (const std::exception & e) {
LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
if (ctx_vision) {
delete ctx_vision;
}
if (ctx_audio) {
delete ctx_audio;
}
return {nullptr, nullptr};
}
......@@ -3019,10 +3621,10 @@ void clip_image_size_free(struct clip_image_size * load_image_size) {
}
delete load_image_size;
}
void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
return batch->entries.size();
......@@ -3074,9 +3676,169 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32
// set of tools to manipulate images
// in the future, we can add HW acceleration by allowing this struct to access 3rd party libs like ImageMagick or OpenCV
struct image_manipulation {
struct img_tool {
enum resize_algo {
RESIZE_ALGO_BILINEAR,
RESIZE_ALGO_BICUBIC,
// RESIZE_ALGO_LANCZOS, // TODO
};
static void resize(
const clip_image_u8 & src,
clip_image_u8 & dst,
const clip_image_size & target_resolution,
resize_algo algo,
bool add_padding = true, // TODO: define the behavior for add_padding = false
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width;
dst.ny = target_resolution.height;
dst.buf.resize(3 * dst.nx * dst.ny);
if (dst.nx == src.nx && dst.ny == src.ny) {
// no resize needed, simple copy
dst.buf = src.buf;
return;
}
if (!add_padding) {
// direct resize
switch (algo) {
case RESIZE_ALGO_BILINEAR:
resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
break;
case RESIZE_ALGO_BICUBIC:
resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
break;
default:
throw std::runtime_error("Unsupported resize algorithm");
}
} else {
// resize with padding
clip_image_u8 resized_image;
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
float scale = std::min(scale_w, scale_h);
int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
switch (algo) {
case RESIZE_ALGO_BILINEAR:
resize_bilinear(src, resized_image, new_width, new_height);
break;
case RESIZE_ALGO_BICUBIC:
resize_bicubic(src, resized_image, new_width, new_height);
break;
default:
throw std::runtime_error("Unsupported resize algorithm");
}
// fill dst with pad_color
fill(dst, pad_color);
int offset_x = (target_resolution.width - new_width) / 2;
int offset_y = (target_resolution.height - new_height) / 2;
composite(dst, resized_image, offset_x, offset_y);
}
}
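// usage sketch (values assumed for illustration): letterbox a decoded image into a
// 1024x1024 canvas, centered, padded with black:
//   clip_image_u8 dst;
//   img_tool::resize(src, dst, clip_image_size{1024, 1024}, img_tool::RESIZE_ALGO_BILINEAR);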
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w;
dst.ny = h;
dst.buf.resize(3 * w * h);
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int src_idx = 3 * ((y + i)*image.nx + (x + j));
int dst_idx = 3 * (i*w + j);
dst.buf[dst_idx] = image.buf[src_idx];
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
}
}
}
// calculate the size of the **resized** image, while preserving the aspect ratio
// the calculated size will be aligned to the nearest multiple of align_size
// if H or W size is larger than longest_edge, it will be resized to longest_edge
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
GGML_ASSERT(align_size > 0);
if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
return {0, 0};
}
float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
static_cast<float>(longest_edge) / inp_size.height);
float target_width_f = static_cast<float>(inp_size.width) * scale;
float target_height_f = static_cast<float>(inp_size.height) * scale;
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
int aligned_width = ceil_by_factor(target_width_f);
int aligned_height = ceil_by_factor(target_height_f);
return {aligned_width, aligned_height};
}
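// example (illustrative): inp = 1000x750, align_size = 14, longest_edge = 448
// scale = min(448/1000, 448/750) = 0.448 -> target 448.0 x 336.0, both already
// multiples of 14, so the result is {448, 336}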
// calculate the size of the **resized** image, while preserving the aspect ratio
// the calculated size will have min_pixels <= W*H <= max_pixels
// this is referred to as "smart_resize" in the transformers code
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
GGML_ASSERT(align_size > 0);
const int width = inp_size.width;
const int height = inp_size.height;
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
// always align up first
int h_bar = std::max(align_size, ceil_by_factor(height));
int w_bar = std::max(align_size, ceil_by_factor(width));
if (h_bar * w_bar > max_pixels) {
const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
h_bar = std::max(align_size, floor_by_factor(height / beta));
w_bar = std::max(align_size, floor_by_factor(width / beta));
} else if (h_bar * w_bar < min_pixels) {
const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
h_bar = ceil_by_factor(height * beta);
w_bar = ceil_by_factor(width * beta);
}
return {w_bar, h_bar};
}
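// example (illustrative): inp = 1000x700, align_size = 28, min_pixels = 6272,
// max_pixels = 802816: align up to 1008x700 = 705600 pixels, which is already
// within [min_pixels, max_pixels], so the result is {1008, 700}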
// draw src image into dst image at offset (offset_x, offset_y)
static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
for (int y = 0; y < src.ny; ++y) {
for (int x = 0; x < src.nx; ++x) {
int dx = x + offset_x;
int dy = y + offset_y;
// skip pixels that would be out of bounds in the destination
if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
continue;
}
size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
}
}
}
// fill the image with a solid color
static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
for (size_t i = 0; i < img.buf.size(); i += 3) {
img.buf[i] = color[0];
img.buf[i + 1] = color[1];
img.buf[i + 2] = color[2];
}
}
private:
// Bilinear resize function
static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
......@@ -3112,7 +3874,7 @@ struct image_manipulation {
// Bicubic resize function
// part of image will be cropped if the aspect ratio is different
static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
const int nx = img.nx;
const int ny = img.ny;
......@@ -3175,93 +3937,6 @@ struct image_manipulation {
return true;
}
// llava-1.6 type of resize_and_pad
// if the ratio is not 1:1, padding with pad_color will be applied
// pad_color is single channel, default is 0 (black)
static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
int target_width = target_resolution.width;
int target_height = target_resolution.height;
float scale_w = static_cast<float>(target_width) / image.nx;
float scale_h = static_cast<float>(target_height) / image.ny;
int new_width, new_height;
if (scale_w < scale_h) {
new_width = target_width;
new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
} else {
new_height = target_height;
new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
}
clip_image_u8 resized_image;
bicubic_resize(image, resized_image, new_width, new_height);
clip_image_u8 padded_image;
padded_image.nx = target_width;
padded_image.ny = target_height;
padded_image.buf.resize(3 * target_width * target_height);
// Fill the padded image with the fill color
for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
padded_image.buf[i] = pad_color[0];
padded_image.buf[i + 1] = pad_color[1];
padded_image.buf[i + 2] = pad_color[2];
}
// Calculate padding offsets
int pad_x = (target_width - new_width) / 2;
int pad_y = (target_height - new_height) / 2;
// Copy the resized image into the center of the padded buffer
for (int y = 0; y < new_height; ++y) {
for (int x = 0; x < new_width; ++x) {
for (int c = 0; c < 3; ++c) {
padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
}
}
}
dst = std::move(padded_image);
}
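// Letterboxing math of the function above, hand-computed for illustration:
// fitting a 640x480 image into a 336x336 target gives scale_w = 0.525 and
// scale_h = 0.7; since scale_w < scale_h the content becomes 336x252 and is
// centered with pad_y = (336 - 252) / 2 = 42 rows of pad_color on each side.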
static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
dst.nx = w;
dst.ny = h;
dst.buf.resize(3 * w * h);
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int src_idx = 3 * ((y + i)*image.nx + (x + j));
int dst_idx = 3 * (i*w + j);
dst.buf[dst_idx] = image.buf[src_idx];
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
}
}
}
// calculate the size of the **resized** image, while preserving the aspect ratio
// each side of the calculated size is rounded up to a multiple of align_size
// if H or W is larger than max_dimension, the image is scaled down so the longer side equals max_dimension (before alignment)
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) {
if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) {
return {0, 0};
}
float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
static_cast<float>(max_dimension) / inp_size.height));
float target_width_f = static_cast<float>(inp_size.width) * scale;
float target_height_f = static_cast<float>(inp_size.height) * scale;
int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
return {aligned_width, aligned_height};
}
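// Hand-computed sketch of the overload above: a 1024x512 input with
// align_size = 32 and max_dimension = 512 scales by
// min(1.0, 512/1024, 512/512) = 0.5 to 512x256; CLIP_ALIGN then rounds each
// side up to a multiple of 32, which leaves {512, 256} unchanged. An input
// already within max_dimension keeps scale = 1.0 and is only aligned up.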
private:
static inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
......@@ -3410,10 +4085,11 @@ struct llava_uhd {
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
std::vector<clip_image_u8_ptr> output;
img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable
// resize to overview size
clip_image_u8_ptr resized_img(clip_image_u8_init());
image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
img_tool::resize(*img, *resized_img, inst.overview_size, interpolation);
output.push_back(std::move(resized_img));
if (inst.slices.empty()) {
// no slices, just return the resized image
......@@ -3423,9 +4099,11 @@ struct llava_uhd {
// resize to refined size
clip_image_u8_ptr refined_img(clip_image_u8_init());
if (inst.padding_refined) {
image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size);
img_tool::resize(*img, *refined_img, inst.refined_size, interpolation);
} else {
image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height);
// only the bicubic algo preserves the aspect ratio; old models rely on this behavior
// TODO: do we need to support other algos here?
img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
}
// create slices
......@@ -3436,7 +4114,7 @@ struct llava_uhd {
int h = slice.size.height;
clip_image_u8_ptr img_slice(clip_image_u8_init());
image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h);
img_tool::crop(*refined_img, *img_slice, x, y, w, h);
output.push_back(std::move(img_slice));
}
......@@ -3571,14 +4249,11 @@ private:
// res_imgs memory is being allocated here; previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
clip_image_size original_size{img->nx, img->ny};
bool pad_to_square = true;
auto & params = ctx->model.hparams;
// The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
pad_to_square = false;
}
if (clip_is_minicpmv(ctx)) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_MINICPMV:
{
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
......@@ -3591,21 +4266,30 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = inst.grid_size.width;
res_imgs->grid_y = inst.grid_size.height;
return true;
} break;
} else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
{
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
clip_image_u8 resized;
auto patch_size = params.patch_size * 2;
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height);
const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
original_size,
params.patch_size * 2,
params.image_min_pixels,
params.image_max_pixels);
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
// clip_image_save_to_bmp(resized, "preproc.bmp");
clip_image_f32_ptr img_f32(clip_image_f32_init());
// clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
// res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32));
return true;
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
} break;
case PROJECTOR_TYPE_IDEFICS3:
{
// The refined size has two steps:
// 1. Resize w/ aspect-ratio preserving such that the longer side is
// the preprocessor longest size
......@@ -3613,8 +4297,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// multiples of image_size (always rounding up)
//
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
original_size, params.image_size, params.preproc_image_size);
const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
original_size, params.image_size, params.image_longest_edge);
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
// __func__, original_size.width, original_size.height,
// refined_size.width, refined_size.height);
llava_uhd::slice_instructions instructions;
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
......@@ -3625,6 +4312,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
};
for (int y = 0; y < refined_size.height; y += params.image_size) {
for (int x = 0; x < refined_size.width; x += params.image_size) {
// LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
instructions.slices.push_back(llava_uhd::slice_coordinates{
/* x */x,
/* y */y,
......@@ -3647,30 +4335,53 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = instructions.grid_size.width;
res_imgs->grid_y = instructions.grid_size.height;
return true;
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
) {
} break;
case PROJECTOR_TYPE_GLM_EDGE:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
{
clip_image_u8 resized_image;
int sz = params.image_size;
image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz});
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
clip_image_f32_ptr img_f32(clip_image_f32_init());
//clip_image_save_to_bmp(resized_image, "resized.bmp");
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
return true;
} break;
} else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
case PROJECTOR_TYPE_JANUS_PRO:
{
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
const std::array<uint8_t, 3> pad_color = {127, 127, 127};
clip_image_u8 resized_image;
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
int sz = params.image_size;
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
return true;
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
clip_image_u8 resized_image;
// the original pixtral model doesn't have n_merge
const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
original_size,
params.patch_size * cur_merge,
params.image_min_pixels,
params.image_max_pixels);
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
} else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) {
case PROJECTOR_TYPE_LLAMA4:
{
GGML_ASSERT(!params.image_res_candidates.empty());
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
......@@ -3683,55 +4394,41 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = inst.grid_size.width;
res_imgs->grid_y = inst.grid_size.height;
return true;
} else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2
|| ctx->proj_type() == PROJECTOR_TYPE_KIMIVL
) {
GGML_ASSERT(params.proj_scale_factor);
// smart resize
const int width = img->nx;
const int height = img->ny;
const int total_factor = params.patch_size * params.proj_scale_factor;
constexpr int min_image_tokens = 64;
constexpr int max_image_tokens = 1024;
const float min_pixels = min_image_tokens * total_factor * total_factor;
const float max_pixels = max_image_tokens * total_factor * total_factor;
auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
auto ceil_by_factor = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
int h_bar = std::max(total_factor, round_by_factor(height));
int w_bar = std::max(total_factor, round_by_factor(width));
if (h_bar * w_bar > max_pixels) {
const auto beta = std::sqrt((height * width) / max_pixels);
h_bar = std::max(total_factor, floor_by_factor(height / beta));
w_bar = std::max(total_factor, floor_by_factor(width / beta));
} else if (h_bar * w_bar < min_pixels) {
const auto beta = std::sqrt(min_pixels / (height * width));
h_bar = ceil_by_factor(height * beta);
w_bar = ceil_by_factor(width * beta);
}
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
original_size,
params.patch_size * params.n_merge,
params.image_min_pixels,
params.image_max_pixels);
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
clip_image_u8 resized_img;
image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
return true;
}
} break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
case PROJECTOR_TYPE_LDPV2:
case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
{
// TODO @ngxson : refactor the code below to avoid duplicated logic
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
if (pad_to_square) {
// The model config actually contains all we need to decide on how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
if (params.image_res_candidates.empty()) { // pad_to_square
// for llava-1.5, we resize image to a square, and pad the shorter side with a background color
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
const int longer_side = std::max(img->nx, img->ny);
......@@ -3743,14 +4440,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
// resize the image to the target_size
image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
return true;
} else if (!params.image_res_candidates.empty()) {
} else {
// "spatial_unpad" with "anyres" processing for llava-1.6
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
......@@ -3761,12 +4457,15 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
}
}
} break;
return true;
} else {
GGML_ABORT("Unknown image preprocessing type");
default:
LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
return false;
}
return true;
}
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
......@@ -3813,16 +4512,16 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams;
const int n_total = clip_n_output_tokens(ctx, img);
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
return img->nx / (params.patch_size * 2);
}
return n_total;
}
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams;
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
return img->ny / (params.patch_size * 2);
}
return 1;
}
......@@ -3839,6 +4538,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
switch (proj) {
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_JANUS_PRO:
{
// do nothing
} break;
......@@ -3847,7 +4547,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_GLM_EDGE:
{
n_patches /= 4;
if (ctx->model.mm_glm_tok_boi) {
if (ctx->model.mm_boi) {
n_patches += 2; // for BOI and EOI token embeddings
}
} break;
......@@ -3877,11 +4577,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
{
// dynamic size (2 conv, so double patch size)
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
int x_patch = img->nx / (params.patch_size * 2);
int y_patch = img->ny / (params.patch_size * 2);
n_patches = x_patch * y_patch;
} break;
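// Hand-computed example for the Qwen-VL branch above (assuming
// patch_size = 14): a 1008x616 preprocessed image yields
// x_patch = 1008 / 28 = 36 and y_patch = 616 / 28 = 22, i.e. 792 tokens.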
case PROJECTOR_TYPE_GEMMA3:
......@@ -3890,26 +4590,30 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_LLAMA4:
{
// both X and Y are downscaled by the scale factor
int scale_factor = ctx->model.hparams.proj_scale_factor;
int scale_factor = ctx->model.hparams.n_merge;
n_patches /= (scale_factor * scale_factor);
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
// dynamic size
int scale_factor = ctx->model.hparams.proj_scale_factor;
int out_patch_size = params.patch_size * scale_factor;
int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// dynamic size
int n_merge = params.spatial_merge_size;
int n_merge = ctx->model.hparams.n_merge;
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
if (ctx->model.token_embd_img_break) {
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} else {
n_patches = n_patches_y * n_patches_x;
}
} break;
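// Sketch for the Pixtral-style branch above (assuming patch_size = 14 and
// n_merge = 2): a 448x448 image gives a 16x16 grid of merged patches; with
// one [IMG_BREAK] per row except the last, that is 16*16 + 15 = 271 tokens.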
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
......@@ -3932,6 +4636,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches /= 2;
}
} break;
case PROJECTOR_TYPE_COGVLM:
{
n_patches += 2; // for BOI and EOI token embeddings
} break;
default:
GGML_ABORT("unsupported projector type");
}
......@@ -3939,92 +4647,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
return n_patches;
}
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
assert(embed_dim % 2 == 0);
int H = pos.size();
int W = pos[0].size();
std::vector<float> omega(embed_dim / 2);
for (int i = 0; i < embed_dim / 2; ++i) {
omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
}
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
for (int d = 0; d < embed_dim / 2; ++d) {
float out_value = pos[h][w] * omega[d];
emb[h][w][d] = sin(out_value);
emb[h][w][d + embed_dim / 2] = cos(out_value);
}
}
}
return emb;
}
static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
assert(embed_dim % 2 == 0);
std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
int H = emb_h.size();
int W = emb_h[0].size();
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
for (int d = 0; d < embed_dim / 2; ++d) {
emb[h][w][d] = emb_h[h][w][d];
emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
}
}
}
return emb;
}
static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
int grid_h_size = image_size.first;
int grid_w_size = image_size.second;
std::vector<float> grid_h(grid_h_size);
std::vector<float> grid_w(grid_w_size);
for (int i = 0; i < grid_h_size; ++i) {
grid_h[i] = static_cast<float>(i);
}
for (int i = 0; i < grid_w_size; ++i) {
grid_w[i] = static_cast<float>(i);
}
std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
for (int h = 0; h < grid_h_size; ++h) {
for (int w = 0; w < grid_w_size; ++w) {
grid[h][w] = grid_w[w];
}
}
std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
for (int h = 0; h < grid_h_size; ++h) {
for (int w = 0; w < grid_w_size; ++w) {
grid_2d[0][h][w] = grid_h[h];
grid_2d[1][h][w] = grid_w[w];
}
}
std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
int H = image_size.first;
int W = image_size.second;
std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
}
}
return pos_embed_2d;
}
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init());
......@@ -4163,26 +4785,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("positions", positions);
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
int embed_dim = clip_n_mmproj_embd(ctx);
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
for(int i = 0; i < pos_w * pos_h; ++i){
for(int j = 0; j < embed_dim; ++j){
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
// inputs for resampler projector
// set the 2D positions (using float for sinusoidal embedding)
int n_patches_per_col = image_size_width / patch_size;
std::vector<float> pos_data(n_pos);
// dimension H
for (int i = 0; i < n_pos; i++) {
pos_data[i] = static_cast<float>(i / n_patches_per_col);
}
set_input_f32("pos_h", pos_data);
// dimension W
for (int i = 0; i < n_pos; i++) {
pos_data[i] = static_cast<float>(i % n_patches_per_col);
}
set_input_f32("pos_embed", pos_embed);
set_input_f32("pos_w", pos_data);
// base frequency omega
const float base_freq = 10000.0f;
const int n_embd_proj = clip_n_mmproj_embd(ctx);
std::vector<float> omega(n_embd_proj / 4);
for (int i = 0; i < n_embd_proj / 4; ++i) {
omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
}
set_input_f32("omega", omega);
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN3VL:
{
const int merge_ratio = 2;
const int merge_ratio = hparams.n_merge;
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
std::vector<int> positions(n_pos * 4);
......@@ -4286,6 +4915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
......@@ -4339,6 +4969,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_COGVLM:
{
// do nothing
} break;
......@@ -4416,6 +5048,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_peg_0_b->ne[0];
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
......@@ -4425,7 +5058,11 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO:
return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
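// e.g. (illustrative numbers, not from a real checkpoint): with a 2048-dim
// projector output and 3 deepstack layers, each image token is sent to the
// text model as 2048 * (1 + 3) = 8192 floats.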
case PROJECTOR_TYPE_GEMMA3:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
......@@ -4442,6 +5079,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
default:
GGML_ABORT("Unknown projector type");
}
......@@ -4460,7 +5099,8 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
}
bool clip_is_llava(const struct clip_ctx * ctx) {
......
#pragma once
#include "ggml.h"
#include <stddef.h>
#include <stdint.h>
......@@ -22,9 +23,17 @@ enum clip_modality {
CLIP_MODALITY_AUDIO,
};
enum clip_flash_attn_type {
CLIP_FLASH_ATTN_TYPE_AUTO = -1,
CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
};
struct clip_context_params {
bool use_gpu;
enum ggml_log_level verbosity;
enum clip_flash_attn_type flash_attn_type;
int image_min_tokens;
int image_max_tokens;
};
struct clip_init_result {
......
......@@ -32,8 +32,65 @@
#define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h"
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
//
// internal logging functions
//
struct mtmd_helper_logger {
ggml_log_callback default_callback = [](ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
};
ggml_log_callback log_callback = default_callback;
void * log_callback_user_data;
void log_v(enum ggml_log_level level, const char * format, va_list args) {
if (format == NULL) {
return;
}
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
log_callback(level, buffer, log_callback_user_data);
} else {
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0;
log_callback(level, buffer2, log_callback_user_data);
free(buffer2);
}
va_end(args_copy);
}
void log(enum ggml_log_level level, const char * format, ...) {
va_list args;
va_start(args, format);
log_v(level, format, args);
va_end(args);
}
} g_logger;
#define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data) {
if (log_callback == nullptr) {
log_callback = g_logger.default_callback;
}
g_logger.log_callback = log_callback;
g_logger.log_callback_user_data = user_data;
mtmd_log_set(log_callback, user_data);
}
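// Usage sketch (the callback signature is ggml_log_callback; the filtering
// logic is illustrative only):
//   mtmd_helper_log_set([](ggml_log_level lvl, const char * msg, void * /*ud*/) {
//       if (lvl >= GGML_LOG_LEVEL_WARN) { fputs(msg, stderr); }
//   }, nullptr);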
//
// helper functions
//
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
size_t n_tokens = 0;
......@@ -182,7 +239,7 @@ int32_t mtmd_helper_decode_image_chunk(
}
const llama_model * model = llama_get_model(lctx);
int n_mmproj_embd = llama_model_n_embd(model);
int n_mmproj_embd = llama_model_n_embd_inp(model);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
......@@ -325,7 +382,7 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
llama_pos * new_n_past) {
size_t n_chunks = mtmd_input_chunks_size(chunks);
if (n_chunks == 0) {
LOG_ERR("no chunks to eval\n");
LOG_WRN("no chunks to eval\n");
return 0;
}
......
......@@ -20,6 +20,11 @@ extern "C" {
// BREAKING CHANGES are expected.
//
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
// Note: this also calls mtmd_log_set() internally
MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
......
......@@ -5,12 +5,20 @@
#include "llama.h"
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
// represents raw image data, layout is RGBRGBRGB...
......@@ -93,14 +101,26 @@ const char * mtmd_default_marker() {
return "<__media__>";
}
static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
switch (flash_attn_type) {
case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
}
return CLIP_FLASH_ATTN_TYPE_AUTO;
}
mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params;
params.use_gpu = true;
params.print_timings = true;
params.n_threads = 4;
params.verbosity = GGML_LOG_LEVEL_INFO;
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
params.media_marker = mtmd_default_marker();
mtmd_context_params params {
/* use_gpu */ true,
/* print_timings */ true,
/* n_threads */ 4,
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
/* media_marker */ mtmd_default_marker(),
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
};
return params;
}
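// Caller-side sketch (the field value is illustrative; the -1 defaults above
// appear to mean "read the limit from the mmproj metadata"):
//   mtmd_context_params p = mtmd_context_params_default();
//   p.image_max_tokens = 1024; // cap tokens per image for dynamic-resolution models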
......@@ -152,7 +172,7 @@ struct mtmd_context {
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
media_marker (ctx_params.media_marker),
n_embd_text (llama_model_n_embd(text_model))
n_embd_text (llama_model_n_embd_inp(text_model))
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
......@@ -162,9 +182,13 @@ struct mtmd_context {
throw std::runtime_error("media_marker must not be empty");
}
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
clip_context_params ctx_clip_params {
/* use_gpu */ ctx_params.use_gpu,
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
};
auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;
ctx_a = res.ctx_a;
......@@ -268,7 +292,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>";
img_end = "<|vision_end|>";
......@@ -285,6 +309,11 @@ struct mtmd_context {
img_beg = "<img>";
img_end = "</img>";
} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
// <|im_start|> ... (image embeddings) ... <|im_end|>
img_beg = "<|im_start|>";
img_end = "<|im_end|>";
}
}
......@@ -374,9 +403,7 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
}
void mtmd_free(mtmd_context * ctx) {
if (ctx) {
delete ctx;
}
}
struct mtmd_tokenizer {
......@@ -1036,7 +1063,9 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
return 1; // for M-RoPE, the whole image is 1 in temporal dimension
// for M-RoPE, temporal dimension = max(t,h,w)
// t is omitted as we don't support video input
return std::max(image_tokens->nx, image_tokens->ny);
}
return image_tokens->n_tokens();
}
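// Hand-computed sketch of the M-RoPE rule above: an image with a token grid
// of nx = 24 by ny = 16 occupies max(24, 16) = 24 temporal positions, even
// though it contributes 24 * 16 = 384 embedding tokens.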
......@@ -1075,3 +1104,8 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
return chunks;
}
void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
g_logger_state.log_callback_user_data = user_data;
}
......@@ -82,9 +82,13 @@ struct mtmd_context_params {
bool use_gpu;
bool print_timings;
int n_threads;
enum ggml_log_level verbosity;
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
enum llama_flash_attn_type flash_attn_type;
// limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
};
MTMD_API const char * mtmd_default_marker(void);
......@@ -156,7 +160,7 @@ MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd
MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
// returns nullptr for ID on text chunk
MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
......@@ -174,7 +178,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * i
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// tokenize an input text prompt and a list of bitmaps (images/audio)
......@@ -213,6 +217,10 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
// llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
/////////////////////////////////////////
// test function, to be used in test-mtmd-c-api.c
......
......@@ -23,7 +23,7 @@ problem.
8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ff9135fe2..8ba86f824 100644
index 4cf377e7f..4882541c8 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
......@@ -42,7 +42,7 @@ index ff9135fe2..8ba86f824 100644
}
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -2079,6 +2079,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size);
......@@ -54,7 +54,7 @@ index ff9135fe2..8ba86f824 100644
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -2131,7 +2136,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
......@@ -64,10 +64,10 @@ index ff9135fe2..8ba86f824 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 8bd5449f1..01e2df61a 100644
index df28d67fb..1f6a56ba2 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -820,6 +820,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
delete ctx;
......@@ -75,7 +75,7 @@ index 8bd5449f1..01e2df61a 100644
}
/**
@@ -1560,6 +1561,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1570,6 +1571,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
......@@ -84,10 +84,10 @@ index 8bd5449f1..01e2df61a 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index bc396b521..aefc6935e 100644
index fa7e1e13a..8f3b1c173 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -576,6 +576,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -579,6 +579,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
......@@ -95,7 +95,7 @@ index bc396b521..aefc6935e 100644
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -831,6 +832,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -834,6 +835,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
......@@ -103,7 +103,7 @@ index bc396b521..aefc6935e 100644
}
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1112,6 +1114,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
@@ -1115,6 +1117,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
......@@ -112,7 +112,7 @@ index bc396b521..aefc6935e 100644
static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 7afc881fa..bf0962274 100644
index 70bf6f3d9..f2b7fe692 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
......@@ -132,10 +132,10 @@ index 7afc881fa..bf0962274 100644
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index db33a4ab6..c42ee26e1 100644
index e5302f455..43fa83e8f 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3266,6 +3266,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3412,6 +3412,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
......@@ -144,10 +144,10 @@ index db33a4ab6..c42ee26e1 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index a38df5a97..fd07e4a21 100644
index 48fd99a76..da2aab3df 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -555,6 +555,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
......@@ -156,10 +156,10 @@ index a38df5a97..fd07e4a21 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index b695ba051..37e853120 100644
index 3f1bdfb9f..a95c2f305 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -352,6 +352,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device);
delete ctx;
......@@ -167,7 +167,7 @@ index b695ba051..37e853120 100644
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -813,6 +814,7 @@ struct ggml_backend_sycl_split_buffer_context {
@@ -816,6 +817,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
......@@ -175,7 +175,7 @@ index b695ba051..37e853120 100644
}
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1155,6 +1157,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
@@ -1158,6 +1160,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
......@@ -184,10 +184,10 @@ index b695ba051..37e853120 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b783f7805..216dc167c 100644
index 66dd0bfab..83cdec29e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11828,6 +11828,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -12368,6 +12368,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
......@@ -195,7 +195,7 @@ index b783f7805..216dc167c 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -11971,6 +11972,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -12511,6 +12512,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
......
......@@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 639fecbd3..a7ce6f8e1 100644
index a73c4c448..b9f0631f4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1812,16 +1812,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
......@@ -31,8 +31,8 @@ index 639fecbd3..a7ce6f8e1 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1993,7 +1984,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
@@ -2014,7 +2005,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
......
......@@ -10,11 +10,11 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f2abf8852..c984e6282 100644
index 05777d2d9..f4c4d2c48 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@
#include <numeric>
@@ -24,6 +24,19 @@
#include <array>
#include <functional>
+#if defined(_WIN32)
......@@ -30,10 +30,10 @@ index f2abf8852..c984e6282 100644
+#endif
+#endif
+
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
enum ffn_op_type {
@@ -2774,7 +2787,29 @@ struct clip_model_loader {
@@ -3255,7 +3268,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
......@@ -63,7 +63,7 @@ index f2abf8852..c984e6282 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -2801,7 +2836,11 @@ struct clip_model_loader {
@@ -3282,7 +3317,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
......
......@@ -5,20 +5,36 @@ Subject: [PATCH] solar-pro
adds support for the Solar Pro architecture
---
src/llama-arch.cpp | 21 ++++
src/CMakeLists.txt | 1 +
src/llama-arch.cpp | 21 +++++
src/llama-arch.h | 3 +
src/llama-hparams.cpp | 8 ++
src/llama-hparams.h | 5 +
src/llama-hparams.h | 5 ++
src/llama-model-loader.cpp | 2 +-
src/llama-model.cpp | 207 +++++++++++++++++++++++++++++++++++++
src/llama-model.cpp | 48 +++++++++++
src/llama-model.h | 3 +
7 files changed, 248 insertions(+), 1 deletion(-)
src/models/models.h | 5 ++
src/models/solar.cpp | 158 +++++++++++++++++++++++++++++++++++++
10 files changed, 253 insertions(+), 1 deletion(-)
create mode 100644 src/models/solar.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 67c7807e0..fda881640 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -125,6 +125,7 @@ add_library(llama
models/seed-oss.cpp
models/smallthinker.cpp
models/smollm3.cpp
+ models/solar.cpp
models/stablelm.cpp
models/starcoder.cpp
models/starcoder2.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8ca769c5f..ab262ec0c 100644
index 8571a2e02..b6bde25d5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
......@@ -26,7 +42,7 @@ index 8ca769c5f..ab262ec0c 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -183,6 +184,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
......@@ -34,7 +50,7 @@ index 8ca769c5f..ab262ec0c 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1901,6 +1903,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -2023,6 +2025,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
......@@ -59,7 +75,7 @@ index 8ca769c5f..ab262ec0c 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -2469,6 +2489,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -2681,6 +2701,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
......@@ -68,10 +84,10 @@ index 8ca769c5f..ab262ec0c 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index dea725c1a..ea2b4ffb9 100644
index 150646478..3936a4687 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -86,6 +86,7 @@ enum llm_arch {
@@ -89,6 +89,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
......@@ -79,7 +95,7 @@ index dea725c1a..ea2b4ffb9 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -187,6 +188,7 @@ enum llm_kv {
@@ -208,6 +209,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
......@@ -87,7 +103,7 @@ index dea725c1a..ea2b4ffb9 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -436,6 +438,7 @@ enum llm_tensor {
@@ -459,6 +461,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
......@@ -96,11 +112,11 @@ index dea725c1a..ea2b4ffb9 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index db65d69ea..b6bf6bbf2 100644
index 8cdbaf69f..41127bf91 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -151,6 +151,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
@@ -161,6 +161,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
}
+bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
......@@ -115,7 +131,7 @@ index db65d69ea..b6bf6bbf2 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6fcf91b7d..24569a258 100644
index c3a53be79..2ffe7dd30 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -64,6 +64,8 @@ struct llama_hparams {
......@@ -127,7 +143,7 @@ index 6fcf91b7d..24569a258 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -250,6 +252,9 @@ struct llama_hparams {
@@ -256,6 +258,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
......@@ -151,10 +167,10 @@ index aa3a65f87..ee303bd58 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2a83d6627..54621ea39 100644
index c2a545531..4468de2f9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1890,6 +1890,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1961,6 +1961,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
......@@ -176,7 +192,7 @@ index 2a83d6627..54621ea39 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5224,6 +5239,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -5350,6 +5365,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
......@@ -211,12 +227,71 @@ index 2a83d6627..54621ea39 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16515,6 +16558,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}
@@ -7425,6 +7468,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ llm = std::make_unique<llm_build_solar>(*this, params);
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -7684,6 +7731,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index f8342cf2c..cbf4e1bfa 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_26B,
LLM_TYPE_27B,
LLM_TYPE_30B,
@@ -404,6 +405,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
diff --git a/src/models/models.h b/src/models/models.h
index 7ba225b47..71fea796d 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -510,6 +510,11 @@ struct llm_build_smollm3 : public llm_graph_context {
llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_solar : public llm_graph_context {
+    llm_build_solar(const llama_model & model, const llm_graph_params & params);
+};
+
+
struct llm_build_stablelm : public llm_graph_context {
llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
};
diff --git a/src/models/solar.cpp b/src/models/solar.cpp
new file mode 100644
index 000000000..97383928c
--- /dev/null
+++ b/src/models/solar.cpp
@@ -0,0 +1,158 @@
+#include "models.h"
+
+llm_build_solar::llm_build_solar(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
......@@ -371,49 +446,4 @@ index 2a83d6627..54621ea39 100644
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+};
+
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
@@ -20096,6 +20298,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ llm = std::make_unique<llm_build_solar>(*this, params);
+ } break;
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -20331,6 +20537,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index 248f85410..4a7924aaa 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
@@ -390,6 +391,8 @@ struct llama_layer {
struct ggml_tensor * ffn_act_beta = nullptr;
struct ggml_tensor * ffn_act_eps = nullptr;
+ struct ggml_tensor * bskcn_tv = nullptr;
+
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
+}
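Note on the solar hunks above: the per-layer bskcn_tv tensor added in llama-model.h drives SOLAR's backbone skip connection, whose use is mostly collapsed inside the solar.cpp body. A minimal sketch of that blend, assuming the two-element bskcn_tv layout used by the SOLAR patches (illustrative, not a verbatim excerpt):

#include "ggml.h"

// blended = saved * tv[0] + cur * tv[1]
static struct ggml_tensor * solar_bskcn_blend(
        struct ggml_context * ctx,
        struct ggml_tensor  * saved,     // hidden state captured at a skip source layer
        struct ggml_tensor  * cur,       // hidden state at the current layer
        struct ggml_tensor  * bskcn_tv)  // 2-element trainable mixing vector
{
    struct ggml_tensor * tv0 = ggml_view_1d(ctx, bskcn_tv, 1, 0);
    struct ggml_tensor * tv1 = ggml_view_1d(ctx, bskcn_tv, 1, ggml_element_size(bskcn_tv));
    return ggml_add(ctx,
            ggml_mul(ctx, saved, tv0),   // the 1-element views broadcast over
            ggml_mul(ctx, cur,   tv1));  // the full hidden state in ggml_mul
}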
......@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a7ce6f8e1..8064dc197 100644
index b9f0631f4..1525283d7 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
......@@ -25,7 +25,7 @@ index a7ce6f8e1..8064dc197 100644
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 65f366517..ce336a228 100644
index 77ba4fc46..040518e1e 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
......
......@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index dd9b51a9e..d88f43209 100644
index c8421e1e8..cb659915d 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -308,7 +308,7 @@ private:
@@ -310,7 +310,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
......
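The one-line change to json-schema-to-grammar.cpp is collapsed above; its purpose is to emit grammar rules in the order they were first defined rather than in container key order, so generated grammars stay stable across runs. A minimal sketch of that idea with hypothetical names (not the builder's actual members):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct rule_builder {
    // rules are emitted in first-insertion order
    std::vector<std::pair<std::string, std::string>> rules;
    std::unordered_map<std::string, size_t> index; // name -> slot in rules

    void add_rule(const std::string & name, const std::string & body) {
        auto it = index.find(name);
        if (it == index.end()) {
            index[name] = rules.size();
            rules.emplace_back(name, body);
        } else {
            rules[it->second].second = body; // redefinition keeps its original slot
        }
    }
};

int main() {
    rule_builder b;
    b.add_rule("root",   "object");
    b.add_rule("object", "\"{\" pairs \"}\"");
    b.add_rule("root",   "object ws"); // updated in place, still printed first
    for (const auto & r : b.rules) {
        printf("%s ::= %s\n", r.first.c_str(), r.second.c_str());
    }
    return 0;
}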
......@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ba281b8e6..ead235878 100644
index d93664b8b..800f98b65 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -314,6 +314,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -349,6 +349,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endif()
ggml_add_cpu_backend_variant_impl(${tag_name})
......@@ -19,7 +19,7 @@ index ba281b8e6..ead235878 100644
endfunction()
ggml_add_backend(CPU)
@@ -324,6 +325,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -359,6 +360,7 @@ if (GGML_CPU_ALL_VARIANTS)
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()
......
......@@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ead235878..f9a6587f1 100644
index 800f98b65..6d493a4ff 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -334,10 +334,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -369,10 +369,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
......
......@@ -53,10 +53,10 @@ index 8cc4ef1cf..d950dbdf5 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8064dc197..31f49801c 100644
index 1525283d7..ea450c361 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1768,9 +1768,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
......
......@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 9ec485cfa..4b2f8b7bd 100644
index 3247af8bb..5be08d6f4 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
......@@ -20,7 +20,7 @@ index 9ec485cfa..4b2f8b7bd 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2891,6 +2893,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2922,6 +2924,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
......
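The body of the debug hook is collapsed above; the visible hunks only show where it is wired in (a header include plus a call right after ggml_compute_forward). A hedged sketch of what such a hook can look like, with a hypothetical GGML_DEBUG_TENSORS switch (the real implementation ships separately in ollama):

#include "ggml.h"   // assumes the ggml headers are on the include path
#include <cstdio>
#include <cstdlib>

// dump name, op and shape of a node once it has been computed
static void debug_dump_node(const struct ggml_tensor * node) {
    if (std::getenv("GGML_DEBUG_TENSORS") == nullptr) { // hypothetical opt-in
        return;
    }
    std::fprintf(stderr, "%-24s %-12s [%lld, %lld, %lld, %lld]\n",
            ggml_get_name(node), ggml_op_name(node->op),
            (long long) node->ne[0], (long long) node->ne[1],
            (long long) node->ne[2], (long long) node->ne[3]);
}

// call site, mirroring the hunk above:
//     ggml_compute_forward(&params, node);
//     debug_dump_node(node);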
......@@ -6,14 +6,14 @@ Subject: [PATCH] add ollama vocab for grammar support
---
src/llama-grammar.cpp | 49 ++++++++++++++++++++++++++++++++++++------
src/llama-grammar.h | 14 ++++++++++++
src/llama-sampling.cpp | 4 ++--
3 files changed, 58 insertions(+), 9 deletions(-)
src/llama-sampling.cpp | 6 +++---
3 files changed, 59 insertions(+), 10 deletions(-)
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bed706bb2..b51cee090 100644
index b3c5eb571..a7307c47f 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -907,6 +907,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -915,6 +915,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
......@@ -21,7 +21,7 @@ index bed706bb2..b51cee090 100644
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index) {
@@ -962,6 +963,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -970,6 +971,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
......@@ -29,7 +29,7 @@ index bed706bb2..b51cee090 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -975,6 +977,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -983,6 +985,7 @@ struct llama_grammar * llama_grammar_init_impl(
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
......@@ -37,7 +37,7 @@ index bed706bb2..b51cee090 100644
const char * grammar_str,
const char * grammar_root,
bool lazy,
@@ -1067,6 +1070,7 @@ struct llama_grammar * llama_grammar_init_impl(
@@ -1075,6 +1078,7 @@ struct llama_grammar * llama_grammar_init_impl(
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar {
vocab,
......@@ -45,7 +45,7 @@ index bed706bb2..b51cee090 100644
std::move(vec_rules),
std::move(stacks),
/* .partial_utf8 = */ {},
@@ -1089,6 +1093,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
@@ -1097,6 +1101,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
auto * result = new llama_grammar {
grammar.vocab,
......@@ -53,7 +53,7 @@ index bed706bb2..b51cee090 100644
grammar.rules,
grammar.stacks,
grammar.partial_utf8,
@@ -1116,7 +1121,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
@@ -1124,7 +1129,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
}
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
......@@ -61,7 +61,7 @@ index bed706bb2..b51cee090 100644
if (grammar.awaiting_trigger) {
return;
@@ -1138,9 +1142,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1146,9 +1150,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
......@@ -77,7 +77,7 @@ index bed706bb2..b51cee090 100644
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
@@ -1159,9 +1167,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
@@ -1167,9 +1175,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
}
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
......@@ -90,7 +90,7 @@ index bed706bb2..b51cee090 100644
if (grammar.awaiting_trigger) {
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1201,13 +1210,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
@@ -1209,13 +1218,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
}
}
......@@ -107,7 +107,7 @@ index bed706bb2..b51cee090 100644
}
llama_grammar_accept_str(grammar, piece);
@@ -1227,3 +1237,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
@@ -1235,3 +1245,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
}
}
......@@ -184,10 +184,10 @@ index f8c291de9..2a3a62db3 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 55d2e355f..da34526b1 100644
index 3f4a729bc..38a30ea05 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1561,7 +1561,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
......@@ -196,12 +196,15 @@ index 55d2e355f..da34526b1 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1639,9 +1639,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += ")[\\s\\S]*";
std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
- grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
+ grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
} else {
- grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
+ grammar = llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
}
*ctx = {
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
- /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
+ /* .grammar = */ llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
};
if (!ctx->grammar) {
delete ctx;
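The mechanism of this patch is visible in the hunks above: llama_grammar gains a second, optional vocabulary pointer, every init/clone path threads it through, and apply/accept prefer it when mapping tokens to text pieces or testing end-of-generation. A minimal sketch of that dispatch with illustrative names (not the patch's actual types):

#include <cstdint>
#include <string>

using llama_token = int32_t;

struct vocab_iface {                        // hypothetical stand-in
    virtual std::string token_to_piece(llama_token t) const = 0;
    virtual bool        is_eog(llama_token t) const = 0;
    virtual ~vocab_iface() = default;
};

struct grammar_ctx {
    const vocab_iface * native   = nullptr; // llama.cpp's own vocab
    const vocab_iface * external = nullptr; // ollama-supplied vocab, may be null

    std::string piece(llama_token t) const {
        return external ? external->token_to_piece(t) : native->token_to_piece(t);
    }
    bool is_eog(llama_token t) const {
        return external ? external->is_eog(t) : native->is_eog(t);
    }
};

The nullptr inserted at the llama_grammar_init_impl call sites in llama-sampling.cpp corresponds to this external slot being empty whenever llama.cpp's own samplers construct the grammar.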
......@@ -8,14 +8,14 @@ Subject: [PATCH] add argsort and cuda copy for i32
ggml/src/ggml-cuda/argsort.cu | 122 ++++++++++++++++++++++++---
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
ggml/src/ggml-cuda/cpy.cu | 40 +++++++++
ggml/src/ggml-metal/ggml-metal.metal | 64 ++++++++++++++
5 files changed, 263 insertions(+), 12 deletions(-)
ggml/src/ggml-metal/ggml-metal.metal | 69 +++++++++++++++
5 files changed, 268 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index b52f0f847..902fdad69 100644
index 2745fc54e..40666bab6 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -7889,6 +7889,45 @@ static void ggml_compute_forward_argsort_f32(
@@ -7846,6 +7846,45 @@ static void ggml_compute_forward_argsort_f32(
}
}
......@@ -61,7 +61,7 @@ index b52f0f847..902fdad69 100644
void ggml_compute_forward_argsort(
const ggml_compute_params * params,
ggml_tensor * dst) {
@@ -7900,6 +7939,10 @@ void ggml_compute_forward_argsort(
@@ -7857,6 +7896,10 @@ void ggml_compute_forward_argsort(
{
ggml_compute_forward_argsort_f32(params, dst);
} break;
......@@ -73,7 +73,7 @@ index b52f0f847..902fdad69 100644
{
GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 6e7b90d42..08dd30525 100644
index da9652c3b..b82be371c 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -168,13 +168,107 @@ static void argsort_f32_i32_cuda_bitonic(const float * x,
......@@ -220,11 +220,11 @@ index 6e7b90d42..08dd30525 100644
+ }
}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index e621cb981..597c0c8b3 100644
index 7697c292d..00d773dd3 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}
+
......@@ -234,10 +234,10 @@ index e621cb981..597c0c8b3 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 12d5bf776..a0e34030e 100644
index c4ceb4fc5..0e53ecc39 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -251,6 +251,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
@@ -352,6 +352,43 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
......@@ -281,73 +281,76 @@ index 12d5bf776..a0e34030e 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -332,6 +369,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
@@ -481,6 +518,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_scalar_cuda<half, float>
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+ // TODO consider converting to template
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
if (can_be_transposed) {
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 2c2f01415..50b8071de 100644
index 73b45c762..aed013a9d 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4467,8 +4467,72 @@ kernel void kernel_argsort_f32_i32(
@@ -4721,8 +4721,77 @@ kernel void kernel_argsort_f32_i32(
}
}
+typedef void (i32_argsort_t)(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device const int32_t * src0,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]);
+ ushort3 tpitg[[thread_position_in_threadgroup]],
+ ushort3 ntg[[threads_per_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_i32_i32(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device const int32_t * src0,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
+ ushort3 tpitg[[thread_position_in_threadgroup]],
+ ushort3 ntg[[threads_per_threadgroup]]) {
+ // bitonic sort
+ int col = tpitg[0];
+ int row = tgpig[1];
+ const int col = tpitg[0];
+
+ if (col >= args.ncols_pad) return;
+ const int i00 = (tgpig[0]/args.ne01)*ntg.x;
+ const int i01 = tgpig[0]%args.ne01;
+ const int i02 = tgpig[1];
+ const int i03 = tgpig[2];
+
+ device const int32_t * x_row = x + row * args.ncols;
+ threadgroup int32_t * dst_row = shared_values;
+ device const int32_t * src0_row = (device const int32_t *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03);
+
+ // initialize indices
+ dst_row[col] = col;
+ shmem_i32[col] = i00 + col;
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
+ for (int k = 2; k <= ntg.x; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (dst_row[col] >= args.ncols ||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ if (shmem_i32[col] >= args.ne00 ||
+ (shmem_i32[ixj] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
+ src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]] :
+ src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ SWAP(shmem_i32[col], shmem_i32[ixj]);
+ }
+ } else {
+ if (dst_row[ixj] >= args.ncols ||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ if (shmem_i32[ixj] >= args.ne00 ||
+ (shmem_i32[col] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
+ src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]] :
+ src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ SWAP(shmem_i32[col], shmem_i32[ixj]);
+ }
+ }
+ }
......@@ -356,8 +359,10 @@ index 2c2f01415..50b8071de 100644
+ }
+
+ // copy the result to dst without the padding
+ if (col < args.ncols) {
+ dst[row * args.ncols + col] = dst_row[col];
+ if (i00 + col < args.ne00) {
+ dst += i00 + args.ne00*i01 + args.ne00*args.ne01*i02 + args.ne00*args.ne01*args.ne02*i03;
+
+ dst[col] = shmem_i32[col];
+ }
+}
+
......@@ -366,5 +371,5 @@ index 2c2f01415..50b8071de 100644
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
kernel void kernel_leaky_relu_f32(
constant ggml_metal_kargs_leaky_relu & args,
typedef void (argsort_merge_t)(
constant ggml_metal_kargs_argsort_merge & args,
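The Metal kernel above pads each row to the threadgroup width and treats out-of-range indices as larger than every real element, so padding sinks to the end of the bitonic network. A plain C++ reference of the same padded bitonic argsort, useful for checking the compare logic on the CPU (a sketch, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// pad n up to the next power of two, as the kernels do
static int next_pow2(int n) { int p = 1; while (p < n) p *= 2; return p; }

// ascending argsort of int32 data via the padded bitonic network
static std::vector<int32_t> argsort_i32(const std::vector<int32_t> & x) {
    const int n    = (int) x.size();
    const int npad = next_pow2(n);
    std::vector<int32_t> idx(npad);
    for (int i = 0; i < npad; ++i) idx[i] = i; // indices >= n act as +inf padding

    for (int k = 2; k <= npad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < npad; ++col) {
                const int ixj = col ^ j;
                if (ixj <= col) continue;           // each pair handled once
                const bool up = (col & k) == 0;     // direction of this subsequence
                // out-of-range indices compare as greater than everything
                const bool a_big = idx[col] >= n ||
                    (idx[ixj] < n && x[idx[col]] > x[idx[ixj]]);
                const bool b_big = idx[ixj] >= n ||
                    (idx[col] < n && x[idx[col]] < x[idx[ixj]]);
                if ((up && a_big) || (!up && b_big)) {
                    std::swap(idx[col], idx[ixj]);
                }
            }
        }
    }
    idx.resize(n); // drop the padding, keep real indices only
    return idx;
}

int main() {
    const std::vector<int32_t> x = {5, -1, 42, 7, 0};
    for (int32_t i : argsort_i32(x)) printf("%d ", i);
    printf("\n");
    return 0;
}

Compiled and run, this prints 1 4 0 3 2, the ascending order of {5, -1, 42, 7, 0}.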
......@@ -35,10 +35,10 @@ index f1b740785..c54ff98bf 100644
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index c830c0965..363853873 100644
index 218222ece..06ee502ab 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -486,6 +486,7 @@ struct node_alloc {
@@ -493,6 +493,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
......@@ -46,7 +46,7 @@ index c830c0965..363853873 100644
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -509,6 +510,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
@@ -516,6 +517,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
......@@ -56,7 +56,7 @@ index c830c0965..363853873 100644
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -576,6 +580,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
@@ -583,6 +587,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
......@@ -64,7 +64,7 @@ index c830c0965..363853873 100644
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -891,6 +896,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
@@ -898,6 +903,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
......@@ -73,8 +73,8 @@ index c830c0965..363853873 100644
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -920,14 +927,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
@@ -932,14 +939,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
#endif
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
- if (galloc->buffers[i] == NULL) {
......@@ -96,7 +96,7 @@ index c830c0965..363853873 100644
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -1082,6 +1094,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
@@ -1094,6 +1106,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
......@@ -120,10 +120,10 @@ index c830c0965..363853873 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8ba86f824..cb2b99562 100644
index 4882541c8..ff41c7712 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
@@ -1813,6 +1813,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
......
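The thrust of the ggml-alloc changes above: record the size each buffer allocation attempted, and on failure free everything and return false instead of aborting, so the scheduler can report how much memory would have been needed and the caller can retry with a smaller graph. A standalone sketch of that pattern (names like attempted_size and the query helper are illustrative; the function the patch actually adds is collapsed above):

#include <cstdlib>
#include <vector>

struct galloc_sketch {
    std::vector<void *> buffers;
    std::vector<size_t> attempted_size; // recorded even when allocation fails

    bool reserve(const std::vector<size_t> & sizes) {
        bool ok = true;
        buffers.assign(sizes.size(), nullptr);
        attempted_size.assign(sizes.size(), 0);
        for (size_t i = 0; i < sizes.size(); ++i) {
            attempted_size[i] = sizes[i];
            buffers[i] = std::malloc(sizes[i]);
            if (buffers[i] == nullptr) {
                ok = false; // keep going so every attempted size is recorded
            }
        }
        if (!ok) { // release partial allocations, leave sizes for diagnostics
            for (void *& b : buffers) { std::free(b); b = nullptr; }
        }
        return ok;
    }

    size_t get_attempted_size(size_t i) const { return attempted_size[i]; }
};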