Unverified Commit 943464cc authored by Jeffrey Morgan, committed by GitHub

llama: update to commit 71e90e88 (#10192)

parent 369de832
...
@@ -237,5 +237,5 @@ jobs:
       - uses: actions/checkout@v4
       - name: Verify patches apply cleanly and do not change files
         run: |
-          make -f Makefile.sync clean sync
+          make -f Makefile.sync clean checkout apply-patches sync
           git diff --compact-summary --exit-code
\ No newline at end of file
...
@@ -51,7 +51,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
 set(GGML_CPU ON)
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
 get_target_property(CPU_VARIANTS ggml-cpu MANUALLY_ADDED_DEPENDENCIES)
...
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=d7cfe1ffe0f435d0048a6058d529daf76e072d9c
+FETCH_HEAD=71e90e8813f90097701e62f7fce137d96ddf41e2

 .PHONY: help
 help:
@@ -15,18 +15,18 @@ help:
 	@echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml

 .PHONY: llama/build-info.cpp
 llama/build-info.cpp: llama/build-info.cpp.in
 	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

 .PHONY: llama/llama.cpp
-llama/llama.cpp: llama/vendor/ apply-patches
+llama/llama.cpp: llama/vendor/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

-.PHONY: ml/backend/ggml/ggml apply-patches
-ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
+.PHONY: ml/backend/ggml/ggml
+ml/backend/ggml/ggml: llama/vendor/ggml/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

 PATCHES=$(wildcard llama/patches/*.patch)
...
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "d7cfe1ffe0f435d0048a6058d529daf76e072d9c";
+char const *LLAMA_COMMIT = "71e90e8813f90097701e62f7fce137d96ddf41e2";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
...
@@ -13,6 +13,7 @@ include include/llama-*.*
 include examples/
 include examples/llava/
 include examples/llava/clip.*
+include examples/llava/clip-impl.*
 include examples/llava/llava.*
 include src/
 include src/llama.*
...
@@ -7,10 +7,6 @@
 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
-#include "json-schema-to-grammar.h"
 #include "llama.h"

 #include <algorithm>
@@ -52,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-# if !defined(PATH_MAX)
-#   define PATH_MAX MAX_PATH
-# endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //
@@ -483,6 +443,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
     s = std::move(builder);
 }

+std::string regex_escape(const std::string & s) {
+    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+    return std::regex_replace(s, special_chars, "\\$0");
+}
+
 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
     std::ostringstream result;
     for (size_t i = 0; i < values.size(); ++i) {
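As an illustration (not part of this commit), here is a minimal standalone sketch of what the new regex_escape helper does; the main function and its assertions are hypothetical:

    // Each regex metacharacter is prefixed with a backslash via the "\$0"
    // replacement, so the escaped result matches the original string literally.
    #include <cassert>
    #include <regex>
    #include <string>

    static std::string regex_escape(const std::string & s) {
        static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
        return std::regex_replace(s, special_chars, "\\$0");
    }

    int main() {
        assert(regex_escape("<tool_call>") == "<tool_call>"); // no metacharacters
        assert(regex_escape("a.b*c") == "a\\.b\\*c");         // '.' and '*' escaped
        assert(std::regex_match("a.b*c", std::regex(regex_escape("a.b*c"))));
        return 0;
    }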
@@ -865,7 +830,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
@@ -875,7 +840,9 @@ std::string fs_get_cache_directory() {
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
-#endif // __linux__
+#else
+# error Unknown architecture
+#endif
         cache_directory = ensure_trailing_slash(cache_directory);
         cache_directory += "llama.cpp";
     }
@@ -896,22 +863,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //

 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);

-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return iparams;
     }

@@ -946,13 +905,13 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1029,6 +988,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

+        llama_set_warmup(lctx, true);
+
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);
@@ -1056,9 +1017,10 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
     }

     iparams.model.reset(model);
@@ -1067,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }

+std::string get_model_endpoint() {
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+    std::string model_endpoint = "https://huggingface.co/";
+    if (endpoint_env) {
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/';
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
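As a usage illustration (not part of this commit; setenv/unsetenv are POSIX), the resolution order of get_model_endpoint is: MODEL_ENDPOINT first, then the legacy HF_ENDPOINT, then the Hugging Face default, always with a trailing slash:

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    std::string get_model_endpoint(); // as added in the hunk above

    int main() {
        unsetenv("MODEL_ENDPOINT");
        unsetenv("HF_ENDPOINT");
        printf("%s\n", get_model_endpoint().c_str()); // https://huggingface.co/

        setenv("HF_ENDPOINT", "https://hf-mirror.com", 1);
        printf("%s\n", get_model_endpoint().c_str()); // https://hf-mirror.com/

        // MODEL_ENDPOINT wins over HF_ENDPOINT; a '/' is appended if missing.
        setenv("MODEL_ENDPOINT", "https://example.org/models", 1); // hypothetical mirror
        printf("%s\n", get_model_endpoint().c_str()); // https://example.org/models/
        return 0;
    }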
@@ -1082,15 +1057,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }

     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }

     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1098,6 +1076,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }

+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
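The GGML_ASSERT above implies a calling convention: the overrides are handed over as a raw pointer with no explicit length, so callers must terminate the vector with a sentinel entry whose pattern is nullptr. A hypothetical caller-side sketch (the field order {pattern, buft}, the regex pattern, and the CPU buffer type are illustrative assumptions):

    std::vector<llama_model_tensor_buft_override> overrides;
    overrides.push_back({ "blk\\..*\\.ffn_.*", ggml_backend_cpu_buffer_type() }); // pattern -> buffer type
    overrides.push_back({ nullptr, nullptr });                                    // terminating sentinel
    params.tensor_buft_overrides = overrides;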
@@ -1157,451 +1142,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
     return tpp;
 }

-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        // display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url; // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url; // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //

@@ -2025,4 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
     return result;
 }
-
...
@@ -110,9 +110,17 @@ enum common_conversation_mode {
     COMMON_CONVERSATION_MODE_AUTO = 2,
 };

+enum common_grammar_trigger_type {
+    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
+    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+};
+
 struct common_grammar_trigger {
-    std::string word;
-    bool at_start;
+    common_grammar_trigger_type type;
+    std::string value;
+    llama_token token = LLAMA_TOKEN_NULL;
 };

 // sampling parameters
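As an illustration (not part of this commit), a lazy-grammar trigger under the new scheme might be built like this; the trigger word and the params.sampling field access are assumptions for the example:

    common_grammar_trigger trigger;
    trigger.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
    trigger.value = "<tool_call>"; // replaces the old `word` field
    params.sampling.grammar_triggers.push_back(trigger);
    params.sampling.grammar_lazy = true;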
@@ -163,8 +171,7 @@ struct common_params_sampling {
     std::string grammar; // optional BNF-like grammar to constrain sampling
     bool grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
-    std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
     std::set<llama_token> preserved_tokens;

     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@@ -173,6 +180,13 @@ struct common_params_sampling {
     std::string print() const;
 };

+struct common_params_model {
+    std::string path = ""; // model local path // NOLINT
+    std::string url = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -186,19 +200,13 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-    std::string model = ""; // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };

 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-    std::string model = ""; // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;

     std::string speaker_file = ""; // speaker file path // NOLINT

     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
@@ -254,13 +262,12 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;

-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;

     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT

     std::string prompt = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
@@ -272,6 +279,7 @@ struct common_params {
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -325,13 +333,15 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool single_turn = false; // single turn chat conversation

     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
     std::vector<std::string> image; // path to image file(s)

     // embedding
@@ -391,8 +401,6 @@ struct common_params {
     int32_t i_pos = -1; // position of the passkey in the junk text

     // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
@@ -404,16 +412,16 @@ struct common_params {
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

     bool spm_infill = false; // suffix/prefix/middle pattern for infill

-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
     // batched-bench params
     bool batched_bench_output_jsonl = false;
+
+    // common params
+    std::string out_file; // output filename for all example programs
 };

 // call once at the start of a program if it uses libcommon
@@ -453,6 +461,8 @@ std::string string_repeat(const std::string & str, size_t n);

 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

+std::string regex_escape(const std::string & s);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@@ -530,26 +540,11 @@ struct llama_model_params common_model_params_to_llama(common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

+std::string get_model_endpoint();
+
 //
 // Batch utils
 //
...
@@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     throw std::runtime_error("At least one of min_value or max_value must be set");
 }

-const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";

 struct BuiltinRule {
     std::string content;
@@ -764,11 +764,10 @@ private:
 public:
     SchemaConverter(
           const std::function<json(const std::string &)> & fetch_json,
-          bool dotall,
-          bool compact_spaces)
+          bool dotall)
         : _fetch_json(fetch_json), _dotall(dotall)
     {
-        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
+        _rules["space"] = SPACE_RULE;
     }

     void resolve_refs(json & schema, const std::string & url) {
@@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 }

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
     common_grammar_builder builder {
         /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
             return converter._add_rule(name, rule);
...
@@ -16,7 +16,6 @@ struct common_grammar_builder {

 struct common_grammar_options {
     bool dotall = false;
-    bool compact_spaces = false;
 };

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
...
@@ -4,6 +4,7 @@

 #include <cmath>
 #include <unordered_map>
+#include <algorithm>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -159,17 +160,57 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        std::vector<const char *> trigger_words;
-        trigger_words.reserve(params.grammar_trigger_words.size());
-        for (const auto & str : params.grammar_trigger_words) {
-            trigger_words.push_back(str.word.c_str());
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
         }

         grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                                               trigger_words.data(), trigger_words.size(),
-                                               params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                                                        trigger_tokens.data(), trigger_tokens.size())
              : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+        if (!grmr) {
+            return nullptr;
+        }
     }

     auto * result = new common_sampler {
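As a standalone sketch (not part of this commit), this shows what the rewrite above produces for a single WORD trigger: the word is regex-escaped, OR-ed with any other "anywhere" patterns, and wrapped so the lazy grammar only activates once the output contains it. The trigger word is hypothetical:

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
        std::string escaped = "<tool_call>"; // regex_escape() leaves this word unchanged
        std::string pattern = "^[\\s\\S]*?(" + escaped + ")[\\s\\S]*";

        std::regex re(pattern);
        assert(!std::regex_match(std::string("plain text"), re));                    // grammar stays dormant
        assert( std::regex_match(std::string("intro <tool_call> {\"a\": 1}"), re)); // grammar activates
        return 0;
    }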
...
#include "ggml.h"
#include "gguf.h"
#include "clip.h"
#include "clip.h"
#include <climits>
#include <cstdarg>
#include <string>
#include <map>
#include <sstream>
#include <vector>
#include <memory>
// Internal header for clip.cpp
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
//
// tensor name constants
//
#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // not renamed with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
for (const auto & pair : PROJECTOR_TYPE_NAMES) {
if (pair.second == str) {
return pair.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
//
// logging
//
static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}
struct clip_logger_state {
ggml_log_level verbosity_thold;
ggml_log_callback log_callback;
void * log_callback_user_data;
};
extern struct clip_logger_state g_logger_state;
static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
if (format == NULL) {
return;
}
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
} else {
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0;
g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
free(buffer2);
}
va_end(args_copy);
}
static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
va_list args;
va_start(args, format);
clip_log_internal_v(level, format, args);
va_end(args);
}
#define LOG_TMPL(level, ...) \
do { \
if ((level) >= g_logger_state.verbosity_thold) { \
clip_log_internal((level), __VA_ARGS__); \
} \
} while (0)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
//
// cpp wrappers
//
// wrapper for clip_image_size
struct clip_image_size_deleter {
void operator()(clip_image_size * val) { clip_image_size_free(val); }
};
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
// wrapper for clip_image_u8
struct clip_image_u8_deleter {
void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
};
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
// wrapper for clip_image_f32
struct clip_image_f32_deleter {
void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
};
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
struct clip_image_u8_batch {
std::vector<clip_image_u8_ptr> entries;
};
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
};
//
// common utils
//
static std::string string_format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), buf.size());
}
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
// split string by a `std::string delim` instead of `char delim`
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t pos = 0;
std::string token;
while ((pos = s.find(delimiter)) != std::string::npos) {
token = s.substr(0, pos);
tokens.push_back(token);
s.erase(0, pos + delimiter.length());
}
tokens.push_back(s);
return tokens;
}
//
// gguf utils
//
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return string_format("unknown type %d", type);
}
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
string_replace_all(val, "\\", "\\\\");
string_replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
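// Illustrative use (editorial sketch): dump every metadata key/value pair, the
// same way the model loader logs them; gguf_get_n_kv() and gguf_get_key() come
// from the gguf API included above.
//
//   for (int i = 0; i < (int) gguf_get_n_kv(ctx_gguf); i++) {
//       LOG_INF("%s = %s\n", gguf_get_key(ctx_gguf, i), gguf_kv_to_str(ctx_gguf, i).c_str());
//   }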
//
// API used internally with mtmd
//
projector_type clip_get_projector_type(const struct clip_ctx * ctx);
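// Dispatch sketch (editorial): callers can branch on the projector type to pick
// a graph builder, mirroring what clip_image_build_graph() does further down.
//
//   if (clip_get_projector_type(ctx) == PROJECTOR_TYPE_GEMMA3) {
//       // take the siglip build path
//   }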
...@@ -3,32 +3,14 @@ ...@@ -3,32 +3,14 @@
// I'll gradually clean and extend it // I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h" #include "clip.h"
#include "clip-impl.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "gguf.h" #include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
...@@ -46,18 +28,6 @@ ...@@ -46,18 +28,6 @@
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX #ifndef NOMINMAX
...@@ -71,267 +41,9 @@ ...@@ -71,267 +41,9 @@
#endif #endif
#endif #endif
//#define CLIP_DEBUG_FUNCTIONS struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), buf.size());
}
//
// key constants
//
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
//#define CLIP_DEBUG_FUNCTIONS
//
// tensor name constants
//
#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // tensor not renamed with ".0" postfix for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
};
//
// utilities to get data from a gguf file
//
static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key);
if (i == -1) {
LOG_ERR("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key));
}
return i;
}
static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
const int i = get_key_idx(ctx, key.c_str());
return gguf_get_val_u32(ctx, i);
}
static float get_f32(const gguf_context * ctx, const std::string & key) {
const int i = get_key_idx(ctx, key.c_str());
return gguf_get_val_f32(ctx, i);
}
static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
if (!cur) {
throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
}
return cur;
}
static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype));
}
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
static projector_type clip_projector_type_from_string(const std::string & name) {
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
if (kv.second == name) {
return kv.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
#ifdef CLIP_DEBUG_FUNCTIONS #ifdef CLIP_DEBUG_FUNCTIONS
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
...@@ -446,6 +158,11 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u ...@@ -446,6 +158,11 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u
// clip layers // clip layers
// //
enum patch_merge_type {
PATCH_MERGE_FLAT,
PATCH_MERGE_SPATIAL_UNPAD,
};
struct clip_hparams { struct clip_hparams {
int32_t image_size; int32_t image_size;
int32_t patch_size; int32_t patch_size;
...@@ -455,9 +172,9 @@ struct clip_hparams { ...@@ -455,9 +172,9 @@ struct clip_hparams {
int32_t n_head; int32_t n_head;
int32_t n_layer; int32_t n_layer;
float eps; patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) float eps;
std::vector<int32_t> image_grid_pinpoints; std::vector<int32_t> image_grid_pinpoints;
int32_t image_crop_resolution; int32_t image_crop_resolution;
...@@ -466,44 +183,44 @@ struct clip_hparams { ...@@ -466,44 +183,44 @@ struct clip_hparams {
struct clip_layer { struct clip_layer {
// attention // attention
struct ggml_tensor * k_w; struct ggml_tensor * k_w = nullptr;
struct ggml_tensor * k_b; struct ggml_tensor * k_b = nullptr;
struct ggml_tensor * q_w; struct ggml_tensor * q_w = nullptr;
struct ggml_tensor * q_b; struct ggml_tensor * q_b = nullptr;
struct ggml_tensor * v_w; struct ggml_tensor * v_w = nullptr;
struct ggml_tensor * v_b; struct ggml_tensor * v_b = nullptr;
struct ggml_tensor * o_w; struct ggml_tensor * o_w = nullptr;
struct ggml_tensor * o_b; struct ggml_tensor * o_b = nullptr;
// layernorm 1 // layernorm 1
struct ggml_tensor * ln_1_w; struct ggml_tensor * ln_1_w = nullptr;
struct ggml_tensor * ln_1_b; struct ggml_tensor * ln_1_b = nullptr;
// ff // ff
struct ggml_tensor * ff_i_w; struct ggml_tensor * ff_i_w = nullptr;
struct ggml_tensor * ff_i_b; struct ggml_tensor * ff_i_b = nullptr;
struct ggml_tensor * ff_o_w; struct ggml_tensor * ff_o_w = nullptr;
struct ggml_tensor * ff_o_b; struct ggml_tensor * ff_o_b = nullptr;
// layernorm 2 // layernorm 2
struct ggml_tensor * ln_2_w; struct ggml_tensor * ln_2_w = nullptr;
struct ggml_tensor * ln_2_b; struct ggml_tensor * ln_2_b = nullptr;
}; };
struct clip_vision_model { struct clip_vision_model {
struct clip_hparams hparams; struct clip_hparams hparams;
// embeddings // embeddings
struct ggml_tensor * class_embedding; struct ggml_tensor * class_embedding = nullptr;
struct ggml_tensor * patch_embeddings_0; struct ggml_tensor * patch_embeddings_0 = nullptr;
struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL) struct ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
struct ggml_tensor * patch_bias; struct ggml_tensor * patch_bias = nullptr;
struct ggml_tensor * position_embeddings; struct ggml_tensor * position_embeddings = nullptr;
struct ggml_tensor * pre_ln_w; struct ggml_tensor * pre_ln_w = nullptr;
struct ggml_tensor * pre_ln_b; struct ggml_tensor * pre_ln_b = nullptr;
std::vector<clip_layer> layers; std::vector<clip_layer> layers;
...@@ -513,80 +230,84 @@ struct clip_vision_model { ...@@ -513,80 +230,84 @@ struct clip_vision_model {
struct ggml_tensor * projection; struct ggml_tensor * projection;
// LLaVA projection // LLaVA projection
struct ggml_tensor * mm_0_w = NULL; struct ggml_tensor * mm_0_w = nullptr;
struct ggml_tensor * mm_0_b = NULL; struct ggml_tensor * mm_0_b = nullptr;
struct ggml_tensor * mm_2_w = NULL; struct ggml_tensor * mm_2_w = nullptr;
struct ggml_tensor * mm_2_b = NULL; struct ggml_tensor * mm_2_b = nullptr;
struct ggml_tensor * image_newline = NULL; struct ggml_tensor * image_newline = nullptr;
// Yi type models with mlp+normalization projection // Yi type models with mlp+normalization projection
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 struct ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
struct ggml_tensor * mm_1_b = NULL; struct ggml_tensor * mm_1_b = nullptr;
struct ggml_tensor * mm_3_w = NULL; struct ggml_tensor * mm_3_w = nullptr;
struct ggml_tensor * mm_3_b = NULL; struct ggml_tensor * mm_3_b = nullptr;
struct ggml_tensor * mm_4_w = NULL; struct ggml_tensor * mm_4_w = nullptr;
struct ggml_tensor * mm_4_b = NULL; struct ggml_tensor * mm_4_b = nullptr;
//GLMV-Edge projection //GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w; struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
struct ggml_tensor * mm_model_adapter_conv_b; struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
struct ggml_tensor * boi_w; struct ggml_tensor * boi_w = nullptr;
struct ggml_tensor * eoi_w; struct ggml_tensor * eoi_w = nullptr;
// MobileVLM projection // MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w; struct ggml_tensor * mm_model_mlp_1_w = nullptr;
struct ggml_tensor * mm_model_mlp_1_b; struct ggml_tensor * mm_model_mlp_1_b = nullptr;
struct ggml_tensor * mm_model_mlp_3_w; struct ggml_tensor * mm_model_mlp_3_w = nullptr;
struct ggml_tensor * mm_model_mlp_3_b; struct ggml_tensor * mm_model_mlp_3_b = nullptr;
struct ggml_tensor * mm_model_block_1_block_0_0_w; struct ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
struct ggml_tensor * mm_model_block_1_block_0_1_w; struct ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
struct ggml_tensor * mm_model_block_1_block_0_1_b; struct ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
struct ggml_tensor * mm_model_block_1_block_1_fc1_w; struct ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
struct ggml_tensor * mm_model_block_1_block_1_fc1_b; struct ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
struct ggml_tensor * mm_model_block_1_block_1_fc2_w; struct ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
struct ggml_tensor * mm_model_block_1_block_1_fc2_b; struct ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
struct ggml_tensor * mm_model_block_1_block_2_0_w; struct ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
struct ggml_tensor * mm_model_block_1_block_2_1_w; struct ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
struct ggml_tensor * mm_model_block_1_block_2_1_b; struct ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
struct ggml_tensor * mm_model_block_2_block_0_0_w; struct ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
struct ggml_tensor * mm_model_block_2_block_0_1_w; struct ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
struct ggml_tensor * mm_model_block_2_block_0_1_b; struct ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
struct ggml_tensor * mm_model_block_2_block_1_fc1_w; struct ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
struct ggml_tensor * mm_model_block_2_block_1_fc1_b; struct ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
struct ggml_tensor * mm_model_block_2_block_1_fc2_w; struct ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
struct ggml_tensor * mm_model_block_2_block_1_fc2_b; struct ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
struct ggml_tensor * mm_model_block_2_block_2_0_w; struct ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
struct ggml_tensor * mm_model_block_2_block_2_1_w; struct ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
struct ggml_tensor * mm_model_block_2_block_2_1_b; struct ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
// MobileVLM_V2 projection // MobileVLM_V2 projection
struct ggml_tensor * mm_model_mlp_0_w; struct ggml_tensor * mm_model_mlp_0_w = nullptr;
struct ggml_tensor * mm_model_mlp_0_b; struct ggml_tensor * mm_model_mlp_0_b = nullptr;
struct ggml_tensor * mm_model_mlp_2_w; struct ggml_tensor * mm_model_mlp_2_w = nullptr;
struct ggml_tensor * mm_model_mlp_2_b; struct ggml_tensor * mm_model_mlp_2_b = nullptr;
struct ggml_tensor * mm_model_peg_0_w; struct ggml_tensor * mm_model_peg_0_w = nullptr;
struct ggml_tensor * mm_model_peg_0_b; struct ggml_tensor * mm_model_peg_0_b = nullptr;
// MINICPMV projection // MINICPMV projection
struct ggml_tensor * mm_model_pos_embed_k; struct ggml_tensor * mm_model_pos_embed_k = nullptr;
struct ggml_tensor * mm_model_query; struct ggml_tensor * mm_model_query = nullptr;
struct ggml_tensor * mm_model_proj; struct ggml_tensor * mm_model_proj = nullptr;
struct ggml_tensor * mm_model_kv_proj; struct ggml_tensor * mm_model_kv_proj = nullptr;
struct ggml_tensor * mm_model_attn_q_w; struct ggml_tensor * mm_model_attn_q_w = nullptr;
struct ggml_tensor * mm_model_attn_q_b; struct ggml_tensor * mm_model_attn_q_b = nullptr;
struct ggml_tensor * mm_model_attn_k_w; struct ggml_tensor * mm_model_attn_k_w = nullptr;
struct ggml_tensor * mm_model_attn_k_b; struct ggml_tensor * mm_model_attn_k_b = nullptr;
struct ggml_tensor * mm_model_attn_v_w; struct ggml_tensor * mm_model_attn_v_w = nullptr;
struct ggml_tensor * mm_model_attn_v_b; struct ggml_tensor * mm_model_attn_v_b = nullptr;
struct ggml_tensor * mm_model_attn_o_w; struct ggml_tensor * mm_model_attn_o_w = nullptr;
struct ggml_tensor * mm_model_attn_o_b; struct ggml_tensor * mm_model_attn_o_b = nullptr;
struct ggml_tensor * mm_model_ln_q_w; struct ggml_tensor * mm_model_ln_q_w = nullptr;
struct ggml_tensor * mm_model_ln_q_b; struct ggml_tensor * mm_model_ln_q_b = nullptr;
struct ggml_tensor * mm_model_ln_kv_w; struct ggml_tensor * mm_model_ln_kv_w = nullptr;
struct ggml_tensor * mm_model_ln_kv_b; struct ggml_tensor * mm_model_ln_kv_b = nullptr;
struct ggml_tensor * mm_model_ln_post_w; struct ggml_tensor * mm_model_ln_post_w = nullptr;
struct ggml_tensor * mm_model_ln_post_b; struct ggml_tensor * mm_model_ln_post_b = nullptr;
// gemma3
struct ggml_tensor * mm_input_proj_w = nullptr;
struct ggml_tensor * mm_soft_emb_norm_w = nullptr;
}; };
struct clip_ctx { struct clip_ctx {
...@@ -601,33 +322,211 @@ struct clip_ctx { ...@@ -601,33 +322,211 @@ struct clip_ctx {
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP; projector_type proj_type = PROJECTOR_TYPE_MLP;
int32_t max_feature_layer; int32_t max_feature_layer; // unused in newer models like gemma3
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
bool use_gelu = false; bool use_gelu = false;
bool use_silu = false; bool use_silu = false;
int32_t ftype = 1;
bool has_class_embedding = true; gguf_context_ptr ctx_gguf;
bool has_pre_norm = true; ggml_context_ptr ctx_data;
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
std::vector<uint8_t> buf_compute_meta; std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model std::vector<ggml_backend_t> backend_ptrs;
ggml_backend_buffer_t params_buffer = NULL; std::vector<ggml_backend_buffer_type_t> backend_buft;
ggml_backend_t backend;
ggml_backend_t backend_cpu;
ggml_backend_buffer_ptr buf;
ggml_backend_sched_ptr sched;
ggml_backend_t backend = NULL; clip_image_size load_image_size;
ggml_gallocr_t compute_alloc = NULL;
struct clip_image_size * load_image_size; clip_ctx(clip_context_params & ctx_params) {
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
backend = ctx_params.use_gpu
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
: nullptr;
if (backend) {
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
backend_ptrs.push_back(backend);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
} else {
backend = backend_cpu;
LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
backend_ptrs.push_back(backend_cpu);
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
sched.reset(
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
);
}
~clip_ctx() {
ggml_backend_free(backend);
if (backend != backend_cpu) {
ggml_backend_free(backend_cpu);
}
}
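// Construction sketch (editorial; clip_context_params is declared in clip.h):
//
//   clip_context_params params;
//   params.use_gpu = true; // the ctor falls back to CPU if no GPU backend initializes
//   clip_ctx ctx(params);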
}; };
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1
struct ggml_init_params params = {
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx0_ptr(ggml_init(params));
auto ctx0 = ctx0_ptr.get();
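// ggml_context_ptr releases the context automatically, replacing the manual
// ggml_free(ctx0) call that the legacy graph builder needed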
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// input raw
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
// position embeddings
struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
// loop over layers
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// layernorm1
{
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
}
// self-attention
{
struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
struct ggml_tensor * K =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
struct ggml_tensor * V =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
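// scaled dot-product attention: the 1/sqrt(d_head) factor is folded into
// ggml_soft_max_ext() rather than scaling Q in place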
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
}
// attention output
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
// re-add the layer input, i.e., the residual connection
cur = ggml_add(ctx0, cur, embeddings);
embeddings = cur; // embeddings = residual, cur = hidden_states
// layernorm2
{
cur = ggml_norm(ctx0, cur, eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
}
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
// siglip uses gelu
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
// residual 2
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
if (model.post_ln_w) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
const int batch_size = 1;
const int mm_tokens_per_image = 256; // default value for gemma3
const int tokens_per_side = sqrt(mm_tokens_per_image);
const int patches_per_image = sqrt(num_patches);
const int kernel_size = patches_per_image / tokens_per_side;
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
// doing a pool2d to reduce the number of output tokens to 256
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
// apply norm before projection
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
// apply projection
embeddings = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
embeddings);
}
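// Worked example (editorial, assuming gemma3's usual 896-pixel input with
// 14-pixel patches): num_patches = 64*64 = 4096, patches_per_image = 64,
// tokens_per_side = 16, so kernel_size = 4 and the 4x4 average pool reduces
// the grid to the 256 output tokens.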
// build the graph
ggml_build_forward_expand(gf, embeddings);
return gf;
}
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr; return nullptr;
...@@ -640,30 +539,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -640,30 +539,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector) {
if (load_image_size == nullptr) { LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
load_image_size = clip_image_size_init(); image_size_width = load_image_size.width;
} image_size_height = load_image_size.height;
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width;
image_size_height = load_image_size->height;
if (is_inf) { if (is_inf) {
image_size_width = imgs->data->nx; image_size_width = imgs.entries[0]->nx;
image_size_height = imgs->data->ny; image_size_height = imgs.entries[0]->ny;
} }
} }
else if (ctx->has_qwen2vl_merger) { else if (ctx->has_qwen2vl_merger) {
// use the image's native resolution when the image is available // use the image's native resolution when the image is available
if (is_inf) { if (is_inf) {
// if (imgs->data->nx && imgs->data->ny) { // if (imgs->data->nx && imgs->data->ny) {
image_size_width = imgs->data->nx; image_size_width = imgs.entries[0]->nx;
image_size_height = imgs->data->ny; image_size_height = imgs.entries[0]->ny;
} }
} }
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size; const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size; const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size; const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
...@@ -671,7 +567,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -671,7 +567,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const float eps = hparams.eps; const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs->size; const int batch_size = imgs.entries.size();
if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1); GGML_ASSERT(batch_size == 1);
...@@ -683,7 +579,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -683,7 +579,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx0 = ggml_init(params); ggml_context_ptr ctx0_ptr(ggml_init(params));
auto ctx0 = ctx0_ptr.get();
struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
...@@ -715,7 +613,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -715,7 +613,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
} }
if (ctx->has_patch_bias) { if (model.patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias); inp = ggml_add(ctx0, inp, model.patch_bias);
} }
...@@ -724,7 +622,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -724,7 +622,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->has_llava_projector) { if (ctx->has_llava_projector) {
// concat class_embeddings and patch_embeddings // concat class_embeddings and patch_embeddings
if (ctx->has_class_embedding) { if (model.class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings"); ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings); ggml_set_input(embeddings);
...@@ -761,7 +659,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -761,7 +659,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
// pre-layernorm // pre-layernorm
if (ctx->has_pre_norm) { if (model.pre_ln_w) {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln"); ggml_set_name(embeddings, "pre_ln");
...@@ -803,7 +701,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -803,7 +701,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ctx0, Q, positions, nullptr, ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
} }
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
...@@ -827,7 +724,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -827,7 +724,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_inplace(ctx0, KQ); KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
...@@ -871,7 +768,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -871,7 +768,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
// post-layernorm // post-layernorm
if (ctx->has_post_norm) { if (model.post_ln_w) {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln"); ggml_set_name(embeddings, "post_ln");
...@@ -911,8 +808,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -911,8 +808,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); if (model.mm_2_w) {
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
} }
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
...@@ -1115,7 +1014,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -1115,7 +1014,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
// permute // permute
...@@ -1129,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -1129,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_inplace(ctx0, KQ); KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
...@@ -1171,9 +1069,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -1171,9 +1069,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
} }
} else { } else {
GGML_ABORT("fatel error"); GGML_ABORT("fatal error");
} }
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { }
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
...@@ -1190,572 +1089,539 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ...@@ -1190,572 +1089,539 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// build the graph // build the graph
ggml_build_forward_expand(gf, embeddings); ggml_build_forward_expand(gf, embeddings);
ggml_free(ctx0);
return gf; return gf;
} }
// read and create ggml_context containing the tensors and their data static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
struct ggml_context * meta = NULL; return clip_image_build_graph_siglip(ctx, imgs);
} else {
// TODO: we should have one build_* function per model
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
}
}
struct gguf_init_params params = { struct clip_model_loader {
/*.no_alloc = */ true, ggml_context_ptr ctx_meta;
/*.ctx = */ &meta, gguf_context_ptr ctx_gguf;
};
struct gguf_context * ctx = gguf_init_from_file(fname, params); clip_ctx & ctx_clip;
if (!ctx) { std::string fname;
throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
}
if (verbosity >= 1) {
const int n_tensors = gguf_get_n_tensors(ctx);
const int n_kv = gguf_get_n_kv(ctx);
const int ftype = get_u32(ctx, KEY_FTYPE);
const std::string ftype_str = get_ftype(ftype);
const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
const std::string description = gguf_get_val_str(ctx, idx_desc);
const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
}
LOG_INF("%s: description: %s\n", __func__, description.c_str());
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
LOG_INF("\n");
}
const int n_tensors = gguf_get_n_tensors(ctx);
// kv
const int n_kv = gguf_get_n_kv(ctx);
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
for (int i = 0; i < n_tensors; i++) { size_t model_size = 0; // in bytes
enum ggml_type type = gguf_get_tensor_type(ctx, i);
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
struct ggml_context * meta = nullptr;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &meta,
};
n_type[type]++; ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
if (!ctx_gguf.get()) {
throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
} }
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); ctx_meta.reset(meta);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx, i); const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
replace_all(value, "\n", "\\n");
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); // print gguf info
{
std::string name;
get_string(KEY_NAME, name, false);
std::string description;
get_string(KEY_DESCRIPTION, description, false);
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
LOG_INF("%s: description: %s\n", __func__, description.c_str());
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
LOG_INF("\n");
} }
// print type counts // tensors
for (auto & kv : n_type) { {
if (kv.second == 0) { for (int i = 0; i < n_tensors; ++i) {
continue; const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size;
LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
} }
// data void load_hparams() {
size_t model_size = 0; // projector type
{ {
for (int i = 0; i < n_tensors; ++i) { std::string proj_type;
const char * name = gguf_get_tensor_name(ctx, i); get_string(KEY_PROJ_TYPE, proj_type, false);
const size_t offset = gguf_get_tensor_offset(ctx, i); if (!proj_type.empty()) {
enum ggml_type type = gguf_get_tensor_type(ctx, i); ctx_clip.proj_type = clip_projector_type_from_string(proj_type);
struct ggml_tensor * cur = ggml_get_tensor(meta, name); }
size_t tensor_size = ggml_nbytes(cur); if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) {
model_size += tensor_size; throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
if (verbosity >= 3) {
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
} }
}
clip_ctx * new_clip = new clip_ctx{}; // other hparams
{
get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false);
get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false);
GGML_ASSERT(ctx_clip.has_vision_encoder);
GGML_ASSERT(!ctx_clip.has_text_encoder);
// legacy keys, use KEY_PROJ_TYPE instead
get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false);
get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false);
get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false);
get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false);
get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false);
// !!! do NOT extend the list above, use KEY_PROJ_TYPE instead
get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false);
get_bool(KEY_USE_SILU, ctx_clip.use_silu, false);
auto & hparams = ctx_clip.vision_model.hparams;
get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size);
get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head);
get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate);
get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer);
get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim);
get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps);
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
// update projector type {
{ std::string mm_patch_merge_type;
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE); get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
if (idx != -1) { if (mm_patch_merge_type == "spatial_unpad") {
const std::string proj_type = gguf_get_val_str(ctx, idx); hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
new_clip->proj_type = clip_projector_type_from_string(proj_type); }
} else { }
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { {
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
GGML_ASSERT(idx_std >= 0 && "image_std not found");
const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
for (int i = 0; i < 3; ++i) {
ctx_clip.image_mean[i] = mean_data[i];
ctx_clip.image_std[i] = std_data[i];
}
} }
}
}
ggml_backend_t backend = ggml_backend_init_best(); // Load the vision feature layer indices if they are explicitly provided;
if (backend == nullptr) { // if multiple vision feature layers are present, the values will be concatenated
LOG_ERR("%s: failed to initialize backend\n", __func__); // to form the final visual features.
clip_free(new_clip); // NOTE: gguf conversions should standardize the values of the vision feature layer to
gguf_free(ctx); // be non-negative, since we use -1 to mark values as unset here.
return nullptr; std::vector<int> vision_feature_layer;
} get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend)); // convert std::vector to std::unordered_set
new_clip->backend = backend; for (auto & layer : vision_feature_layer) {
hparams.vision_feature_layer.insert(layer);
}
// Calculate the deepest feature layer based on hparams and projector type
ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip);
// model size and capabilities LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder);
{ LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder);
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector);
new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector);
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
}
}
idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); void load_tensors() {
new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); std::map<std::string, size_t> tensor_offset;
std::vector<ggml_tensor *> tensors_to_load;
idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); // get offsets
if (idx != -1) { for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
} }
idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); // create data context
if (idx != -1) { struct ggml_init_params params = {
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); /*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_clip.ctx_data.reset(ggml_init(params));
if (!ctx_clip.ctx_data) {
throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
} }
idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION); // helper function
if (idx != -1) { auto get_tensor = [&](const std::string & name, bool required = true) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx); struct ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
} if (!cur && required) {
throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
}
if (cur) {
tensors_to_load.push_back(cur);
// add tensors to context
struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
ggml_set_name(data_tensor, cur->name);
cur = data_tensor;
}
return cur;
};
idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ); auto & vision_model = ctx_clip.vision_model;
if (idx != -1) {
new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER); vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
if (idx != -1) {
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder); vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false);
GGML_ASSERT(!new_clip->has_text_encoder); vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false);
idx = get_key_idx(ctx, KEY_USE_GELU); vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false);
try { vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
idx = get_key_idx(ctx, KEY_USE_SILU); vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
new_clip->use_silu = gguf_get_val_bool(ctx, idx); vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
} catch (std::runtime_error & /*e*/) { if (vision_model.patch_embeddings_1 == nullptr) {
new_clip->use_silu = false; ctx_clip.has_qwen2vl_merger = false;
} }
if (verbosity >= 1) { vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
}
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // layers
vision_model.layers.resize(vision_model.hparams.n_layer);
for (int il = 0; il < vision_model.hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
}
switch (ctx_clip.proj_type) {
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
{
// LLaVA projection
vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
// Yi-type llava
vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
// missing in Yi-type llava
vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
// Yi-type llava
vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
if (vision_model.mm_3_w) {
// TODO: this is a hack to support Yi-type llava
ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM;
}
vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
} break;
case PROJECTOR_TYPE_LDP:
{
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
} break;
case PROJECTOR_TYPE_LDPV2:
{
                // MobileVLM_V2 projection
vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
} break;
case PROJECTOR_TYPE_RESAMPLER:
{
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
} break;
case PROJECTOR_TYPE_GLM_EDGE:
{
vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
} break;
case PROJECTOR_TYPE_MERGER:
{
vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_GEMMA3:
{
vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
default:
GGML_ASSERT(false && "unknown projector type");
}
-    // load tensors
+    // load data
    {
        std::vector<uint8_t> read_buf;
struct ggml_init_params params = {
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) {
LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
#ifdef _WIN32
-        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
        if (!wlen) {
-            return NULL;
+            throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
        }
        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
-        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
+        wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen);
        if (!wlen) {
            free(wbuf);
-            return NULL;
+            throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
        }
#if __GLIBCXX__
        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
        std::istream fin(&buffer);
#else // MSVC
        // unused in our current build
        auto fin = std::ifstream(wbuf, std::ios::binary);
#endif
        free(wbuf);
#else
        auto fin = std::ifstream(fname, std::ios::binary);
#endif
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
// add tensors to context
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * t = ggml_get_tensor(meta, name);
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
ggml_set_name(cur, name);
}
// alloc memory and offload data
new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
        if (!fin) {
-            LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
-            clip_free(new_clip);
-            gguf_free(ctx);
-            return nullptr;
+            throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
        }
-            int num_bytes = ggml_nbytes(cur);
-            if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(num_bytes);
-                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+        // alloc memory and offload data
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
+        ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
+        ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        for (auto & t : tensors_to_load) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
+            const size_t offset = tensor_offset[t->name];
+            fin.seekg(offset, std::ios::beg);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
}
size_t num_bytes = ggml_nbytes(cur);
if (ggml_backend_buft_is_host(buft)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
} else {
// read into a temporary buffer first, then copy to device memory
read_buf.resize(num_bytes);
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
            }
        }
#if defined(_WIN32) && defined(__GLIBCXX__)
        close(fd);
#else
        fin.close();
#endif
+        LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
+    }
    }
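
The host/device branch above follows the usual ggml weight-loading pattern: when the buffer type is host-visible the bytes are streamed straight into the tensor, otherwise they are staged through a temporary host buffer and uploaded with ggml_backend_tensor_set. A minimal sketch of that pattern; the helper name is illustrative and not part of clip.cpp:

    // sketch only; load_one_weight is a hypothetical name
    static void load_one_weight(std::istream & fin, ggml_tensor * cur,
                                ggml_backend_buffer_type_t buft, std::vector<uint8_t> & staging) {
        const size_t n = ggml_nbytes(cur);
        if (ggml_backend_buft_is_host(buft)) {
            fin.read(reinterpret_cast<char *>(cur->data), n);      // tensor memory is CPU-visible
        } else {
            staging.resize(n);                                     // bounce through host memory
            fin.read(reinterpret_cast<char *>(staging.data()), n);
            ggml_backend_tensor_set(cur, staging.data(), 0, n);    // upload to the device buffer
        }
    }
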
-    // vision model
-    if (new_clip->has_vision_encoder) {
-        // load vision model
-        auto & vision_model = new_clip->vision_model;
-        auto & hparams = vision_model.hparams;
-        hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
-        hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
-        hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
-        hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
-        hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
-        hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
-        hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
-        hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
-
-        try {
-            int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
-            int n = gguf_get_arr_n(ctx, idx);
-            const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-            for (int i = 0; i < n; ++i) {
-                hparams.image_grid_pinpoints.push_back(pinpoints[i]);
-            }
-        } catch (std::runtime_error & /*e*/) { }
-
-        // Load the vision feature layer indices if they are explicitly provided;
-        // if multiple vision feature layers are present, the values will be concatenated
-        // to form the final visual features.
-        // NOTE: gguf conversions should standardize the values of the vision feature layer to
-        // be non-negative, since we use -1 to mark values as unset here.
-        try {
-            int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
-            int n = gguf_get_arr_n(ctx, idx);
-            const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
+    void alloc_compute_meta() {
+        ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
+
+        // create a fake batch
+        clip_image_f32_batch batch;
+        clip_image_f32_ptr img(clip_image_f32_init());
+        clip_image_size image_size;
+        image_size.width = clip_get_image_size(&ctx_clip);
+        image_size.height = clip_get_image_size(&ctx_clip);
+        int n_patches = clip_get_image_size(&ctx_clip) / image_size.width;
+        img->nx = n_patches;
+        img->ny = n_patches;
+        img->buf.resize(n_patches * image_size.width * image_size.height * 3);
+        batch.entries.push_back(std::move(img));
+
+        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
+        ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
+        for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = ctx_clip.backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
+            size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
+            if (size > 1) {
+                LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
+        }
+    }
+
+    void get_bool(const std::string & key, bool & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
+        }
+        output = gguf_get_val_bool(ctx_gguf.get(), i);
+    }
+
+    void get_i32(const std::string & key, int & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
return;
}
output = gguf_get_val_i32(ctx_gguf.get(), i);
}
-            for (int i = 0; i < n; ++i) {
-                hparams.vision_feature_layer.insert(vision_feature_layer[i]);
-            }
-        } catch (std::runtime_error & /*e*/) { }
+    void get_u32(const std::string & key, int & output, bool required = true) {
+        const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (i < 0) {
+            if (required) throw std::runtime_error("Key not found: " + key);
+            return;
try {
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
} catch (std::runtime_error & /*e*/) {
strcpy(hparams.mm_patch_merge_type, "flat");
}
try {
hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
} catch(const std::exception& /*e*/) {
hparams.image_crop_resolution = hparams.image_size;
}
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std);
for (int i = 0; i < 3; ++i) {
new_clip->image_mean[i] = mean_data[i];
new_clip->image_std[i] = std_data[i];
}
// Calculate the deepest feature layer based on hparams and projector type
new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
if (verbosity >= 2) {
LOG_INF("\n%s: vision model hparams\n", __func__);
LOG_INF("image_size %d\n", hparams.image_size);
LOG_INF("patch_size %d\n", hparams.patch_size);
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
LOG_INF("v_n_head %d\n", hparams.n_head);
LOG_INF("v_n_layer %d\n", hparams.n_layer);
LOG_INF("v_eps %f\n", hparams.eps);
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
LOG_INF("v_image_grid_pinpoints: ");
for (const auto & pp : hparams.image_grid_pinpoints) {
LOG_INF("%d ", pp);
}
LOG_INF("\n");
LOG_INF("v_vision_feature_layer: ");
for (const auto & feature_layer: hparams.vision_feature_layer) {
LOG_INF("%d ", feature_layer);
}
LOG_INF("\n");
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
}
try {
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
new_clip->has_class_embedding = true;
} catch (const std::exception& /*e*/) {
new_clip->has_class_embedding = false;
}
try {
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
new_clip->has_pre_norm = true;
} catch (std::exception & /*e*/) {
new_clip->has_pre_norm = false;
}
try {
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
new_clip->has_post_norm = true;
} catch (std::exception & /*e*/) {
new_clip->has_post_norm = false;
}
try {
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
new_clip->has_patch_bias = true;
} catch (std::exception & /*e*/) {
new_clip->has_patch_bias = false;
}
try {
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) {
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
}
try {
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
} catch(const std::exception& /*e*/) {
new_clip->has_qwen2vl_merger = false;
}
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
try {
// Yi-type llava
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
} catch (std::runtime_error & /*e*/) { }
try {
// missing in Yi-type llava
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
} catch (std::runtime_error & /*e*/) { }
try {
// Yi-type llava
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
} catch (std::runtime_error & /*e*/) { }
try {
// Yi-type llava
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
} catch (std::runtime_error & /*e*/) { }
try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
{
// MobilVLM_V2 projection
vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
}
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
        }
-        else {
-            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
-            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+        output = gguf_get_val_u32(ctx_gguf.get(), i);
+    }
+
+    void get_f32(const std::string & key, float & output, bool required = true) {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
return;
}
output = gguf_get_val_f32(ctx_gguf.get(), i);
}
void get_string(const std::string & key, std::string & output, bool required = true) {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
return;
        }
output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
}
-        vision_model.layers.resize(hparams.n_layer);
+    void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) {
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
if (i < 0) {
if (required) throw std::runtime_error("Key not found: " + key);
return;
}
int n = gguf_get_arr_n(ctx_gguf.get(), i);
output.resize(n);
const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
for (int i = 0; i < n; ++i) {
output[i] = values[i];
}
}
};
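
Each getter above implements the same contract: a required key throws, failing the whole load through the single catch in clip_init, while an optional key returns without touching the caller's default. A sketch of typical call sites, with key names that are illustrative rather than taken from this diff:

    int n_layer = 0;
    bool use_silu = false;                                    // stays false if the key is absent
    get_u32("clip.vision.block_count", n_layer);              // required: throws if missing
    get_bool("clip.use_silu", use_silu, /*required=*/false);  // optional
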
-        for (int il = 0; il < hparams.n_layer; ++il) {
-            auto & layer = vision_model.layers[il];
-            layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
-            layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
-            layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
-            layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
-            layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
-            layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
-            layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
-            layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
-            layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
-            layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
-            layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
-            layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
-            layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
-            layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
-            layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
-            layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
-        }
-    }
-
-    ggml_free(meta);
-
-    new_clip->ctx_gguf = ctx;
-
-    // measure mem requirement and allocate
-    {
-        new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
-        clip_image_f32_batch batch;
-        batch.size = 1;
-        batch.data = nullptr;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
-        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
-        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
-    }
-
-    return new_clip;
-}
+// read and create ggml_context containing the tensors and their data
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
+    return clip_init(fname, clip_context_params{
+        /* use_gpu */ true,
+        /* verbosity */ static_cast<ggml_log_level>(verbosity),
+    });
+}
+
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+    g_logger_state.verbosity_thold = ctx_params.verbosity;
+    clip_ctx * ctx_clip = new clip_ctx(ctx_params);
+
+    try {
+        clip_model_loader loader(fname, *ctx_clip);
+        loader.load_hparams();
+        loader.load_tensors();
+        loader.alloc_compute_meta();
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
+        delete ctx_clip;
+        return nullptr;
+    }
+
+    return ctx_clip;
+}
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = load_image_size;
+    ctx_clip->load_image_size = *load_image_size; // copy
}

struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return ctx_clip->load_image_size;
+    return &ctx_clip->load_image_size;
}
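
Note the ownership change here: the load size is now stored by value, so the pointer returned by clip_get_load_image_size aliases the context instead of caller-owned memory. A caller that needs the size to outlive the context should copy it out, e.g.:

    clip_image_size sz = *clip_get_load_image_size(ctx_clip); // copy; does not dangle after clip_free
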
struct clip_image_size * clip_image_size_init() {
@@ -1773,22 +1639,56 @@ struct clip_image_f32 * clip_image_f32_init() {
    return new clip_image_f32();
}
-void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
-void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
-    if (batch->size > 0) {
-        delete[] batch->data;
-        batch->size = 0;
+struct clip_image_f32_batch * clip_image_f32_batch_init() {
+    return new clip_image_f32_batch();
+}
+
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
+    if (nx) *nx = img->nx;
+    if (ny) *ny = img->ny;
+    return img->buf.data();
+}
void clip_image_size_free(struct clip_image_size * load_image_size) {
if (load_image_size == nullptr) {
return;
}
delete load_image_size;
}
void clip_image_u8_free(struct clip_image_u8 * img) { if (img) delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
return batch->entries.size();
}
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
if (idx < 0 || idx >= (int)batch->entries.size()) {
LOG_ERR("%s: invalid index %d\n", __func__, idx);
return 0;
}
return batch->entries[idx]->nx;
}
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
if (idx < 0 || idx >= (int)batch->entries.size()) {
LOG_ERR("%s: invalid index %d\n", __func__, idx);
return 0;
    }
+    return batch->entries[idx]->ny;
}

-void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
-    if (batch->size > 0) {
-        delete[] batch->data;
-        batch->size = 0;
+clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
+        LOG_ERR("%s: invalid index %d\n", __func__, idx);
+        return nullptr;
    }
+    return batch->entries[idx].get();
}
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
    img->nx = nx;
    img->ny = ny;
    img->buf.resize(3 * nx * ny);
@@ -1859,14 +1759,15 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
}

// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
-static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
-    dst->nx = src->nx;
-    dst->ny = src->ny;
-    dst->buf.resize(src->buf.size());
-    for (size_t i = 0; i < src->buf.size(); ++i) {
+static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(src.buf.size());
+    // TODO @ngxson : seems like this could be done more efficiently on cgraph
+    for (size_t i = 0; i < src.buf.size(); ++i) {
        int c = i % 3; // rgb
-        dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
+        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
    }
}
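
The normalization maps each byte to (v/255 - mean_c) / std_c per channel. A worked example, assuming the OpenAI CLIP channel statistics commonly shipped with these models (mean 0.48145466, std 0.26862954 are illustrative here; the real values are read from the GGUF):

    float v = 255.0f;                                       // pure white pixel, one channel
    float out = (v / 255.0f - 0.48145466f) / 0.26862954f;   // ≈ 1.93
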
@@ -1874,7 +1775,7 @@ inline int clip(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
}
-static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
+static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
    const int nx = img.nx;
    const int ny = img.ny;
@@ -2012,13 +1913,13 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
    return best_fit;
}
-static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
-    std::vector<clip_image_u8*> patches;
+static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
+    std::vector<clip_image_u8_ptr> patches;
    int width = image.nx;
    int height = image.ny;
    for (int i = 0; i < height; i += patch_size) {
        for (int j = 0; j < width; j += patch_size) {
-            clip_image_u8 *patch = clip_image_u8_init();
+            clip_image_u8_ptr patch(clip_image_u8_init());
            patch->nx = std::min(patch_size, width - j);
            patch->ny = std::min(patch_size, height - i);
            patch->buf.resize(3 * patch->nx * patch->ny);
@@ -2029,7 +1930,7 @@ static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 & im
                }
            }
            }
-            patches.push_back(patch);
+            patches.push_back(std::move(patch));
        }
    }
    return patches;
@@ -2110,7 +2011,7 @@ static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int mul
// -> https://arxiv.org/pdf/2403.11703
// -> https://github.com/thunlp/LLaVA-UHD
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
-static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
    const std::pair<int, int> original_size={img->nx,img->ny};
    const int original_width = img->nx;
    const int original_height = img->ny;
@@ -2118,33 +2019,33 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_imag
    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
    const int multiple = fmin(ceil(ratio), max_slice_nums);

-    std::vector<std::vector<clip_image_u8 *>> images;
-    LOG_INF("%s: multiple %d\n", __func__, multiple);
-    images.push_back(std::vector<clip_image_u8 *>());
+    std::vector<std::vector<clip_image_u8_ptr>> images;
+    LOG_DBG("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8_ptr>());
    if (multiple <= 1) {
        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
-        clip_image_u8 * source_image = clip_image_u8_init();
+        clip_image_u8_ptr source_image(clip_image_u8_init());
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
-        images[images.size()-1].push_back(source_image);
+        images.back().push_back(std::move(source_image));
    }
    else if (multiple > 1) {
        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
-        clip_image_u8 * source_image = clip_image_u8_init();
+        clip_image_u8_ptr source_image(clip_image_u8_init());
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
-        images[images.size()-1].push_back(source_image);
+        LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images.back().push_back(std::move(source_image));
        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+        LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
-        clip_image_u8 * refine_image = clip_image_u8_init();
+        clip_image_u8_ptr refine_image(clip_image_u8_init());
        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
-        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+        LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);

        // split_to_patches
        int width = refine_image->nx;
@@ -2152,9 +2053,9 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_imag
        int grid_x = int(width / best_grid.first);
        int grid_y = int(height / best_grid.second);
        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
-            images.push_back(std::vector<clip_image_u8 *>());
+            images.push_back(std::vector<clip_image_u8_ptr>());
            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
-                clip_image_u8 * patch = clip_image_u8_init();
+                clip_image_u8_ptr patch(clip_image_u8_init());
                patch->nx = grid_x;
                patch->ny = grid_y;
                patch->buf.resize(3 * patch->nx * patch->ny);
@@ -2167,10 +2068,9 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_imag
                        patch->buf[j+2] = refine_image->buf[i+2];
                    }
                }
-                images[images.size()-1].push_back(patch);
+                images.back().push_back(std::move(patch));
            }
        }
-        clip_image_u8_free(refine_image);
    }
    return images;
}
@@ -2178,8 +2078,8 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_imag
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
    const int max_slice_nums=9;
    const int scale_resolution=448;
-    const int original_width = ctx_clip->load_image_size->width;
-    const int original_height = ctx_clip->load_image_size->height;
+    const int original_width = ctx_clip->load_image_size.width;
+    const int original_height = ctx_clip->load_image_size.height;
    const float log_ratio = log(1.0*original_width/original_height);
    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
    const int multiple = fmin(ceil(ratio), max_slice_nums);
@@ -2189,88 +2089,64 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
-    if(clip_is_minicpmv(ctx)){
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
+    if (clip_is_minicpmv(ctx)) {
        int max_slice_nums = 9;
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
-        res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i){
-            res_imgs->size += imgs[i].size();
-        }
-        res_imgs->data = new clip_image_f32[res_imgs->size];
-        int idx = 0;
+        std::vector<std::vector<clip_image_u8_ptr>> imgs = uhd_slice_image(img, max_slice_nums);
        for (size_t i = 0; i < imgs.size(); ++i) {
            for (size_t j = 0; j < imgs[i].size(); ++j) {
                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
-                clip_image_f32 * res = clip_image_f32_init();
-                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
-                res_imgs->data[idx++] = *res;
-                clip_image_f32_free(res);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std);
+                res_imgs->entries.push_back(std::move(res));
            }
        }
-        for (size_t i = 0; i < imgs.size(); ++i) {
-            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                if (imgs[i][j] != nullptr) {
-                    clip_image_u8_free(imgs[i][j]);
-                }
-            }
-        }
        return true;
    }
    else if (ctx->has_qwen2vl_merger) {
-        clip_image_u8 * resized = clip_image_u8_init();
-        auto patch_size = clip_patch_size(ctx) * 2;
+        clip_image_u8 resized;
+        auto patch_size = clip_get_patch_size(ctx) * 2;
        int nx = ceil((float)img->nx / patch_size) * patch_size;
        int ny = ceil((float)img->ny / patch_size) * patch_size;
-        bicubic_resize(*img, *resized, nx, ny);
+        bicubic_resize(*img, resized, nx, ny);

-        res_imgs->data = new clip_image_f32[1];
-        // clip_image_f32 * res = clip_image_f32_init();
-        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
+        // clip_image_f32_ptr res(clip_image_f32_init());
+        normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
        // res_imgs->data[0] = *res;
-        res_imgs->size = 1;
-        // clip_image_f32_free(res);
-        clip_image_u8_free(resized);
+        res_imgs->entries.push_back(std::move(img_f32));
        return true;
    }
-    if (ctx->has_glm_projector) {
-        res_imgs->size = 1;
-        res_imgs->data = new clip_image_f32[res_imgs->size];
+    if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
        clip_image_u8 resized_image;
        int32_t sz=ctx->vision_model.hparams.image_size;
        bicubic_resize(*img, resized_image,sz,sz);
-        clip_image_f32 * res = clip_image_f32_init();
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
        //clip_image_save_to_bmp(resized_image, "resized.bmp");
-        normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
-        res_imgs->data[0] = *res;
-        clip_image_f32_free(res);
+        normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
+        res_imgs->entries.push_back(std::move(img_f32));
        return true;
    }
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
        return false;
    }

    auto & params = ctx->vision_model.hparams;
    // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
-    if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
+    if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
        pad_to_square = false;
    }
    // free the previous res_imgs if any set
-    if (res_imgs->size > 0) {
-        clip_image_f32_batch_free(res_imgs);
-    }
-    res_imgs->data = nullptr;
-    res_imgs->size = 0;
+    res_imgs->entries.clear();

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

-    clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
+    clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
    if (pad_to_square && img->nx != img->ny) {
        int longer_side = std::max(img->nx, img->ny);
        temp->nx = longer_side;
@@ -2313,28 +2189,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        //     clip_image_u8_free(temp2);
        // }

-        std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
+        std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)

-        clip_image_u8 *image_original_resize = clip_image_u8_init();
+        clip_image_u8_ptr image_original_resize(clip_image_u8_init());
        // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
        bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
-        patches.insert(patches.begin(), image_original_resize);
-        // clip_image_f32_batch_init(patches.size());
-        res_imgs->size = patches.size();
-        res_imgs->data = new clip_image_f32[res_imgs->size];
-        int num=0;
-        for (auto& patch : patches) {
-            normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
-            num++;
-        }
-        for (size_t i = 0; i < patches.size(); i++) {
-            // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
-            clip_image_u8_free(patches[i]);
-        }
-        clip_image_u8_free(temp);
+        patches.insert(patches.begin(), std::move(image_original_resize));
+        for (auto & patch : patches) {
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std);
+            res_imgs->entries.push_back(std::move(res));
+        }

        return true;
    } else {
        temp->nx = img->nx;
@@ -2350,7 +2216,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    const int nx2 = ctx->vision_model.hparams.image_size;
    const int ny2 = ctx->vision_model.hparams.image_size;
-    clip_image_f32 * res = clip_image_f32_init();
+    clip_image_f32_ptr res(clip_image_f32_init());
    res->nx = nx2;
    res->ny = ny2;
    res->buf.resize(3 * nx2 * ny2);
@@ -2402,7 +2268,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }
        }
    }
-    clip_image_u8_free(temp);
    // {
    //     clip_image_u8 * temp2 = clip_image_u8_init();
@@ -2412,10 +2277,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    // }
    // res_imgs.push_back(res);

-    res_imgs->size = 1;
-    res_imgs->data = new clip_image_f32[res_imgs->size];
-    res_imgs->data[0] = *res;
-    clip_image_f32_free(res);
+    res_imgs->entries.push_back(std::move(res));
    return true;
}
@@ -2425,12 +2287,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
}

void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    gguf_free(ctx->ctx_gguf);
-
-    ggml_backend_buffer_free(ctx->params_buffer);
-    ggml_backend_free(ctx->backend);
-    ggml_gallocr_free(ctx->compute_alloc);
+    if (ctx == nullptr) {
+        return;
+    }
    delete ctx;
}
@@ -2446,20 +2305,20 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w
    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}

-int32_t clip_image_size(const struct clip_ctx * ctx) {
+int32_t clip_get_image_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.image_size;
}

-int32_t clip_patch_size(const struct clip_ctx * ctx) {
+int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.patch_size;
}

-int32_t clip_hidden_size(const struct clip_ctx * ctx) {
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.hidden_size;
}

const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
-    return ctx->vision_model.hparams.mm_patch_merge_type;
+    return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
}

const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
...@@ -2502,6 +2361,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i ...@@ -2502,6 +2361,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch; n_patches = x_patch * y_patch;
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
n_patches = 256;
} }
return n_patches; return n_patches;
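For reference, the hard-coded 256 is consistent with Gemma 3's vision settings, assuming an 896 px input, 14 px patches, and 4x4 token pooling in the projector:

    // assumption-based arithmetic, not taken from this diff:
    // 896 / 14 = 64 patches per side -> 64 * 64 = 4096 patches
    // pooled 4x4 in the projector    -> 4096 / 16 = 256 tokens per image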
...@@ -2595,23 +2456,27 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co ...@@ -2595,23 +2456,27 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n"); LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false; return false;
} }
clip_image_f32_batch imgs{}; clip_image_f32_batch imgs;
imgs.size = 1; clip_image_f32_ptr img_copy(clip_image_f32_init());
imgs.data = img; *img_copy = *img;
imgs.entries.push_back(std::move(img_copy));
return clip_image_batch_encode(ctx, n_threads, &imgs, vec); return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
} }
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_ERR("This gguf file seems to have no vision encoder\n"); LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
return false; return false;
} }
int batch_size = imgs->size; int batch_size = imgs.entries.size();
if (ctx->has_llava_projector) { if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1); // TODO: support multiple images GGML_ASSERT(batch_size == 1); // TODO: support multiple images
} }
...@@ -2626,8 +2491,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2626,8 +2491,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
// build the inference graph // build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
// set inputs // set inputs
const auto & model = ctx->vision_model; const auto & model = ctx->vision_model;
...@@ -2637,25 +2503,22 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2637,25 +2503,22 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
image_size_width = imgs->data[0].nx; image_size_width = imgs.entries[0]->nx;
image_size_height = imgs->data[0].ny; image_size_height = imgs.entries[0]->ny;
} }
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){ const int pos_w = ctx->load_image_size.width / patch_size;
ctx->load_image_size= clip_image_size_init(); const int pos_h = ctx->load_image_size.height / patch_size;
}
const int pos_w = ctx->load_image_size->width/patch_size;
const int pos_h = ctx->load_image_size->height/patch_size;
{ {
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
float * data = (float *)malloc(ggml_nbytes(inp_raw)); float * data = (float *)malloc(ggml_nbytes(inp_raw));
for (size_t i = 0; i < imgs->size; i++) { for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs->data[i].nx; const int nx = imgs.entries[i]->nx;
const int ny = imgs->data[i].ny; const int ny = imgs.entries[i]->ny;
if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
GGML_ASSERT(nx == image_size && ny == image_size); GGML_ASSERT(nx == image_size && ny == image_size);
} }
...@@ -2666,7 +2529,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2666,7 +2529,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) { for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) { for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
} }
} }
} }
...@@ -2727,16 +2590,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2727,16 +2590,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
free(pos_embed_data); free(pos_embed_data);
} }
} }
else{ else {
{ if (model.class_embedding) {
if (ctx->has_class_embedding) { struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings)); void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings)); memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem); free(zero_mem);
}
} }
if (ctx->has_qwen2vl_merger) { if (ctx->has_qwen2vl_merger) {
...@@ -2766,6 +2627,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2766,6 +2627,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data); free(positions_data);
} }
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
// do nothing
}
else { else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
...@@ -2781,7 +2645,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2781,7 +2645,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// The patches vector is used to get rows to index into the embeds with; // The patches vector is used to get rows to index into the embeds with;
// we should skip dim 0 only if we have CLS to avoid going out of bounds // we should skip dim 0 only if we have CLS to avoid going out of bounds
// when retrieving the rows. // when retrieving the rows.
int patch_offset = ctx->has_class_embedding ? 1 : 0; int patch_offset = model.class_embedding ? 1 : 0;
int* patches_data = (int*)malloc(ggml_nbytes(patches)); int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) { for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + patch_offset; patches_data[i] = i + patch_offset;
...@@ -2792,11 +2656,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ...@@ -2792,11 +2656,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
} }
if (ggml_backend_is_cpu(ctx->backend)) { ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
}
ggml_backend_graph_compute(ctx->backend, gf); auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
if (status != GGML_STATUS_SUCCESS) {
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
return false;
}
// the last node is the embedding tensor // the last node is the embedding tensor
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
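The scheduler calls above follow the standard ggml pattern: reset, build, allocate, set inputs, compute, check status. An outline, with a schematic graph builder standing in for clip_image_build_graph:

    ggml_backend_sched_reset(sched);              // drop the previous graph's allocation
    ggml_cgraph * gf = build_graph();             // schematic builder
    ggml_backend_sched_alloc_graph(sched, gf);    // allocate tensors across the scheduled backends
    // ... ggml_backend_tensor_set(...) on the graph inputs ...
    enum ggml_status st = ggml_backend_sched_graph_compute(sched, gf);
    if (st != GGML_STATUS_SUCCESS) {
        // handle failure
    }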
...@@ -2818,10 +2684,13 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i ...@@ -2818,10 +2684,13 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
assert(itype < GGML_TYPE_COUNT); assert(itype < GGML_TYPE_COUNT);
ggml_type type = static_cast<ggml_type>(itype); ggml_type type = static_cast<ggml_type>(itype);
auto * ctx_clip = clip_model_load(fname_inp, 2); auto * ctx_clip = clip_init(fname_inp, clip_context_params{
/* use_gpu */ false,
/* verbosity */ GGML_LOG_LEVEL_ERROR,
});
const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_src = ctx_clip->ctx_gguf.get();
const auto & ctx_data = ctx_clip->ctx_data; const auto & ctx_data = ctx_clip->ctx_data.get();
auto * ctx_out = gguf_init_empty(); auto * ctx_out = gguf_init_empty();
gguf_set_kv(ctx_out, ctx_src); gguf_set_kv(ctx_out, ctx_src);
...@@ -2895,7 +2764,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i ...@@ -2895,7 +2764,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
f32_data = (float *)conv_buf.data(); f32_data = (float *)conv_buf.data();
break; break;
default: default:
LOG_ERR("Please use an input file in f32 or f16\n"); LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
gguf_free(ctx_out); gguf_free(ctx_out);
return false; return false;
} }
...@@ -2976,9 +2845,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { ...@@ -2976,9 +2845,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0]; return ctx->vision_model.mm_1_b->ne[0];
} }
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
return ctx->vision_model.mm_input_proj_w->ne[0];
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
} }
int clip_is_minicpmv(const struct clip_ctx * ctx) { int clip_is_minicpmv(const struct clip_ctx * ctx) {
...@@ -2991,10 +2863,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { ...@@ -2991,10 +2863,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
bool clip_is_glm(const struct clip_ctx * ctx) { bool clip_is_glm(const struct clip_ctx * ctx) {
return ctx->has_glm_projector; return ctx->has_glm_projector;
} }
bool clip_is_qwen2vl(const struct clip_ctx * ctx) { bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger; return ctx->has_qwen2vl_merger;
} }
bool clip_is_llava(const struct clip_ctx * ctx) {
return ctx->has_llava_projector;
}
bool clip_is_gemma3(const struct clip_ctx * ctx) {
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
}
// Determine the number of encoder layers to iterate over // Determine the number of encoder layers to iterate over
int get_deepest_feature_layer(const struct clip_ctx * ctx) { int get_deepest_feature_layer(const struct clip_ctx * ctx) {
// Get the index of the second to last layer; this is the // Get the index of the second to last layer; this is the
...@@ -3030,3 +2911,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, ...@@ -3030,3 +2911,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
clip_image_encode(ctx, n_threads, &clip_img, vec); clip_image_encode(ctx, n_threads, &clip_img, vec);
return true; return true;
} }
//
// API used internally with mtmd
//
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
return ctx->proj_type;
}
#ifndef CLIP_H #ifndef CLIP_H
#define CLIP_H #define CLIP_H
#include "ggml.h"
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
...@@ -29,27 +30,28 @@ struct clip_image_size { ...@@ -29,27 +30,28 @@ struct clip_image_size {
int height; int height;
}; };
struct clip_image_u8_batch { struct clip_image_f32;
struct clip_image_u8 * data; struct clip_image_u8_batch;
size_t size; struct clip_image_f32_batch;
};
struct clip_image_f32_batch { struct clip_context_params {
struct clip_image_f32 * data; bool use_gpu;
size_t size; enum ggml_log_level verbosity;
}; };
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); // deprecated, use clip_init
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
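A minimal sketch of the new entry point, mirroring the clip_model_quantize call site above; the model path is a placeholder and error handling is elided:

    struct clip_ctx * ctx = clip_init("mmproj-model.gguf", clip_context_params{
        /* use_gpu   */ true,
        /* verbosity */ GGML_LOG_LEVEL_INFO,
    });
    if (ctx == nullptr) {
        // load failed
    }
    // ... use ctx ...
    clip_free(ctx); // clip_free is now safe to call on nullptr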
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string // TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
...@@ -65,16 +67,30 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); ...@@ -65,16 +67,30 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
// nx, ny are the output image dimensions
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */ // use for accessing underlying data of clip_image_f32_batch
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
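Since the batch type is now opaque, callers walk it through these accessors; a sketch, with `batch` assumed to come from clip_image_preprocess:

    const size_t n_imgs = clip_image_f32_batch_n_images(batch);
    for (size_t i = 0; i < n_imgs; i++) {
        struct clip_image_f32 * img = clip_image_f32_get_img(batch, i);
        const size_t nx = clip_image_f32_batch_nx(batch, i);
        const size_t ny = clip_image_f32_batch_ny(batch, i);
        // ... encode img (nx x ny) with clip_image_encode ...
    }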
/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
*/
CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
...@@ -95,6 +111,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out ...@@ -95,6 +111,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx); CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <cstring> #include <cstring>
#include <limits> #include <limits>
#include <vector> #include <vector>
#include <memory>
#if defined(LLAVA_LOG_OFF) #if defined(LLAVA_LOG_OFF)
# define LOG_INF(...) # define LOG_INF(...)
...@@ -45,6 +46,17 @@ struct clip_image_grid_shape { ...@@ -45,6 +46,17 @@ struct clip_image_grid_shape {
int second; int second;
}; };
// convenience cpp wrapper
struct clip_image_f32_batch_deleter {
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
};
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
struct clip_image_size_deleter {
void operator()(clip_image_size * val) { clip_image_size_free(val); }
};
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
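These deleters let the batch be scope-managed; this is exactly the pattern encode_image_with_clip adopts below:

    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
        return false; // the batch is freed automatically on scope exit
    }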
/** /**
* Selects the best resolution from a list of possible resolutions based on the original size. * Selects the best resolution from a list of possible resolutions based on the original size.
* *
...@@ -105,8 +117,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> ...@@ -105,8 +117,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
struct ggml_context * ctx; struct ggml_context * ctx;
} model; } model;
const int32_t image_size = clip_image_size(ctx_clip); const int32_t image_size = clip_get_image_size(ctx_clip);
const int32_t patch_size = clip_patch_size(ctx_clip); const int32_t patch_size = clip_get_patch_size(ctx_clip);
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
...@@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) ...@@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
clip_image_f32_batch img_res_v; clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
img_res_v.size = 0; if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_ERR("%s: unable to preprocess image\n", __func__); LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false; return false;
} }
...@@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) { if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v; std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size); image_embd_v.resize(n_imgs);
struct clip_image_size * load_image_size = clip_image_size_init(); clip_image_size load_image_size;
for (size_t i = 0; i < img_res_v.size; i++) { for (size_t i = 0; i < n_imgs; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us(); const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
int patch_size=14; int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
load_image_size->width = img_res_v.data[i].nx; image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
load_image_size->height = img_res_v.data[i].ny; int patch_size = 14;
clip_add_load_image_size(ctx_clip, load_image_size); load_image_size.width = nx;
load_image_size.height = ny;
clip_add_load_image_size(ctx_clip, &load_image_size);
bool encoded = false; bool encoded = false;
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
if (clip_is_qwen2vl(ctx_clip)) { if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
} }
else { else {
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
} }
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
return false; return false;
} }
const int64_t t_img_enc_step_batch_us = ggml_time_us(); const int64_t t_img_enc_step_batch_us = ggml_time_us();
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_step_batch_us - t_img_enc_step_start_us) / 1000.0); LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_step_batch_us - t_img_enc_step_start_us) / 1000.0);
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0; int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
std::memcpy( std::memcpy(
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i], image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]); n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
} }
*n_img_pos = n_img_pos_out; *n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]); free(image_embd_v[i]);
} }
image_embd_v.clear(); image_embd_v.clear();
load_image_size->width = img->nx; load_image_size.width = img->nx;
load_image_size->height = img->ny; load_image_size.height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, &load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
} }
else if (clip_is_glm(ctx_clip)){ else if (clip_is_glm(ctx_clip)){
struct clip_image_size * load_image_size = clip_image_size_init(); struct clip_image_size * load_image_size = clip_image_size_init();
load_image_size->width = img_res_v.data[0].nx; load_image_size->width = clip_image_f32_batch_nx(img_res_v.get(), 0);
load_image_size->height = img_res_v.data[0].ny; load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
*n_img_pos = (pos * pos + 2); *n_img_pos = (pos * pos + 2);
if (!encoded){ if (!encoded){
LOG_ERR("Unable to encode image \n"); LOG_ERR("Unable to encode image \n");
...@@ -328,8 +343,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -328,8 +343,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding // flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip); *n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
delete[] img_res_v.data; bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image\n"); LOG_ERR("Unable to encode image\n");
...@@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// spatial_unpad llava-1.6 type embedding // spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v; std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size); image_embd_v.resize(n_imgs);
for (size_t i = 0; i < img_res_v.size; i++) { for (size_t i = 0; i < n_imgs; i++) {
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) { if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
return false; return false;
} }
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip); const int32_t * image_grid = clip_image_grid(ctx_clip);
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip); const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
...@@ -360,12 +376,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli ...@@ -360,12 +376,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
} }
// free all img_res_v - not needed anymore const int32_t image_size = clip_get_image_size(ctx_clip);
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
......
...@@ -60,6 +60,7 @@ extern "C" { ...@@ -60,6 +60,7 @@ extern "C" {
struct llama_model; struct llama_model;
struct llama_context; struct llama_context;
struct llama_sampler; struct llama_sampler;
struct llama_kv_cache;
typedef int32_t llama_pos; typedef int32_t llama_pos;
typedef int32_t llama_token; typedef int32_t llama_token;
...@@ -106,6 +107,10 @@ extern "C" { ...@@ -106,6 +107,10 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
}; };
enum llama_rope_type { enum llama_rope_type {
...@@ -277,10 +282,18 @@ extern "C" { ...@@ -277,10 +282,18 @@ extern "C" {
}; };
}; };
struct llama_model_tensor_buft_override {
const char * pattern;
ggml_backend_buffer_type_t buft;
};
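A hedged sketch of filling the override list; the regex-style pattern is illustrative only, and the exact matching rules are defined by the implementation:

    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    struct llama_model_tensor_buft_override overrides[] = {
        { "\\.ffn_.*_exps\\.", ggml_backend_dev_buffer_type(cpu_dev) }, // illustrative pattern: pin matching tensors to CPU
        { NULL, NULL },                                                 // NULL-terminated, as documented above
    };
    struct llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;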
struct llama_model_params { struct llama_model_params {
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
ggml_backend_dev_t * devices; ggml_backend_dev_t * devices;
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
...@@ -356,17 +369,18 @@ extern "C" { ...@@ -356,17 +369,18 @@ extern "C" {
// model quantization parameters // model quantization parameters
typedef struct llama_model_quantize_params { typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
} llama_model_quantize_params; } llama_model_quantize_params;
typedef struct llama_logit_bias { typedef struct llama_logit_bias {
...@@ -475,7 +489,8 @@ extern "C" { ...@@ -475,7 +489,8 @@ extern "C" {
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
...@@ -592,7 +607,7 @@ extern "C" { ...@@ -592,7 +607,7 @@ extern "C" {
// KV cache // KV cache
// //
// TODO: remove llama_kv_cache_view_* API // TODO: start using struct llama_kv_cache
// Information associated with an individual cell in the KV cache view. // Information associated with an individual cell in the KV cache view.
struct llama_kv_cache_view_cell { struct llama_kv_cache_view_cell {
...@@ -647,13 +662,19 @@ extern "C" { ...@@ -647,13 +662,19 @@ extern "C" {
// Returns the number of tokens in the KV cache (slow, use only for debug) // Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
"use llama_kv_self_n_tokens instead");
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
"use llama_kv_self_used_cells instead");
// Clear the KV cache - both cell info is erased and KV data is zeroed // Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_cache_clear( LLAMA_API void llama_kv_self_clear(
struct llama_context * ctx); struct llama_context * ctx);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
...@@ -661,7 +682,7 @@ extern "C" { ...@@ -661,7 +682,7 @@ extern "C" {
// seq_id < 0 : match any sequence // seq_id < 0 : match any sequence
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API bool llama_kv_cache_seq_rm( LLAMA_API bool llama_kv_self_seq_rm(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id, llama_seq_id seq_id,
llama_pos p0, llama_pos p0,
...@@ -671,7 +692,7 @@ extern "C" { ...@@ -671,7 +692,7 @@ extern "C" {
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_cp( LLAMA_API void llama_kv_self_seq_cp(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id_src, llama_seq_id seq_id_src,
llama_seq_id seq_id_dst, llama_seq_id seq_id_dst,
...@@ -679,17 +700,17 @@ extern "C" { ...@@ -679,17 +700,17 @@ extern "C" {
llama_pos p1); llama_pos p1);
// Removes all tokens that do not belong to the specified sequence // Removes all tokens that do not belong to the specified sequence
LLAMA_API void llama_kv_cache_seq_keep( LLAMA_API void llama_kv_self_seq_keep(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id); llama_seq_id seq_id);
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly: // If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode() // - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update() // - explicitly with llama_kv_self_update()
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_add( LLAMA_API void llama_kv_self_seq_add(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id, llama_seq_id seq_id,
llama_pos p0, llama_pos p0,
...@@ -699,10 +720,10 @@ extern "C" { ...@@ -699,10 +720,10 @@ extern "C" {
// Integer division of the positions by factor of `d > 1` // Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly: // If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode() // - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update() // - explicitly with llama_kv_self_update()
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_div( LLAMA_API void llama_kv_self_seq_div(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id, llama_seq_id seq_id,
llama_pos p0, llama_pos p0,
...@@ -710,24 +731,76 @@ extern "C" { ...@@ -710,24 +731,76 @@ extern "C" {
int d); int d);
// Returns the largest position present in the KV cache for the specified sequence // Returns the largest position present in the KV cache for the specified sequence
LLAMA_API llama_pos llama_kv_cache_seq_pos_max( LLAMA_API llama_pos llama_kv_self_seq_pos_max(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id); llama_seq_id seq_id);
// TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
// how to avoid this?
// Defragment the KV cache // Defragment the KV cache
// This will be applied: // This will be applied:
// - lazily on next llama_decode() // - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update() // - explicitly with llama_kv_self_update()
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
// Check if the context supports KV cache shifting
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.) // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
DEPRECATED(LLAMA_API void llama_kv_cache_clear(
struct llama_context * ctx),
"use llama_kv_self_clear instead");
DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1),
"use llama_kv_self_seq_rm instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1),
"use llama_kv_self_seq_cp instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id),
"use llama_kv_self_seq_keep instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta),
"use llama_kv_self_seq_add instead");
DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d),
"use llama_kv_self_seq_div instead");
DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id),
"use llama_kv_self_seq_pos_max instead");
DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
"use llama_kv_self_defrag instead");
DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
"use llama_kv_self_can_shift instead");
DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
"use llama_kv_self_update instead");
// Check if the context supports KV cache shifting
LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
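Every llama_kv_cache_* call above has a llama_kv_self_* replacement with the same signature, so migration is a rename; a schematic (seq_id, p0, p1 are placeholders):

    llama_kv_self_clear(ctx);                   // was llama_kv_cache_clear
    llama_kv_self_seq_rm(ctx, seq_id, p0, p1);  // was llama_kv_cache_seq_rm
    llama_kv_self_seq_keep(ctx, seq_id);        // was llama_kv_cache_seq_keep
    llama_kv_self_defrag(ctx);                  // was llama_kv_cache_defrag
    llama_kv_self_update(ctx);                  // applies pending K-shifts / defragmentation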
// //
// State / sessions // State / sessions
...@@ -891,6 +964,10 @@ extern "C" { ...@@ -891,6 +964,10 @@ extern "C" {
// If set to true, the model will only attend to the past tokens // If set to true, the model will only attend to the past tokens
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
// Set whether the model is in warmup mode or not
// If true, all model tensors are activated during llama_decode() to load and cache their weights.
LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
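A sketch of the intended warmup sequence; the decode call is schematic:

    llama_set_warmup(ctx, true);
    // llama_decode(ctx, batch);   // one pass activates all tensors so their weights get cached
    llama_set_warmup(ctx, false);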
// Set abort callback // Set abort callback
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
...@@ -1206,22 +1283,38 @@ extern "C" { ...@@ -1206,22 +1283,38 @@ extern "C" {
float tau, float tau,
float eta); float eta);
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
/// @param vocab The vocabulary that this grammar will be used with.
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
/// @param grammar_root The name of the start symbol for the grammar.
LLAMA_API struct llama_sampler * llama_sampler_init_grammar( LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
const char * grammar_str, const char * grammar_str,
const char * grammar_root); const char * grammar_root);
/// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
/// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
/// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
const struct llama_vocab * vocab, const struct llama_vocab * vocab,
const char * grammar_str, const char * grammar_str,
const char * grammar_root, const char * grammar_root,
const char ** trigger_words, const char ** trigger_words,
size_t num_trigger_words, size_t num_trigger_words,
const llama_token * trigger_tokens, const llama_token * trigger_tokens,
size_t num_trigger_tokens); size_t num_trigger_tokens),
"use llama_sampler_init_grammar_lazy_patterns instead");
/// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
/// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
/// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root,
const char ** trigger_patterns,
size_t num_trigger_patterns,
const llama_token * trigger_tokens,
size_t num_trigger_tokens);
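A hedged usage sketch; `vocab`, `grammar_str`, and the trigger pattern are placeholders:

    const char * trigger_patterns[] = { "<tool_call>.*" }; // matched from the start of the generation output
    struct llama_sampler * smpl = llama_sampler_init_grammar_lazy_patterns(
        vocab, grammar_str, "root",
        trigger_patterns, 1,
        /* trigger_tokens */ NULL, 0);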
/// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_penalties( LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
......
...@@ -4,14 +4,13 @@ ...@@ -4,14 +4,13 @@
#include "llama-mmap.h" #include "llama-mmap.h"
#include "llama-model.h" #include "llama-model.h"
#include <algorithm>
#include <map> #include <map>
#include <cassert> #include <cassert>
#include <stdexcept> #include <stdexcept>
// vec // vec
struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const { ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr; return nullptr;
} }
...@@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const { ...@@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
return tensors[il]; return tensors[il];
} }
struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
ggml_tensor * layer_dir = tensor_for(il); ggml_tensor * layer_dir = tensor_for(il);
if (layer_dir != nullptr) { if (layer_dir != nullptr) {
cur = ggml_add(ctx, cur, layer_dir); cur = ggml_add(ctx, cur, layer_dir);
...@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { ...@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft); auto it = ctx_map.find(buft);
if (it == ctx_map.end()) { if (it == ctx_map.end()) {
struct ggml_init_params params = { ggml_init_params params = {
/*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
...@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { ...@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
return true; return true;
} }
int32_t llama_adapter_cvec::apply( bool llama_adapter_cvec::apply(
const llama_model & model, const llama_model & model,
const float * data, const float * data,
size_t len, size_t len,
...@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply( ...@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
// disable the current control vector (but leave allocated for later) // disable the current control vector (but leave allocated for later)
layer_start = -1; layer_start = -1;
layer_end = -1; layer_end = -1;
return 0; return true;
} }
if (n_embd != (int) hparams.n_embd) { if (n_embd != (int) hparams.n_embd) {
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
return 1; return false;
} }
if (tensors.empty()) { if (tensors.empty()) {
if (!init(model)) { if (!init(model)) {
return 1; return false;
} }
} }
...@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply( ...@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
} }
} }
return 0; return true;
} }
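Call sites flip polarity with the new bool return; a schematic, assuming the member name `cvec` and the trailing parameters from the header:

    // was: if (cvec.apply(model, data, len, n_embd, il_start, il_end) != 0) { ... }
    if (!cvec.apply(model, data, len, n_embd, il_start, il_end)) {
        // error
    }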
// lora // lora
llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) { llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
const std::string name(w->name); const std::string name(w->name);
const auto pos = ab_map.find(name); const auto pos = ab_map.find(name);
...@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * ...@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
return nullptr; return nullptr;
} }
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) { static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
ggml_context * ctx_init; ggml_context * ctx_init;
struct gguf_init_params meta_gguf_params = { gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true, /* .no_alloc = */ true,
/* .ctx = */ &ctx_init, /* .ctx = */ &ctx_init,
}; };
...@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char ...@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
auto it = ctx_map.find(buft); auto it = ctx_map.find(buft);
if (it == ctx_map.end()) { if (it == ctx_map.end()) {
// add a new context // add a new context
struct ggml_init_params params = { ggml_init_params params = {
/*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
...@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char ...@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
} }
} }
// get extra buffer types of the CPU
// TODO: a more general solution for non-CPU extra buft should be implemented in the future
// ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
std::vector<ggml_backend_buffer_type_t> buft_extra;
{
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
while (extra_bufts && *extra_bufts) {
buft_extra.emplace_back(*extra_bufts);
++extra_bufts;
}
}
}
// add tensors // add tensors
for (auto & it : ab_map) { for (auto & it : ab_map) {
const std::string & name = it.first; const std::string & name = it.first;
...@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char ...@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)"); throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
} }
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
// do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
for (auto & ex : buft_extra) {
if (ex == buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
buft = ggml_backend_dev_buffer_type(cpu_dev);
break;
}
}
LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
ggml_context * dev_ctx = ctx_for_buft(buft);
// validate tensor shape // validate tensor shape
if (is_token_embd) { if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd() // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
...@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char ...@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
} }
// save tensor to adapter // save tensor to adapter
struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
ggml_set_name(tensor_a, w.a->name); ggml_set_name(tensor_a, w.a->name);
ggml_set_name(tensor_b, w.b->name); ggml_set_name(tensor_b, w.b->name);
adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b); adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
...@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char ...@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
{ {
llama_file gguf_file(path_lora, "rb"); llama_file gguf_file(path_lora, "rb");
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
size_t size = ggml_nbytes(orig); size_t size = ggml_nbytes(orig);
read_buf.resize(size); read_buf.resize(size);
...@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char ...@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
} }
struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) { llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
struct llama_adapter_lora * adapter = new llama_adapter_lora(); llama_adapter_lora * adapter = new llama_adapter_lora();
try { try {
llama_adapter_lora_init_impl(*model, path_lora, *adapter); llama_adapter_lora_init_impl(*model, path_lora, *adapter);
...@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, ...@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
return nullptr; return nullptr;
} }
void llama_adapter_lora_free(struct llama_adapter_lora * adapter) { void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter; delete adapter;
} }
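Reviewer note: the fallback added above is a small, reusable pattern. The CPU backend registry is queried once for its extra buffer types (used for repacked weight layouts, per the comment in the hunk), and any LoRA tensor that would land on one of them is rerouted to the plain CPU buffer type. A condensed sketch using only the ggml calls visible in this hunk; the helper names cpu_extra_bufts and lora_safe_buft are illustrative, not part of the source:

    #include "ggml-backend.h"
    #include <vector>

    // Enumerate the CPU backend's extra buffer types; returns an empty list
    // if the proc address is not exposed by this build.
    static std::vector<ggml_backend_buffer_type_t> cpu_extra_bufts() {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
        std::vector<ggml_backend_buffer_type_t> out;
        for (auto * p = fn ? fn(cpu_dev) : nullptr; p && *p; ++p) {
            out.push_back(*p);
        }
        return out;
    }

    // Route a tensor away from any extra buffer type, falling back to plain CPU.
    static ggml_backend_buffer_type_t lora_safe_buft(
            ggml_backend_buffer_type_t buft,
            const std::vector<ggml_backend_buffer_type_t> & extra) {
        for (auto * ex : extra) {
            if (ex != buft) {
                continue;
            }
            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            return ggml_backend_dev_buffer_type(cpu_dev);
        }
        return buft;
    }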
...@@ -15,11 +15,11 @@ ...@@ -15,11 +15,11 @@
// //
struct llama_adapter_cvec { struct llama_adapter_cvec {
struct ggml_tensor * tensor_for(int il) const; ggml_tensor * tensor_for(int il) const;
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const; ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;
int32_t apply( bool apply(
const llama_model & model, const llama_model & model,
const float * data, const float * data,
size_t len, size_t len,
...@@ -36,7 +36,7 @@ private: ...@@ -36,7 +36,7 @@ private:
std::vector<ggml_context_ptr> ctxs; std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs; std::vector<ggml_backend_buffer_ptr> bufs;
std::vector<struct ggml_tensor *> tensors; // per layer std::vector<ggml_tensor *> tensors; // per layer
}; };
// //
...@@ -44,8 +44,8 @@ private: ...@@ -44,8 +44,8 @@ private:
// //
struct llama_adapter_lora_weight { struct llama_adapter_lora_weight {
struct ggml_tensor * a = nullptr; ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr; ggml_tensor * b = nullptr;
// get actual scale based on rank and alpha // get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) const { float get_scale(float alpha, float adapter_scale) const {
...@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight { ...@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
} }
llama_adapter_lora_weight() = default; llama_adapter_lora_weight() = default;
llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {} llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
}; };
struct llama_adapter_lora { struct llama_adapter_lora {
// map tensor name to lora_a_b // map tensor name to lora_a_b
std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map; std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
std::vector<ggml_context_ptr> ctxs; std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs; std::vector<ggml_backend_buffer_ptr> bufs;
...@@ -70,5 +70,7 @@ struct llama_adapter_lora { ...@@ -70,5 +70,7 @@ struct llama_adapter_lora {
llama_adapter_lora() = default; llama_adapter_lora() = default;
~llama_adapter_lora() = default; ~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(struct ggml_tensor * w); llama_adapter_lora_weight * get_weight(ggml_tensor * w);
}; };
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
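The header keeps the per-weight scale helper and adds the llama_adapter_loras alias, which maps each loaded adapter to its user-set scale. The body of get_scale is not shown in this diff; a hedged reconstruction of the conventional LoRA scaling it implements, with the rank taken from the B matrix, is:

    // Sketch only: the effective weight delta is scale * (B * A), where
    // scale = adapter_scale * alpha / rank when alpha is set, else adapter_scale.
    static float lora_scale_sketch(float alpha, float adapter_scale, int64_t rank) {
        return alpha != 0.0f ? adapter_scale * alpha / (float) rank : adapter_scale;
    }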
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_MLLAMA, "mllama" }, { LLM_ARCH_MLLAMA, "mllama" },
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" }, { LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GROK, "grok" },
...@@ -26,6 +27,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { ...@@ -26,6 +27,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" }, { LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" }, { LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_QWEN3, "qwen3" },
{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
{ LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PHIMOE, "phimoe" }, { LLM_ARCH_PHIMOE, "phimoe" },
...@@ -52,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { ...@@ -52,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_T5ENCODER, "t5encoder" },
...@@ -60,11 +64,15 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { ...@@ -60,11 +64,15 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" }, { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
{ LLM_ARCH_RWKV7, "rwkv7" },
{ LLM_ARCH_ARWKV7, "arwkv7" },
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_SOLAR, "solar" }, { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
{ LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
...@@ -74,6 +82,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { ...@@ -74,6 +82,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
{ LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
{ LLM_KV_GENERAL_NAME, "general.name" }, { LLM_KV_GENERAL_NAME, "general.name" },
{ LLM_KV_GENERAL_AUTHOR, "general.author" }, { LLM_KV_GENERAL_AUTHOR, "general.author" },
{ LLM_KV_GENERAL_VERSION, "general.version" }, { LLM_KV_GENERAL_VERSION, "general.version" },
...@@ -112,25 +121,30 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { ...@@ -112,25 +121,30 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
{ LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" }, { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
{ LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" }, { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
{ LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" }, { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_DECAY_LORA_RANK, "%s.attention.decay_lora_rank" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_ICLR_LORA_RANK, "%s.attention.iclr_lora_rank" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
...@@ -229,6 +243,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -229,6 +243,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
}, },
}, },
{
LLM_ARCH_LLAMA4,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{ {
LLM_ARCH_MLLAMA, LLM_ARCH_MLLAMA,
{ {
...@@ -594,6 +637,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -594,6 +637,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
{
LLM_ARCH_QWEN3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN3MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{ {
LLM_ARCH_PHI2, LLM_ARCH_PHI2,
{ {
...@@ -811,9 +893,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -811,9 +893,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ {
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
...@@ -1073,6 +1158,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1073,6 +1158,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
}, },
}, },
{
LLM_ARCH_PLM,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{ {
LLM_ARCH_CHATGLM, LLM_ARCH_CHATGLM,
{ {
...@@ -1091,6 +1192,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1091,6 +1192,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
}, },
}, },
{
LLM_ARCH_GLM4,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{ {
LLM_ARCH_BITNET, LLM_ARCH_BITNET,
{ {
...@@ -1275,6 +1395,74 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1275,6 +1395,74 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_RWKV7,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
{ LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
{ LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
{ LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
{ LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
{ LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
{ LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
{ LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
{ LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
{ LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
{ LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
{ LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
{ LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
{ LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
{ LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
{ LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
{ LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
{ LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
{ LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
{ LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
{ LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
{ LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
{ LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
},
},
{
LLM_ARCH_ARWKV7,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" },
{ LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
{ LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
{ LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" },
{ LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" },
{ LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" },
{ LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" },
{ LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" },
{ LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" },
{ LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" },
{ LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" },
{ LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" },
{ LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" },
{ LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" },
{ LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
{ LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
{ LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
{ LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
{ LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
{ LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{ {
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
{ {
...@@ -1372,6 +1560,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N ...@@ -1372,6 +1560,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
}, },
}, },
{
LLM_ARCH_BAILINGMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{ {
LLM_ARCH_MISTRAL3, LLM_ARCH_MISTRAL3,
{ {
...@@ -1468,6 +1679,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { ...@@ -1468,6 +1679,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_A2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_V1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_V2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_G1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_G2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
...@@ -1486,6 +1703,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { ...@@ -1486,6 +1703,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_TIME_MIX_K_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_TIME_MIX_K_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_TIME_MIX_R_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
...@@ -1493,6 +1713,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { ...@@ -1493,6 +1713,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_W0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_A0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_V0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}}, {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
{LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
......
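These tables are pure data: each architecture maps a tensor enum to a printf-style pattern, and a concrete name is produced by formatting the pattern with the block index (and, for per-expert tensors such as "blk.%d.ffn_gate.%d", an expert index as well). A minimal illustration; tensor_name is a hypothetical helper, not the llama.cpp API:

    #include <cstdio>
    #include <string>

    // "blk.%d.attn_q" with il = 3 yields "blk.3.attn_q"
    static std::string tensor_name(const char * pattern, int il) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), pattern, il);
        return std::string(buf);
    }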
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
enum llm_arch { enum llm_arch {
LLM_ARCH_LLAMA, LLM_ARCH_LLAMA,
LLM_ARCH_LLAMA4,
LLM_ARCH_MLLAMA, LLM_ARCH_MLLAMA,
LLM_ARCH_DECI, LLM_ARCH_DECI,
LLM_ARCH_FALCON, LLM_ARCH_FALCON,
...@@ -30,6 +31,8 @@ enum llm_arch { ...@@ -30,6 +31,8 @@ enum llm_arch {
LLM_ARCH_QWEN2, LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE, LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL, LLM_ARCH_QWEN2VL,
LLM_ARCH_QWEN3,
LLM_ARCH_QWEN3MOE,
LLM_ARCH_PHI2, LLM_ARCH_PHI2,
LLM_ARCH_PHI3, LLM_ARCH_PHI3,
LLM_ARCH_PHIMOE, LLM_ARCH_PHIMOE,
...@@ -56,6 +59,7 @@ enum llm_arch { ...@@ -56,6 +59,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK,
LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM, LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_BITNET, LLM_ARCH_BITNET,
LLM_ARCH_T5, LLM_ARCH_T5,
LLM_ARCH_T5ENCODER, LLM_ARCH_T5ENCODER,
...@@ -64,12 +68,16 @@ enum llm_arch { ...@@ -64,12 +68,16 @@ enum llm_arch {
LLM_ARCH_EXAONE, LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6, LLM_ARCH_RWKV6,
LLM_ARCH_RWKV6QWEN2, LLM_ARCH_RWKV6QWEN2,
LLM_ARCH_RWKV7,
LLM_ARCH_ARWKV7,
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR, LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_MISTRAL3, LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
...@@ -78,6 +86,7 @@ enum llm_kv { ...@@ -78,6 +86,7 @@ enum llm_kv {
LLM_KV_GENERAL_ARCHITECTURE, LLM_KV_GENERAL_ARCHITECTURE,
LLM_KV_GENERAL_QUANTIZATION_VERSION, LLM_KV_GENERAL_QUANTIZATION_VERSION,
LLM_KV_GENERAL_ALIGNMENT, LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_FILE_TYPE,
LLM_KV_GENERAL_NAME, LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR, LLM_KV_GENERAL_AUTHOR,
LLM_KV_GENERAL_VERSION, LLM_KV_GENERAL_VERSION,
...@@ -116,6 +125,7 @@ enum llm_kv { ...@@ -116,6 +125,7 @@ enum llm_kv {
LLM_KV_RESIDUAL_SCALE, LLM_KV_RESIDUAL_SCALE,
LLM_KV_EMBEDDING_SCALE, LLM_KV_EMBEDDING_SCALE,
LLM_KV_TOKEN_SHIFT_COUNT, LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV, LLM_KV_ATTENTION_HEAD_COUNT_KV,
...@@ -130,6 +140,10 @@ enum llm_kv { ...@@ -130,6 +140,10 @@ enum llm_kv {
LLM_KV_ATTENTION_CAUSAL, LLM_KV_ATTENTION_CAUSAL,
LLM_KV_ATTENTION_Q_LORA_RANK, LLM_KV_ATTENTION_Q_LORA_RANK,
LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK,
LLM_KV_ATTENTION_DECAY_LORA_RANK,
LLM_KV_ATTENTION_ICLR_LORA_RANK,
LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
...@@ -248,6 +262,8 @@ enum llm_tensor { ...@@ -248,6 +262,8 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM, LLM_TENSOR_LAYER_OUT_NORM,
LLM_TENSOR_POST_ATTN_NORM,
LLM_TENSOR_POST_MLP_NORM,
LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_IN,
LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_CONV1D,
LLM_TENSOR_SSM_X, LLM_TENSOR_SSM_X,
...@@ -255,8 +271,20 @@ enum llm_tensor { ...@@ -255,8 +271,20 @@ enum llm_tensor {
LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_A,
LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_D,
LLM_TENSOR_SSM_OUT, LLM_TENSOR_SSM_OUT,
LLM_TENSOR_TIME_MIX_W0,
LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W1,
LLM_TENSOR_TIME_MIX_W2, LLM_TENSOR_TIME_MIX_W2,
LLM_TENSOR_TIME_MIX_A0,
LLM_TENSOR_TIME_MIX_A1,
LLM_TENSOR_TIME_MIX_A2,
LLM_TENSOR_TIME_MIX_V0,
LLM_TENSOR_TIME_MIX_V1,
LLM_TENSOR_TIME_MIX_V2,
LLM_TENSOR_TIME_MIX_G1,
LLM_TENSOR_TIME_MIX_G2,
LLM_TENSOR_TIME_MIX_K_K,
LLM_TENSOR_TIME_MIX_K_A,
LLM_TENSOR_TIME_MIX_R_K,
LLM_TENSOR_TIME_MIX_LERP_X, LLM_TENSOR_TIME_MIX_LERP_X,
LLM_TENSOR_TIME_MIX_LERP_W, LLM_TENSOR_TIME_MIX_LERP_W,
LLM_TENSOR_TIME_MIX_LERP_K, LLM_TENSOR_TIME_MIX_LERP_K,
......
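The enum values added here must stay in sync with the LLM_ARCH_NAMES table extended in llama-arch.cpp above; that table ends with an explicit "(unknown)" entry for LLM_ARCH_UNKNOWN. A minimal lookup sketch under that assumption (the helper name is illustrative):

    // Resolve an architecture enum to its display name, defaulting to "(unknown)".
    static const char * arch_name_sketch(llm_arch arch) {
        auto it = LLM_ARCH_NAMES.find(arch);
        return it != LLM_ARCH_NAMES.end() ? it->second
                                          : LLM_ARCH_NAMES.at(LLM_ARCH_UNKNOWN);
    }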
...@@ -42,9 +42,9 @@ struct llama_sbatch { ...@@ -42,9 +42,9 @@ struct llama_sbatch {
bool logits_all; // TODO: remove once lctx.logits_all is removed too bool logits_all; // TODO: remove once lctx.logits_all is removed too
// sorted indices into the batch // sorted indices into the batch
std::vector<size_t> ids; std::vector<int64_t> ids;
// batch indices of the output // batch indices of the output
std::vector<size_t> out_ids; std::vector<int64_t> out_ids;
std::vector<llama_sbatch_seq> seq; std::vector<llama_sbatch_seq> seq;
const llama_batch * batch = nullptr; const llama_batch * batch = nullptr;
......
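The llama-batch.h change widens the sort-index vectors from size_t to int64_t, matching ggml's signed 64-bit convention for tensor dimensions (ne[]) and avoiding signed/unsigned casts in mixed arithmetic. An assumption-labeled illustration, in the spirit of the grouping these indices feed, not the verbatim sbatch code:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    int main() {
        // Hypothetical data: one sequence id per token in the batch.
        std::vector<int64_t> seq_id = {1, 0, 0, 1, 2, 0, 2, 1};
        std::vector<int64_t> ids(seq_id.size());
        std::iota(ids.begin(), ids.end(), 0);
        // Group tokens by sequence while preserving order within a sequence.
        std::stable_sort(ids.begin(), ids.end(),
            [&](int64_t a, int64_t b) { return seq_id[a] < seq_id[b]; });
        return 0;
    }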