Commit 1b51655e authored by xuxzh1

update

parent a6ac8936
@@ -199,3 +199,4 @@ if (LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
endif()
+add_subdirectory(../ext_server ext_server) # ollama
@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
        if (loaded_la.adapter == nullptr) {
            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_free_model(model);
-            return iparams;
+
+            // if that fails, try loading as ggla for compatibility
+            int err = llama_model_apply_lora_from_file(model,
+                                                la.path.c_str(),
+                                                la.scale,
+                                                nullptr,
+                                                params.n_threads);
+            if (err != 0) {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                llama_free(lctx);
+                llama_free_model(model);
+                return iparams;
+            } else {
+                break;
+            }
        }
        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
    }
@@ -2178,6 +2190,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
+    mparams.progress_callback = params.progress_callback;
+    mparams.progress_callback_user_data = params.progress_callback_user_data;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
...
@@ -194,6 +194,13 @@ struct gpt_params {
    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)

+    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+    // If the provided progress_callback returns true, model loading continues.
+    // If it returns false, model loading is immediately aborted.
+    llama_progress_callback progress_callback = NULL;
+    // context pointer passed to the progress callback
+    void * progress_callback_user_data;
+
    // embedding
    bool embedding = false;         // get only sentence embedding
    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
...
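For orientation, here is a minimal caller-side sketch (not part of the commit) of how these new fields are intended to be wired up. It assumes the existing llama_progress_callback typedef from llama.h (bool (*)(float progress, void * user_data)) and the llama_init_from_gpt_params() helper patched in the common.cpp hunk above; the callback is invoked repeatedly while the weights are loaded and can cancel the load by returning false.

#include "common.h"   // gpt_params, llama_init_result, llama_init_from_gpt_params
#include <cstdio>

static bool print_load_progress(float progress, void * user_data) {
    (void) user_data;                                     // no extra state in this sketch
    fprintf(stderr, "\rloading model: %3d%%", (int) (progress * 100.0f));
    return true;                                          // returning false aborts the load
}

static void load_with_progress(gpt_params & params) {
    params.progress_callback           = print_load_progress;
    params.progress_callback_user_data = nullptr;
    // llama_model_params_from_gpt_params() forwards both fields (see the hunk above)
    llama_init_result init = llama_init_from_gpt_params(params);
    (void) init;                                          // use init.model / init.context as usual
}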
File mode changed from 100644 to 100755
@@ -3,6 +3,7 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
+#include "common.h"
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
@@ -36,6 +37,14 @@
#include <cinttypes>
#include <limits>

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
@@ -1064,7 +1073,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        return nullptr;
    }

+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+    if (!wlen) {
+        return NULL;
+    }
+    wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+    wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
+    if (!wlen) {
+        free(wbuf);
+        return NULL;
+    }
+    auto fin = std::ifstream(wbuf, std::ios::binary);
+    free(wbuf);
+#else
    auto fin = std::ifstream(fname, std::ios::binary);
+#endif
    if (!fin) {
        LOG_TEE("cannot open model file for loading tensors\n");
        clip_free(new_clip);
...
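As a side note on the Windows branch above: the same UTF-8-to-UTF-16 conversion can be wrapped in a small helper so the buffer is freed automatically. This is only an illustrative sketch (the helper name is hypothetical and it is not part of the patch); it relies on MultiByteToWideChar's documented behaviour that cbMultiByte = -1 processes the terminating NUL, and on MSVC's wide-character std::ifstream constructor.

#if defined(_WIN32)
#include <windows.h>
#include <fstream>
#include <string>

// hypothetical helper: open a file whose path is UTF-8 encoded
static std::ifstream open_utf8_path(const char * fname) {
    // first call sizes the wide buffer (the count includes the terminating NUL)
    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
    if (wlen <= 0) {
        return std::ifstream(); // invalid UTF-8: the returned stream is simply not open
    }
    std::wstring wpath(wlen, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, fname, -1, &wpath[0], wlen);
    return std::ifstream(wpath.c_str(), std::ios::binary); // MSVC accepts wide paths
}
#endif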
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -1545,27 +1545,23 @@ static enum ggml_status ggml_metal_graph_compute(
                // to the matrix-vector kernel
                int ne11_mm_min = 1;

-#if 0
                // the numbers below are measured on M2 Ultra for 7B and 13B models
                // these numbers do not translate to other devices or model sizes
                // TODO: need to find a better approach
-                if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                    switch (src0t) {
-                        case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                        case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                        case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q4_0:
-                        case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                        case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                        case GGML_TYPE_Q5_0:                          // not tested yet
-                        case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                        case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                        default:             ne11_mm_min = 1;  break;
-                    }
-                }
-#endif
+                switch (src0t) {
+                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                    case GGML_TYPE_Q5_0:                          // not tested yet
+                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                    default:             ne11_mm_min = 1;  break;
+                }

                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
...
@@ -1178,6 +1178,20 @@ extern "C" {

    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);

+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                      const char * path_lora,
+                             float scale,
+                      const char * path_base_model,
+                           int32_t n_threads);
+
#ifdef __cplusplus
}
#endif
...
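For illustration only (not part of the commit), here is a sketch of how a caller might use the re-exported entry point; the paths and thread count are placeholders, and the other calls (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_free_model) are existing llama.h API.

#include "llama.h"
#include <stdio.h>

// hypothetical helper: load a model and apply a legacy .ggla adapter to it
static int apply_ggla_adapter(const char * model_path, const char * lora_path) {
    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    // scale 1.0, no separate f16/f32 base model (NULL), 4 threads
    int32_t err = llama_model_apply_lora_from_file(model, lora_path, 1.0f, NULL, 4);
    if (err != 0) {
        fprintf(stderr, "failed to apply lora adapter '%s'\n", lora_path);
    }

    llama_free_model(model);
    llama_backend_free();
    return (int) err;
}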
File mode changed from 100644 to 100755
@@ -4892,7 +4892,7 @@ static void llm_load_hparams(
            } break;
        case LLM_ARCH_PHI3:
            {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
-            if (tokenizer_pre.empty()) {
-                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
-                LLAMA_LOG_WARN("%s: \n", __func__);
-                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
-                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
-                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
-                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
-                LLAMA_LOG_WARN("%s: \n", __func__);
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-            } else if (tokenizer_pre == "default") {
+            if (tokenizer_pre == "default") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (
                    tokenizer_pre == "llama3" ||
@@ -5443,7 +5434,8 @@
                    tokenizer_pre == "codeshell") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
            } else {
-                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            }
        } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -7708,7 +7700,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
+        throw;
    }

    return 0;
@@ -8592,14 +8584,14 @@ struct llm_build_context {
    }

    struct ggml_tensor * build_inp_mean() {
-        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
        cb(lctx.inp_mean, "inp_mean", -1);
        ggml_set_input(lctx.inp_mean);
        return lctx.inp_mean;
    }

    struct ggml_tensor * build_inp_cls() {
-        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
        cb(lctx.inp_cls, "inp_cls", -1);
        ggml_set_input(lctx.inp_cls);
        return lctx.inp_cls;
@@ -10774,7 +10766,7 @@
        struct ggml_tensor * inp_pos = build_inp_pos();

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();

        for (int il = 0; il < n_layer; ++il) {
            auto residual = inpL;
@@ -10832,7 +10824,7 @@

            cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                    model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
        }

        if (il == n_layer - 1) {
@@ -14154,19 +14146,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

        float * data = (float *) lctx.inp_mean->data;
-        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));

        std::vector<uint64_t> sum(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
            sum[seq_id] += 1;
        }

-        std::vector<float> div(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
+        std::vector<float> div(cparams.n_seq_max, 0.0f);
+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
            const uint64_t s = sum[i];
            if (s > 0) {
                div[i] = 1.0f/float(s);
@@ -14186,14 +14175,11 @@
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
-        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            const llama_pos pos = batch.pos[i];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
            if (pos == 0) {
                data[seq_id] = i;
            }
@@ -14356,7 +14342,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    const auto n_embd = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = !cparams.embeddings;
+    const bool has_logits = cparams.causal_attn;
    const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -14627,17 +14613,25 @@ static int llama_decode_internal(
            // no output
            res = nullptr;
            embd = nullptr;
-        } else if (cparams.embeddings) {
-            res = nullptr; // do not extract logits for embedding case
-            embd = gf->nodes[gf->n_nodes - 1];
-            if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                embd = gf->nodes[gf->n_nodes - 2];
+        }
+
+        if (cparams.embeddings) {
+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
+                embd = gf->nodes[i];
+                if (strcmp(embd->name, "result_embd_pooled") == 0) {
+                    break;
+                }
            }
            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
        } else {
            embd = nullptr; // do not extract embeddings when not needed
            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
        }
+
+        if (!cparams.causal_attn) {
+            res = nullptr; // do not extract logits when not needed
+        }
+
        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

        ggml_backend_sched_alloc_graph(lctx.sched, gf);
@@ -16429,16 +16423,23 @@ struct llama_model * llama_load_model_from_file(
            }
            model->rpc_servers.push_back(servers);
        }
-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-        }
-        delete model;
-        return nullptr;
+
+    try {
+        int status = llama_model_load(path_model, *model, params);
+        GGML_ASSERT(status <= 0);
+        if (status < 0) {
+            if (status == -1) {
+                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+            } else if (status == -2) {
+                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+            }
+            delete model;
+            return nullptr;
+        }
+    } catch (...) {
+        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
+        delete model;
+        throw;
    }

    return model;
@@ -19171,3 +19172,290 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
    fputs(text, stderr);
    fflush(stderr);
}

static int llama_apply_lora_from_file_internal(
const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
) {
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
const int64_t t_start_lora_us = ggml_time_us();
llama_file fin(path_lora, "rb");
// verify magic and version
{
uint32_t magic = fin.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
return 1;
}
uint32_t format_version = fin.read_u32();
if (format_version != 1) {
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
return 1;
}
}
int32_t lora_r = fin.read_u32();
int32_t lora_alpha = fin.read_u32();
float scaling = scale * (float)lora_alpha / (float)lora_r;
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
// load base model
std::unique_ptr<llama_model_loader> ml;
if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
ml->init_mappings(/*prefetch*/ false); // no prefetching
}
struct tensor_meta {
std::string name;
ggml_type type;
int32_t ne[2];
size_t offset;
};
std::map<std::string, tensor_meta> tensor_meta_map;
// load all tensor meta
while (true) {
if (fin.tell() == fin.size) {
// eof
break;
}
int32_t n_dims;
int32_t name_len;
int32_t ftype;
fin.read_raw(&n_dims, sizeof(n_dims));
fin.read_raw(&name_len, sizeof(name_len));
fin.read_raw(&ftype, sizeof(ftype));
if (n_dims != 1 && n_dims != 2) {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
}
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read_raw(&ne[i], sizeof(ne[i]));
}
std::string name;
{
GGML_ASSERT(name_len < GGML_MAX_NAME);
char buf[GGML_MAX_NAME];
fin.read_raw(buf, name_len);
name = std::string(buf, name_len);
}
// check for lora suffix
std::string lora_suffix;
if (name.length() > 6) {
lora_suffix = name.substr(name.length() - 6);
}
if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
return 1;
}
// tensor type
ggml_type wtype;
switch (ftype) {
case 0: wtype = GGML_TYPE_F32; break;
case 1: wtype = GGML_TYPE_F16; break;
default:
{
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
__func__, ftype);
return 1;
}
}
// data offset
size_t offset = fin.tell();
offset = (offset + 31) & -32;
// skip tensor data
fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
}
bool warned = false;
int n_tensors = 0;
// apply
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
if (backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
return 1;
}
ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
std::vector<no_init<uint8_t>> read_buf;
for (const auto & it : model.tensors_by_name) {
const std::string & base_name = it.first;
ggml_tensor * model_t = it.second;
if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
continue;
}
tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
ggml_init_params lora_init_params = {
/* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
/* .mem_buffer */ nullptr,
/* .no_alloc */ true,
};
ggml_context * lora_ctx = ggml_init(lora_init_params);
if (lora_ctx == nullptr) {
LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
ggml_backend_free(backend_cpu);
return 1;
}
// create tensors
ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
ggml_set_name(loraA, metaA.name.c_str());
ggml_set_name(loraB, metaB.name.c_str());
ggml_tensor * base_t;
if (ml) {
if (!ml->get_tensor_meta(base_name.c_str())) {
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}
base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
} else {
base_t = ggml_dup_tensor(lora_ctx, model_t);
}
ggml_set_name(base_t, base_name.c_str());
// allocate in backend buffer
ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
if (lora_buf == nullptr) {
LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
return 1;
}
// load tensor data
auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
read_buf.resize(ggml_nbytes(tensor));
fin.seek(tensor_meta.offset, SEEK_SET);
fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
};
load_tensor(metaA, loraA);
load_tensor(metaB, loraB);
// load base model tensor data
if (ml) {
ml->load_data_for(base_t);
} else {
ggml_backend_tensor_copy(model_t, base_t);
}
if (ggml_is_quantized(base_t->type) && !warned) {
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
"use a f16 or f32 base model with --lora-base\n", __func__);
warned = true;
}
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
ggml_free(lora_ctx);
ggml_backend_buffer_free(lora_buf);
ggml_backend_free(backend_cpu);
return 1;
}
auto build_lora_graph = [&]() {
// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
ggml_set_name(BA, "BA");
if (scaling != 1.0f) {
BA = ggml_scale(lora_ctx, BA, scaling);
ggml_set_name(BA, "BA_scaled");
}
ggml_tensor * r;
r = ggml_add_inplace(lora_ctx, base_t, BA);
ggml_set_name(r, "r_add");
if (base_t->type != model_t->type) {
// convert the result to the model type
r = ggml_cast(lora_ctx, r, model_t->type);
ggml_set_name(r, "r_cast");
}
return r;
};
ggml_cgraph * gf = ggml_new_graph(lora_ctx);
ggml_tensor * r = build_lora_graph();
ggml_build_forward_expand(gf, r);
ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
if (graph_buf == nullptr) {
LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
ggml_free(lora_ctx);
ggml_backend_buffer_free(lora_buf);
ggml_backend_free(backend_cpu);
return 1;
}
ggml_backend_graph_compute(backend_cpu, gf);
ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
#if 0
// TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
//ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
// sched compute
ggml_build_forward_expand(gf, build_graph());
ggml_backend_sched_init_measure(sched, gf);
// create the graph again, since the previous one was destroyed by the measure
ggml_graph_clear(gf);
ggml_build_forward_expand(gf, build_graph());
ggml_backend_sched_graph_compute(sched, gf);
ggml_backend_sched_free(sched);
#endif
ggml_backend_buffer_free(lora_buf);
ggml_backend_buffer_free(graph_buf);
ggml_free(lora_ctx);
n_tensors++;
if (n_tensors % 4 == 0) {
LLAMA_LOG_INFO(".");
}
}
ggml_backend_free(backend_cpu);
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
return 0;
}
int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
try {
return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
return 1;
}
}
\ No newline at end of file
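A note on the math being applied above: as the inline comment in build_lora_graph() says, each matched weight is updated as w = w + BA * s, where the scaling factor is read from the GGLA header as s = scale * lora_alpha / lora_r. For example, an adapter exported with lora_alpha = 16 and lora_r = 8, applied with scale = 1.0, adds the BA product scaled by 2.0 into the base tensor.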