Commit 97ef6ff8 authored by xuxzh1's avatar xuxzh1 🎱
Browse files

update

parent 4cc1a614
Pipeline #2023 canceled with stages
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "llava.h"
#include "base64.hpp"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
#include <numeric>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image
struct clip_image_u8 {
......@@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
......@@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor* result = ggml_graph_node(gf, -1);
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
......@@ -202,6 +212,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
clip_image_f32 * patch = clip_image_f32_init();
patch->nx = patch_size * num_patches;
patch->ny = patch_size;
patch->buf.resize(3 * patch->nx * patch->ny);
int patch_index = 0;
for (int i = 0; i < height; i += patch_size) {
for (int j = 0; j < width; j += patch_size) {
for (int pi = 0; pi < patch_size; ++pi) {
for (int pj = 0; pj < patch_size; ++pj) {
int input_index = ((i + pi) * width + (j + pj)) * 3;
int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
patch->buf[output_index] = image->buf[input_index];
patch->buf[output_index+1] = image->buf[input_index+1];
patch->buf[output_index+2] = image->buf[input_index+2];
}
}
patch_index++;
}
}
return patch;
}
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
......@@ -209,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_TEE("%s: unable to preprocess image\n", __func__);
LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
......@@ -218,17 +255,62 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
if (clip_is_minicpmv(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
LOG_TEE("Unable to encode image\n");
LOG_ERR("Unable to encode image\n");
return false;
}
} else {
}
else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
......@@ -237,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
......@@ -275,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
......@@ -290,22 +372,26 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
int num_max_patches = 6;
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
......@@ -315,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
return true;
}
struct llava_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
......@@ -323,9 +442,10 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
float * embd = image_embed->embed+i*n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
......@@ -337,7 +457,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
......@@ -346,7 +466,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_TEE("%s: coulnd't embed the image\n", __func__);
LOG_ERR("%s: couldn't embed the image\n", __func__);
return NULL;
}
......@@ -360,7 +480,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path);
LOG_ERR("%s: can't read file %s\n", __func__, path);
return false;
}
......@@ -370,7 +490,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
......@@ -395,7 +515,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL;
}
......
......@@ -17,12 +17,11 @@
# define LLAVA_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct llava_image_embed {
float * embed;
int n_image_pos;
......@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** free an embedding made with llava_image_embed_make_* */
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
......
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <iostream> // TODO: remove me
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llama_model * llava_init(common_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
llama_context_params ctx_params = common_context_params_to_llama(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static struct clip_ctx * clip_init_context(common_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
llava_image_embed_free(slice_embed);
}
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
auto * ctx_clip = clip_init_context(params);
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_ERR("prompt should be given or interactive mode should be on");
return NULL;
}
auto * model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_INF("\n");
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
return smpl;
}
static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
return tmp;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1;
}
common_init();
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}
for (auto & image : params.image) {
int n_past = 0;
auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
LOG("<user>%s\n", params.prompt.c_str());
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response;
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
if (!have_tmp) {
continue;
}
break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
printf("%s", tmp);
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
common_sampler_free(smpl);
}else {
while (true) {
LOG("<user>");
std::string prompt;
std::getline(std::cin, prompt);
LOG("<assistant>");
auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response;
for (int i = 0; i < max_tgt_len; i++) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);// mistral llava-1.6
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
common_sampler_free(smpl);
}
}
printf("\n");
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
return 0;
}
# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Siglip model. """
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
import os
import math
import warnings
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
logging,
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
Example:
```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel
>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()
>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "siglip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=16,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/siglip-base-patch16-224",
# See all SigLIP models at https://huggingface.co/models?filter=siglip
]
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def _trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2,
)
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
if tensor.dtype in [torch.float16, torch.bfloat16]:
# The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
og_dtype = tensor.dtype
tensor = tensor.to(torch.float32)
tensor.erfinv_()
tensor = tensor.to(og_dtype)
else:
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
if tensor.dtype == torch.float16:
# The `clamp_` op is not (yet?) defined in float16+cpu
tensor = tensor.to(torch.float32)
tensor.clamp_(min=a, max=b)
tensor = tensor.to(torch.float16)
else:
tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \\leq \text{mean} \\leq b`.
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
and the result is subsquently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
"""
with torch.no_grad():
_trunc_normal_(tensor, 0, 1.0, a, b)
tensor.mul_(std).add_(mean)
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
denom = fan_in
if mode == "fan_in":
denom = fan_in
elif mode == "fan_out":
denom = fan_out
elif mode == "fan_avg":
denom = (fan_in + fan_out) / 2
variance = scale / denom
if distribution == "truncated_normal":
# constant is stddev of standard normal truncated to (-2, 2)
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
elif distribution == "normal":
with torch.no_grad():
tensor.normal_(std=math.sqrt(variance))
elif distribution == "uniform":
bound = math.sqrt(3 * variance)
with torch.no_grad():
tensor.uniform_(-bound, bound)
else:
raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
class SiglipVisionEmbeddings(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
self.num_patches_per_side = self.image_size // self.patch_size
self.num_patches = self.num_patches_per_side**2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
class SiglipAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.embed_dim = config.hidden_size
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.self_attn = (
SiglipAttention(config)
)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
class SiglipPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SiglipVisionConfig
base_model_prefix = "siglip"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SiglipVisionEmbeddings):
width = self.config.hidden_size
nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
elif isinstance(module, nn.Embedding):
default_flax_embed_init(module.weight)
elif isinstance(module, SiglipAttention):
nn.init.normal_(module.q_proj.weight)
nn.init.normal_(module.k_proj.weight)
nn.init.normal_(module.v_proj.weight)
nn.init.normal_(module.out_proj.weight)
nn.init.zeros_(module.q_proj.bias)
nn.init.zeros_(module.k_proj.bias)
nn.init.zeros_(module.v_proj.bias)
nn.init.zeros_(module.out_proj.bias)
elif isinstance(module, SiglipMLP):
nn.init.normal_(module.fc1.weight)
nn.init.normal_(module.fc2.weight)
nn.init.normal_(module.fc1.bias, std=1e-6)
nn.init.normal_(module.fc2.bias, std=1e-6)
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SIGLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`SiglipEncoderLayer`].
Args:
config: SiglipConfig
"""
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
class SiglipVisionTransformer(SiglipPreTrainedModel):
config_class = SiglipVisionConfig
main_input_name = "pixel_values"
_supports_flash_attn_2 = True
def __init__(self, config: SiglipVisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(config)
self.encoder = SiglipEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embedding
import argparse
import json
import re
import numpy as np
from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
TEXT = "clip.text"
VISION = "clip.vision"
def add_key_str(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
if name in (
"logit_scale",
"text_model.embeddings.position_ids",
"vision_model.embeddings.position_ids",
):
return True
if has_minicpmv and name in ["visual_projection.weight"]:
return True
if name.startswith("v") and not has_vision:
return True
if name.startswith("t") and not has_text:
return True
return False
def get_tensor_name(name: str) -> str:
if "projection" in name:
return name
if "mm_projector" in name:
name = name.replace("model.mm_projector", "mm")
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
return name
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
# with proper
args = ap.parse_args()
if args.text_only and args.vision_only:
print("--text-only and --image-only arguments cannot be specified at the same time.")
exit(1)
if args.use_f32:
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
vocab = None
tokens = None
else:
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
vocab = json.load(f)
tokens = [key for key in vocab]
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if args.use_f32:
ftype = 0
# if args.clip_model_is_vision or args.clip_model_is_openclip:
# model = CLIPVisionModel.from_pretrained(dir_model)
# processor = None
# else:
# model = CLIPModel.from_pretrained(dir_model)
# processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version
emb_dim = 4096
if minicpmv_version == 1:
emb_dim = 2304
elif minicpmv_version == 2:
emb_dim = 4096
elif minicpmv_version == 3:
emb_dim = 3584
default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "idefics2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
}
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None
# if model.attn_pool is not None:
# model.attn_pool = torch.nn.Identity()
# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False
if args.text_only:
fname_middle = "text-"
has_vision_encoder = False
elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 3
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
else:
fname_middle = ""
output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")
fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)
if args.text_only:
fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
fout.add_description("image encoder for MiniCPM-V")
# add projector type
fout.add_string("clip.projector_type", "resampler")
fout.add_int32("clip.minicpmv_version", minicpmv_version)
else:
fout.add_description("two-tower CLIP model")
if has_vision_encoder:
# vision_model hparams
fout.add_uint32("clip.vision.image_size", 448)
fout.add_uint32("clip.vision.patch_size", 14)
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
fout.add_uint32("clip.vision.projection_dim", 0)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
block_count = 26
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std
fout.add_array("clip.vision.image_mean", image_mean)
fout.add_array("clip.vision.image_std", image_std)
use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float32)
omega /= embed_dim / 2.
omega = 1. / 10000 ** omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
if isinstance(grid_size, int):
grid_h_size, grid_w_size = grid_size, grid_size
else:
grid_h_size, grid_w_size = grid_size[0], grid_size[1]
grid_h = np.arange(grid_h_size, dtype=np.float32)
grid_w = np.arange(grid_w_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
def _replace_name_resampler(s, v):
if re.match("resampler.pos_embed", s):
return {
s: v,
re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
}
if re.match("resampler.proj", s):
return {
re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
}
if re.match("resampler.attn.in_proj_.*", s):
return {
re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
}
return {s: v}
if has_minicpmv_projector:
projector = torch.load(args.minicpmv_projector)
new_state_dict = {}
for k, v in projector.items():
kvs = _replace_name_resampler(k, v)
for nk, nv in kvs.items():
new_state_dict[nk] = nv
projector = new_state_dict
ftype_cur = 0
for name, data in projector.items():
name = get_tensor_name(name)
data = data.squeeze().numpy()
n_dims = len(data.shape)
if ftype == 1:
if name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
else:
if data.dtype != np.float32:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
fout.add_tensor(name, data)
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
print("Projector tensors added\n")
def _replace_name(s, v):
s = "vision_model." + s
if re.match("vision_model.embeddings.position_embedding", s):
v = v.unsqueeze(0)
return {s: v}
return {s: v}
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
kvs = _replace_name(k, v)
for nk, nv in kvs.items():
new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
# we don't need this
print(f"skipping parameter: {name}")
continue
name = get_tensor_name(name)
data = data.squeeze().numpy()
n_dims = len(data.shape)
# ftype == 0 -> float32, ftype == 1 -> float16
ftype_cur = 0
if n_dims == 4:
print(f"tensor {name} is always saved in f16")
data = data.astype(np.float16)
ftype_cur = 1
elif ftype == 1:
if name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
else:
if data.dtype != np.float32:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
fout.add_tensor(name, data)
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()
print("Done. Output file: " + fname_out)
import argparse
import os
import torch
from transformers import AutoModel, AutoTokenizer
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()
# find the model part that includes the the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()
# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/minicpmv.projector")
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
if len(clip_tensors) > 0:
clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
torch.save(clip, f"{args.model}/minicpmv.clip")
# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
with open(f"{args.model}/added_tokens.json", "w") as f:
f.write("{}\n")
config = model.llm.config
config.auto_map = {
"AutoConfig": "configuration_minicpm.MiniCPMConfig",
"AutoModel": "modeling_minicpm.MiniCPMModel",
"AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
"AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
"AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")
print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
......@@ -2,3 +2,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0
torch~=2.2.1
torchvision~=0.17.1
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
......@@ -35,31 +37,26 @@ struct ngram_container {
};
int main(int argc, char ** argv) {
gpt_params params;
common_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookahead", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the target model
llama_init_result llama_init = llama_init_from_gpt_params(params);
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
......@@ -68,21 +65,21 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp;
std::vector<llama_token> all;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = common_tokenize(ctx, params.prompt, true, true);
all = inp;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
LOG("\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", common_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
......@@ -92,8 +89,8 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us();
// eval the prompt
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
......@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
// verification n-grams
std::vector<ngram_data> ngrams_cur(G);
......@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {
// sample first token
{
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
id = common_sampler_sample(smpl, ctx, 0);
llama_sampling_accept(ctx_sampling, ctx, id, true);
common_sampler_accept(smpl, id, true);
{
const std::string token_str = llama_token_to_piece(ctx, id);
const std::string token_str = common_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
fflush(stdout);
}
}
......@@ -175,7 +172,7 @@ int main(int argc, char ** argv) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
llama_kv_cache_dump_view_seqs(kvc_view, 40);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
......@@ -204,10 +201,10 @@ int main(int argc, char ** argv) {
// V V V V V V
// id
{
llama_batch_clear(batch);
common_batch_clear(batch);
// current token - first token of the first level
llama_batch_add(batch, id, n_past, seq_id_all, true);
common_batch_add(batch, id, n_past, seq_id_all, true);
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
{
......@@ -232,7 +229,7 @@ int main(int argc, char ** argv) {
ngrams_cur[g].tokens [j + 1] = t;
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
}
}
}
......@@ -244,19 +241,19 @@ int main(int argc, char ** argv) {
seq_id_look[j] = i + j + 1;
}
llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
}
// fill the rest of the levels
for (int j = 1; j < N - 1; j++) {
for (int i = 0; i < W; i++) {
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
}
}
}
if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}
......@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
}
// sample the next token
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
id = common_sampler_sample(smpl, ctx, i_batch);
llama_sampling_accept(ctx_sampling, ctx, id, true);
common_sampler_accept(smpl, id, true);
// print
{
const std::string token_str = llama_token_to_piece(ctx, id);
const std::string token_str = common_token_to_piece(ctx, id);
if (v == 0) {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
} else {
// print light cyan
printf("\033[0;96m%s\033[0m", token_str.c_str());
LOG("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);
......@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
}
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
printf(" - ngram %2d: ", i);
LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
printf("\n");
LOG("\n");
}
}
......@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
if (v == 0) {
// sample from the last level
for (int i = 0; i < W; i++) {
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
}
} else {
for (int i = 0; i < W; i++) {
......@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG("\n\n");
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("\n");
LOG_INF("W = %2d\n", W);
LOG_INF("N = %2d\n", N);
LOG_INF("G = %2d\n", G);
LOG_INF("\n");
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("\n");
LOG_TEE("W = %2d\n", W);
LOG_TEE("N = %2d\n", N);
LOG_TEE("G = %2d\n", G);
LOG_TEE("\n");
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_INF("\n");
common_perf_print(ctx, smpl);
llama_print_timings(ctx);
common_sampler_free(smpl);
llama_kv_cache_view_free(&kvc_view);
llama_sampling_free(ctx_sampling);
llama_batch_free(batch);
......@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}
#include "ggml.h"
#include "llama.h"
#include "arg.h"
#include "common.h"
#include "ngram-cache.h"
#include "ggml.h"
#include "llama.h"
#include <cstdint>
#include <fstream>
......@@ -11,10 +12,9 @@
#include <vector>
int main(int argc, char ** argv){
gpt_params params;
common_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
......@@ -23,7 +23,7 @@ int main(int argc, char ** argv){
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
......@@ -31,13 +31,15 @@ int main(int argc, char ** argv){
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = common_tokenize(ctx, params.prompt, true, true);
fprintf(stderr, "%s: tokenization done\n", __func__);
llama_ngram_cache ngram_cache;
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
common_ngram_cache ngram_cache;
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
return 0;
}
......@@ -33,15 +33,15 @@ int main(int argc, char ** argv){
}
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
for (size_t i = 1; i < args.size()-1; ++i) {
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
}
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
llama_ngram_cache_save(ngram_cache_merged, args.back());
common_ngram_cache_save(ngram_cache_merged, args.back());
}
#include "ggml.h"
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "log.h"
#include "ngram-cache.h"
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
common_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
const int n_draft = params.n_draft;
common_init();
const int n_draft = params.speculative.n_max;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = common_tokenize(ctx, params.prompt, true, true);
common_ngram_cache ngram_cache_context;
common_ngram_cache ngram_cache_dynamic;
common_ngram_cache ngram_cache_static;
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
......@@ -47,16 +49,16 @@ int main(int argc, char ** argv){
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
......@@ -85,7 +87,7 @@ int main(int argc, char ** argv){
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
......@@ -104,7 +106,7 @@ int main(int argc, char ** argv){
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
......@@ -114,7 +116,7 @@ int main(int argc, char ** argv){
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
......@@ -128,32 +130,32 @@ int main(int argc, char ** argv){
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
ngram_cache_context.clear();
}
LOG_TEE("\n");
LOG("\n");
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}
#include "arg.h"
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
common_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
// max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft;
const int n_draft = params.speculative.n_max;
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookup", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = common_tokenize(ctx, params.prompt, true, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
common_ngram_cache ngram_cache_context;
common_ngram_cache ngram_cache_dynamic;
common_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
// Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
......@@ -77,14 +73,14 @@ int main(int argc, char ** argv){
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
LOG("\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
LOG("%s", common_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
......@@ -93,8 +89,8 @@ int main(int argc, char ** argv){
const auto t_enc_start = ggml_time_us();
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
const auto t_enc_end = ggml_time_us();
......@@ -106,7 +102,7 @@ int main(int argc, char ** argv){
bool has_eos = false;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
std::vector<llama_token> draft;
......@@ -121,23 +117,23 @@ int main(int argc, char ** argv){
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
llama_kv_cache_dump_view_seqs(kvc_view, 40);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
// print current draft sequence
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0;
while (true) {
// sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
llama_token id = common_sampler_sample(smpl, ctx, i_dft);
llama_sampling_accept(ctx_sampling, ctx, id, true);
common_sampler_accept(smpl, id, true);
const std::string token_str = llama_token_to_piece(ctx, id);
const std::string token_str = common_token_to_piece(ctx, id);
if (!params.use_color) {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
if (llama_token_is_eog(model, id)) {
......@@ -148,7 +144,7 @@ int main(int argc, char ** argv){
// check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past;
++i_dft;
......@@ -156,25 +152,25 @@ int main(int argc, char ** argv){
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
if (params.use_color) {
// color accepted draft token
printf("\033[34m%s\033[0m", token_str.c_str());
LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout);
}
continue;
}
if (params.use_color) {
printf("%s", token_str.c_str());
LOG("%s", token_str.c_str());
}
fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear();
draft.push_back(id);
......@@ -182,7 +178,7 @@ int main(int argc, char ** argv){
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
break;
......@@ -196,18 +192,18 @@ int main(int argc, char ** argv){
// clean the cache of draft tokens that weren't accepted
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
llama_batch_clear(batch_tgt);
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
common_batch_clear(batch_tgt);
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
// Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back());
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
for (size_t i = 1; i < draft.size(); ++i) {
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
t_draft_us += ggml_time_us() - t_start_draft_us;
......@@ -222,28 +218,29 @@ int main(int argc, char ** argv){
auto t_dec_end = ggml_time_us();
// Update dynamic ngram cache with context ngram cache and save it to disk:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG_TEE("\n\n");
LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("\ntarget:\n\n");
common_perf_print(ctx, smpl);
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx);
common_sampler_free(smpl);
llama_sampling_free(ctx_sampling);
llama_batch_free(batch_tgt);
llama_free(ctx);
......@@ -251,7 +248,7 @@ int main(int argc, char ** argv){
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}
......@@ -69,7 +69,7 @@ In this section, we cover the most commonly used options for running the `llama-
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
## Input Prompts
......@@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite
If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
### Temperature
......@@ -185,6 +187,30 @@ Use the `--no-penalize-nl` option to disable newline penalization when applying
Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
### DRY Repetition Penalty
DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).
- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled).
- `--dry-base N`: Set the DRY sampling base value (default: 1.75).
- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2).
- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size).
- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used.
The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8.
The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions.
The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words.
The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens.
The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied.
DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence.
Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"`
### Top-K Sampling
- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
......@@ -209,14 +235,6 @@ The Min-P sampling method was designed as an alternative to Top-P, and aims to e
Example usage: `--min-p 0.05`
### Tail-Free Sampling (TFS)
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
Example usage: `--tfs 0.95`
### Locally Typical Sampling
- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
......@@ -239,6 +257,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
### XTC Sampling
- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
### Logit Bias
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
......@@ -282,10 +313,6 @@ These options help improve the performance and memory usage of the LLaMA models.
These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
### Memory Float 32
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
### Batch Size
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
......@@ -306,6 +333,15 @@ These options help improve the performance and memory usage of the LLaMA models.
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
## LoRA (Low-Rank Adaptation) adapters
- `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters.
- `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can repeated to use multiple adapters.
You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`.
LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed.
## Additional Options
These options provide extra functionality and customization when running the LLaMA models:
......@@ -314,6 +350,4 @@ These options provide extra functionality and customization when running the LLa
- `--verbose-prompt`: Print the prompt before generating text.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
#include "arg.h"
#include "common.h"
#include "console.h"
#include "log.h"
#include "sampling.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
......@@ -33,13 +33,23 @@
static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_params * g_params;
static common_sampler ** g_smpl;
static common_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool need_insert_eot = false;
static void print_usage(int argc, char ** argv) {
(void) argc;
LOG("\nexample usage:\n");
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
LOG("\n");
}
static bool file_exists(const std::string & path) {
std::ifstream f(path.c_str());
return f.good();
......@@ -52,50 +62,6 @@ static bool file_is_empty(const std::string & path) {
return f.tellg() == 0;
}
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
if (params.logdir.empty()) {
return;
}
const std::string timestamp = string_get_sortable_timestamp();
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
const std::string logfile_path = params.logdir + timestamp + ".yml";
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "# Generation Results #\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
yaml_dump_string_multiline(logfile, "output", output.c_str());
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
......@@ -104,50 +70,37 @@ static void sigint_handler(int signo) {
need_insert_eot = true;
} else {
console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
LOG("\n");
common_perf_print(*g_ctx, *g_smpl);
// make sure all logs are flushed
LOG("Interrupted by user\n");
common_log_pause(common_log_main());
_exit(130);
}
}
}
#endif
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
common_chat_msg new_msg{role, content};
auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
}
int main(int argc, char ** argv) {
gpt_params params;
common_params params;
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}
llama_sampling_params & sparams = params.sparams;
common_init();
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
auto & sparams = params.sampling;
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
......@@ -155,123 +108,141 @@ int main(int argc, char ** argv) {
atexit([]() { console::cleanup(); });
if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
LOG_INF("%s: llama backend init\n", __func__);
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
std::vector<llama_chat_msg> chat_msgs;
llama_model * model = nullptr;
llama_context * ctx = nullptr;
common_sampler * smpl = nullptr;
std::vector<common_chat_msg> chat_msgs;
g_model = &model;
g_ctx = &ctx;
g_smpl = &smpl;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
common_init_result llama_init = common_init_from_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
}
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);
set_process_priority(params.cpuparams.priority);
struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
return 1;
}
// Start the non-batch threadpool in the paused state
tpp.paused = true;
}
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print chat template example in conversation mode
if (params.conversation) {
if (params.enable_chat_template) {
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
} else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
}
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
}
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
if (!file_exists(path_session)) {
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
LOG_INF("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
const bool add_bos = llama_should_add_bos_token(model);
const bool add_bos = llama_add_bos_token(model);
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(llama_add_eos_token(model) != 1);
GGML_ASSERT(!llama_add_eos_token(model));
}
LOG("add_bos: %d\n", add_bos);
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp;
......@@ -280,49 +251,31 @@ int main(int argc, char ** argv) {
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
LOG_DBG("tokenize the prompt\n");
embd_inp = common_tokenize(ctx, prompt, true, true);
} else {
LOG("use session tokens\n");
LOG_DBG("use session tokens\n");
embd_inp = session_tokens;
}
LOG("prompt: \"%s\"\n", log_tostr(prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_TEE("error: input is empty\n");
LOG_ERR("input is empty\n");
return -1;
}
}
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
......@@ -336,14 +289,14 @@ int main(int argc, char ** argv) {
n_matching_session_tokens++;
}
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
LOG_TEE("%s: using full prompt from session file\n", __func__);
LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size());
} else {
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size());
}
......@@ -351,14 +304,13 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
LOGLN(
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1);
}
......@@ -380,30 +332,20 @@ int main(int argc, char ** argv) {
}
if (params.verbose_prompt) {
LOG_TEE("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > add_bos) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
LOG_CNT("'\n");
}
LOG_TEE("\n");
LOG_INF("\n");
}
// ctrl+C handling
......@@ -423,47 +365,56 @@ int main(int argc, char ** argv) {
}
if (params.interactive) {
LOG_TEE("%s: interactive mode on.\n", __func__);
LOG_INF("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
auto tmp = common_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n");
LOG_INF("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
smpl = common_sampler_init(model, sparams);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
}
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
......@@ -477,9 +428,9 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
}
LOG_TEE("\n\n");
LOG_INF("\n");
if (params.interactive) {
const char * control_message;
......@@ -491,11 +442,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_TEE("== Running in interactive mode. ==\n");
LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_TEE( "%s\n", control_message);
LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first;
}
......@@ -509,7 +460,6 @@ int main(int argc, char ** argv) {
int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
int n_past_guidance = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
......@@ -521,28 +471,21 @@ int main(int argc, char ** argv) {
display = params.display_prompt;
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
// tokenized antiprompts
std::vector<std::vector<llama_token>> antiprompt_ids;
antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
}
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
}
if (llama_model_has_encoder(model)) {
int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
......@@ -568,9 +511,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size);
console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
fflush(stdout);
}
if (ga_n == 1) {
......@@ -578,16 +520,22 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
if (n_past + (int) embd.size() >= n_ctx) {
if (!params.ctx_shift){
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break;
}
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
......@@ -595,15 +543,11 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
if (ctx_guidance) {
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG("clear session path\n");
LOG_DBG("clear session path\n");
path_session.clear();
}
} else {
......@@ -613,10 +557,10 @@ int main(int argc, char ** argv) {
const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
LOG("\n");
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
LOG_DBG("\n");
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
......@@ -626,7 +570,7 @@ int main(int argc, char ** argv) {
ga_i += ga_w/ga_n;
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
}
}
......@@ -652,65 +596,25 @@ int main(int argc, char ** argv) {
}
}
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG("n_past = %d\n", n_past);
LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
}
}
......@@ -721,7 +625,6 @@ int main(int argc, char ** argv) {
}
embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// optionally save the session on first sample (for faster prompt loading next time)
......@@ -729,14 +632,14 @@ int main(int argc, char ** argv) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str());
LOG_DBG("saved session to %s\n", path_session.c_str());
}
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
const llama_token id = common_sampler_sample(smpl, ctx, -1);
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
common_sampler_accept(smpl, id, /* accept_grammar= */ true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
......@@ -746,16 +649,16 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget
--n_remain;
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
......@@ -767,10 +670,10 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
const std::string token_str = common_token_to_piece(ctx, id, params.special);
// Console/Stream Output
fprintf(stdout, "%s", token_str.c_str());
LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
......@@ -782,8 +685,6 @@ int main(int argc, char ** argv) {
output_tokens.push_back(id);
output_ss << token_str;
}
fflush(stdout);
}
}
......@@ -798,7 +699,7 @@ int main(int argc, char ** argv) {
// check for reverse prompt in the last n_prev tokens
if (!params.antiprompt.empty()) {
const int n_prev = 32;
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
......@@ -820,7 +721,7 @@ int main(int argc, char ** argv) {
}
// check for reverse prompt using special tokens
llama_token last_token = llama_sampling_last(ctx_sampling);
llama_token last_token = common_sampler_last(smpl);
for (std::vector<llama_token> ids : antiprompt_ids) {
if (ids.size() == 1 && last_token == ids[0]) {
if (params.interactive) {
......@@ -832,18 +733,18 @@ int main(int argc, char ** argv) {
}
if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str());
LOG_DBG("found antiprompt: %s\n", last_output.c_str());
}
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found an EOG token\n");
if (llama_token_is_eog(model, common_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n");
if (params.interactive) {
if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true;
}
......@@ -852,32 +753,32 @@ int main(int argc, char ** argv) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
printf("\n");
LOG("\n");
}
}
// if current token is not EOG, we add it to current assistant message
if (params.conversation) {
auto id = llama_sampling_last(ctx_sampling);
assistant_ss << llama_token_to_piece(ctx, id, false);
const auto id = common_sampler_last(smpl);
assistant_ss << common_token_to_piece(ctx, id, false);
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
LOG_DBG("waiting for user input\n");
if (params.conversation) {
printf("\n> ");
LOG("\n> ");
}
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str());
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG("%s", params.input_prefix.c_str());
}
// color user input only
......@@ -900,11 +801,11 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
printf("%s", params.input_suffix.c_str());
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str());
}
LOG("buffer: '%s'\n", buffer.c_str());
LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
......@@ -917,11 +818,11 @@ int main(int argc, char ** argv) {
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
......@@ -937,16 +838,16 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token);
output_ss << common_token_to_piece(ctx, token);
}
// reset assistant message
assistant_ss.str("");
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
......@@ -954,7 +855,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
common_sampler_reset(smpl);
}
is_interacting = false;
}
......@@ -962,7 +863,7 @@ int main(int argc, char ** argv) {
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n");
LOG(" [end of text]\n");
break;
}
......@@ -975,23 +876,22 @@ int main(int argc, char ** argv) {
}
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
LOG("\n\n");
common_perf_print(ctx, smpl);
common_sampler_free(smpl);
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
llama_sampling_free(ctx_sampling);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
return 0;
}
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cmath>
......@@ -50,8 +53,8 @@ static std::vector<std::string> k_prompts = {
struct client {
~client() {
if (ctx_sampling) {
llama_sampling_free(ctx_sampling);
if (smpl) {
common_sampler_free(smpl);
}
}
......@@ -72,7 +75,7 @@ struct client {
std::string prompt;
std::string response;
struct llama_sampling_context * ctx_sampling = nullptr;
struct common_sampler * smpl = nullptr;
};
static void print_date_time() {
......@@ -81,7 +84,9 @@ static void print_date_time() {
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
LOG_INF("\n");
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
LOG_INF("\n");
}
// Define a split string function to ...
......@@ -98,13 +103,14 @@ static std::vector<std::string> split_string(const std::string& input, char deli
int main(int argc, char ** argv) {
srand(1234);
gpt_params params;
common_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}
common_init();
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
......@@ -119,41 +125,34 @@ int main(int argc, char ** argv) {
const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("parallel", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the target model
llama_init_result llama_init = llama_init_from_gpt_params(params);
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
} else {
// Output each line of the input params.prompts vector and copy to k_prompts
int index = 0;
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
std::vector<std::string> prompts = split_string(params.prompt, '\n');
for (const auto& prompt : prompts) {
k_prompts.resize(index + 1);
k_prompts[index] = prompt;
index++;
printf("%3d prompt: %s\n", index, prompt.c_str());
LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
}
}
fprintf(stderr, "\n\n");
fflush(stderr);
LOG_INF("\n\n");
const int n_ctx = llama_n_ctx(ctx);
......@@ -161,11 +160,11 @@ int main(int argc, char ** argv) {
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.ctx_sampling = llama_sampling_init(params.sparams);
client.smpl = common_sampler_init(model, params.sampling);
}
std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true);
tokens_system = common_tokenize(ctx, k_system, true);
const int32_t n_tokens_system = tokens_system.size();
llama_seq_id g_seq_id = 0;
......@@ -182,19 +181,19 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_TEE("\n");
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_INF("\n");
{
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
common_batch_add(batch, tokens_system[i], i, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
......@@ -203,18 +202,18 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("\n");
LOG_INF("\n");
}
LOG_TEE("Processing requests ...\n\n");
LOG_INF("Processing requests ...\n\n");
while (true) {
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
llama_kv_cache_dump_view_seqs(kvc_view, 40);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
llama_batch_clear(batch);
common_batch_clear(batch);
// decode any currently ongoing sequences
for (auto & client : clients) {
......@@ -224,7 +223,7 @@ int main(int argc, char ** argv) {
client.i_batch = batch.n_tokens;
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
client.n_decoded += 1;
}
......@@ -237,7 +236,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);
LOG_INF("%s: clearing the KV cache\n", __func__);
}
// insert new sequences for decoding
......@@ -253,14 +252,14 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:";
client.response = "";
llama_sampling_reset(client.ctx_sampling);
common_sampler_reset(client.smpl);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
tokens_prompt = common_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
}
// extract the logits only for the last token
......@@ -272,7 +271,7 @@ int main(int argc, char ** argv) {
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
g_seq_id += 1;
......@@ -309,18 +308,17 @@ int main(int argc, char ** argv) {
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return 1;
}
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1;
......@@ -331,7 +329,7 @@ int main(int argc, char ** argv) {
continue;
}
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
......@@ -341,9 +339,9 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
common_sampler_accept(client.smpl, id, true);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
......@@ -351,7 +349,7 @@ int main(int argc, char ** argv) {
client.t_start_gen = ggml_time_us();
}
const std::string token_str = llama_token_to_piece(ctx, id);
const std::string token_str = common_token_to_piece(ctx, id);
client.response += token_str;
client.sampled = id;
......@@ -376,7 +374,7 @@ int main(int argc, char ** argv) {
const auto t_main_end = ggml_time_us();
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
......@@ -399,21 +397,22 @@ int main(int argc, char ** argv) {
print_date_time();
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
if (params.prompt_file.empty()) {
params.prompt_file = "used built-in defaults";
}
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Cache misses: %6d\n", n_cache_miss);
LOG_TEE("\n");
LOG_INF("\n");
llama_print_timings(ctx);
// TODO: print sampling/grammar timings for all clients
llama_perf_context_print(ctx);
llama_batch_free(batch);
......@@ -422,7 +421,7 @@ int main(int argc, char ** argv) {
llama_backend_free();
fprintf(stderr, "\n\n");
LOG("\n\n");
return 0;
}
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
......@@ -6,27 +8,24 @@
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\n");
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
common_params params;
params.n_junk = 250;
params.n_keep = 32;
params.i_pos = -1;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1;
}
srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
common_init();
int n_junk = params.n_junk;
int n_keep = params.n_keep;
......@@ -62,36 +61,41 @@ int main(int argc, char ** argv) {
// initialize the model
llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
LOG_ERR("%s: unable to load model\n" , __func__);
return 1;
}
// initialize the context
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
llama_context_params ctx_params = common_context_params_to_llama(params);
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1;
}
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
tokens_list = common_tokenize(ctx, params.prompt, true);
// tokenize the prefix and use it as a sink
const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size();
const int n_tokens_all = tokens_list.size();
......@@ -106,14 +110,14 @@ int main(int argc, char ** argv) {
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
// print the prompt token-by-token
LOG_TEE("\n");
LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
LOG_INF("\n");
LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
LOG_INF("prompt tokens: %d\n", n_tokens_all);
//LOG_INF("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
......@@ -133,10 +137,10 @@ int main(int argc, char ** argv) {
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
}
llama_batch_clear(batch);
common_batch_clear(batch);
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
}
if (i + n_batch >= n_tokens_all) {
......@@ -144,11 +148,11 @@ int main(int argc, char ** argv) {
}
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_INF("%s: llama_decode() failed\n", __func__);
return 1;
}
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
if (i + n_batch >= n_tokens_all) {
break;
......@@ -158,7 +162,7 @@ int main(int argc, char ** argv) {
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch;
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
......@@ -167,10 +171,10 @@ int main(int argc, char ** argv) {
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
llama_batch_clear(batch);
common_batch_clear(batch);
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
}
if (i + n_batch >= n_tokens_all) {
......@@ -178,18 +182,18 @@ int main(int argc, char ** argv) {
}
if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
}
{
const int n_discard = n_past - n_ctx + n_predict;
if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
......@@ -200,76 +204,64 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("\n");
LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
LOG_TEE("\n");
LOG_INF("\n");
LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
LOG_INF("\n");
// main loop
int n_cur = n_tokens_all;
int n_decode = 0;
LOG_TEE("%s", prompt_suffix.c_str());
fflush(stdout);
LOG_INF("%s", prompt_suffix.c_str());
const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) {
// sample the next token
{
auto n_vocab = llama_n_vocab(model);
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// sample the most likely token
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG_TEE("\n");
LOG("\n");
break;
}
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
n_decode += 1;
// prepare the next batch
llama_batch_clear(batch);
common_batch_clear(batch);
// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
}
n_cur += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
LOG_TEE("\n");
LOG("\n");
const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
llama_print_timings(ctx);
LOG("\n");
llama_perf_context_print(ctx);
LOG("\n");
fprintf(stderr, "\n");
llama_sampler_free(smpl);
llama_batch_free(batch);
......
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <mutex>
#include <random>
#include <sstream>
#include <thread>
#include <mutex>
#include <atomic>
#include <vector>
#include <array>
#include <fstream>
#include <sstream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
......@@ -31,55 +34,6 @@ struct results_log_softmax {
float prob;
};
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const struct results_perplexity & results
) {
if (params.logdir.empty()) {
return;
}
if (params.hellaswag) {
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
return;
}
const std::string timestamp = string_get_sortable_timestamp();
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
const std::string logfile_path = params.logdir + timestamp + ".yml";
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "# Perplexity Results #\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
yaml_dump_vector_float(logfile, "logits", results.logits);
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
yaml_dump_vector_float(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
}
static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
......@@ -166,7 +120,7 @@ static void process_logits(
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
......@@ -200,7 +154,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
......@@ -278,7 +232,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
kld.sum_kld += sum;
kld.sum_kld2 += sum*sum;
++kld.count;
if (imax == imax_base) ++kld.n_same_top;
if (imax == imax_base) {
++kld.n_same_top;
}
const float p_base = expf(-nll_base);
const float p = expf(-nll);
......@@ -320,7 +276,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
break;
}
lock.unlock();
std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v.first;
p_diff_values[i] = v.second;
}
......@@ -334,25 +290,25 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
}
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
......@@ -363,16 +319,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
prob_history.resize(tokens.size());
if (params.ppl_stride <= 0) {
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = n_ctx;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
......@@ -380,20 +336,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
int count = 0;
double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
//LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
......@@ -402,14 +359,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
common_batch_clear(batch);
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
}
//LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_decode(ctx, batch)) {
//LOG_ERR("%s : failed to eval\n", __func__);
llama_batch_free(batch);
return {tokens, -1, logit_history, prob_history};
}
......@@ -421,34 +385,35 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
const auto batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
if (j == 0) {
tokens[batch_start] = token_org;
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
LOG("%.2f minutes\n", total_seconds / 60.0);
}
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
logits.begin() + size_t(j + 0) * n_vocab,
logits.begin() + size_t(j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
......@@ -459,18 +424,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
fflush(stdout);
}
printf("\n");
LOG("\n");
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) {
static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params);
}
......@@ -480,33 +444,33 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
......@@ -519,9 +483,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
int count = 0;
double nll = 0.0;
double nll2 = 0.0;
......@@ -536,10 +501,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
logits.reserve(size_t(n_ctx) * n_vocab);
}
fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
......@@ -612,13 +577,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
}
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
LOG_INF("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
if (num_batches > 1 && n_outputs > 0) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
}
}
......@@ -627,13 +592,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total*n_chunk/n_seq);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
LOG("%.2f minutes\n", total_seconds / 60.0);
}
for (int seq = 0; seq < n_seq_batch; seq++) {
......@@ -655,19 +620,20 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
if (av2 > 0) {
av2 = sqrt(av2/(count-1));
}
LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
}
}
fflush(stdout);
logits.clear();
}
printf("\n");
LOG("\n");
nll2 /= count;
nll /= count;
......@@ -675,9 +641,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
}
llama_batch_free(batch);
......@@ -685,10 +651,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
return {tokens, ppl, logit_history, prob_history};
}
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
int prev_outputs = 0;
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
llama_batch batch_view = {
n_tokens,
......@@ -698,12 +664,11 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
......@@ -712,7 +677,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
n_outputs += batch_view.logits[i] != 0;
}
memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
prev_outputs += n_outputs;
}
......@@ -727,7 +692,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
if (eval_results.size() != eval_pairs.size()) {
eval_results.resize(eval_pairs.size());
}
if (eval_pairs.empty()) return;
if (eval_pairs.empty()) {
return;
}
size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
......@@ -735,11 +702,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
float local_logprobs[K_TOKEN_CHUNK];
while (true) {
size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
if (first >= eval_results.size()) break;
size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
if (first >= eval_results.size()) {
break;
}
const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
for (size_t i = first; i < last; ++i) {
auto logits = batch_logits + eval_pairs[i].first * n_vocab;
const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
float max_logit = logits[0];
for (int j = 1; j < n_vocab; ++j) {
max_logit = std::max(max_logit, logits[j]);
......@@ -762,7 +731,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
}
}
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
static void hellaswag_score(llama_context * ctx, const common_params & params) {
// Calculates hellaswag score (acc_norm) from prompt
//
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
......@@ -789,15 +758,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
if (prompt_lines.size() % 6 != 0) {
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
return;
}
size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
......@@ -824,7 +793,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<llama_token> seq_tokens[4];
};
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines
std::vector<hs_data_t> hs_data(hs_task_count);
......@@ -843,7 +812,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
for (size_t j = 0; j < 4; j++) {
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
}
// determine the common prefix of the endings
......@@ -870,16 +839,17 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
}
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
printf("\ntask\tacc_norm\n");
LOG("\ntask\tacc_norm\n");
double acc = 0.0f;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
......@@ -887,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<float> tok_logits(n_vocab);
// TODO: this could be made smaller; it's currently the worst-case size
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
......@@ -899,7 +869,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
llama_batch_clear(batch);
common_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending
......@@ -915,7 +885,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1;
......@@ -925,7 +895,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// TODO: don't evaluate the last token of each sequence
for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits;
}
}
......@@ -940,7 +910,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
......@@ -948,7 +918,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
......@@ -974,7 +944,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
auto & hs_cur = hs_data[i];
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
......@@ -998,7 +968,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
}
//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
//LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
// If the gold ending got the maximum logprobe add one accuracy point
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
......@@ -1006,8 +976,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
// Print the accumulated accuracy mean x 100
printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
fflush(stdout);
LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
}
i0 = i1 - 1;
......@@ -1015,7 +984,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
llama_batch_free(batch);
printf("\n");
LOG("\n");
}
struct winogrande_entry {
......@@ -1059,7 +1028,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
}
}
if (ipos != 4) {
printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
continue;
}
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
......@@ -1073,13 +1042,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
if (sentence[where] == '_') break;
}
if (where == int(sentence.size())) {
printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
continue;
}
std::istringstream stream(answer.c_str());
int i_answer; stream >> i_answer;
if (stream.fail() || i_answer < 1 || i_answer > 2) {
printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
continue;
}
result.emplace_back();
......@@ -1102,20 +1071,20 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
* 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
*
*/
static void winogrande_score(llama_context * ctx, const gpt_params & params) {
static void winogrande_score(llama_context * ctx, const common_params & params) {
constexpr int k_min_trailing_ctx = 3;
auto data = load_winogrande_from_csv(params.prompt);
if (data.empty()) {
fprintf(stderr, "%s: no tasks\n", __func__);
LOG_ERR("%s: no tasks\n", __func__);
return;
}
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
std::mt19937 rng(1);
std::vector<int> aux(data.size());
for (int i = 0; i < int(data.size()); ++i) {
......@@ -1133,11 +1102,11 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
data = std::move(selected);
}
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
LOG_INF("%s : tokenizing selected tasks\n", __func__);
for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
task.common_prefix = 0;
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
......@@ -1152,16 +1121,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
task.seq_tokens[0].size() - task.common_prefix +
task.seq_tokens[1].size() - task.common_prefix;
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
}
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 128;
const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
......@@ -1169,7 +1139,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
std::vector<float> tok_logits(n_vocab);
// TODO: this could be made smaller; it's currently the worst-case size
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
......@@ -1184,7 +1154,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
size_t i1 = i0;
size_t i_logits = 0;
llama_batch_clear(batch);
common_batch_clear(batch);
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
int n_logits = 0;
......@@ -1194,7 +1164,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
}
for (size_t i = 0; i < data[i1].common_prefix; ++i) {
llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
}
batch.logits[batch.n_tokens - 1] = true;
n_logits += 1;
......@@ -1202,7 +1172,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
for (int s = 0; s < 2; ++s) {
// TODO: end before the last token, no need to predict past the end of the sequences
for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
n_logits += 1;
}
}
......@@ -1217,7 +1187,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
......@@ -1225,7 +1195,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
......@@ -1285,20 +1255,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
++n_done;
// print the accumulated accuracy mean x 100
printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
fflush(stdout);
LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
}
i0 = i1 - 1;
}
printf("\n");
LOG("\n");
if (n_done < 100) return;
const float p = 1.f*n_correct/n_done;
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
}
static bool deserialize_string(std::istream & in, std::string & str) {
......@@ -1347,7 +1317,7 @@ struct multiple_choice_task {
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
......@@ -1355,11 +1325,11 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
printf("%s: found empty answer\n", __func__);
LOG_ERR("%s: found empty answer\n", __func__);
}
return false;
}
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
}
auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) {
......@@ -1403,20 +1373,20 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
// git@hf.co:datasets/Stevross/mmlu
// https://huggingface.co/datasets/truthful_qa
//
static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
static void multiple_choice_score(llama_context * ctx, const common_params & params) {
std::istringstream strstream(params.prompt);
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__);
LOG_ERR("%s: no tasks\n", __func__);
return;
}
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to read task positions from prompt\n", __func__);
LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
return;
}
......@@ -1424,21 +1394,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
printf("%s: reading tasks", __func__);
LOG_INF("%s: reading tasks", __func__);
int n_dot = std::max((int) n_task/100, 1);
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
if (i%n_dot == 0) printf(".");
if (i%n_dot == 0) LOG(".");
}
printf("done\n");
LOG("done\n");
}
else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
......@@ -1451,18 +1421,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
printf("%s: preparing task data", __func__);
fflush(stdout);
LOG_INF("%s: preparing task data", __func__);
if (n_task > 500) {
printf("...");
fflush(stdout);
LOG("...");
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
......@@ -1486,11 +1454,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
printf("done\n");
fflush(stdout);
LOG("done\n");
int nbad = n_bad;
if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad);
LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
......@@ -1502,28 +1469,28 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
return;
}
if (i_task%n_dot == 0) {
printf(".");
fflush(stdout);
LOG(".");
}
}
printf("done\n");
LOG("done\n");
}
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n");
LOG("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
......@@ -1540,7 +1507,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t i1 = i0;
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
llama_batch_clear(batch);
common_batch_clear(batch);
// batch as much tasks as possible into the available context
// each task has 4 unique sequence ids - one for each ending
......@@ -1563,7 +1530,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
n_logits += 1;
......@@ -1573,7 +1540,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// TODO: don't evaluate the last token of each sequence
for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
const bool needs_logits = i < seq_tokens_size - 1;
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
n_logits += needs_logits;
}
}
......@@ -1590,7 +1557,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
......@@ -1598,7 +1565,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
......@@ -1622,16 +1589,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1);
// LOG("%d", j+1);
// }
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
//LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
......@@ -1640,13 +1607,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]);
//LOG(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
//LOG(" Final: %g\n", log_prob / count);
//LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
......@@ -1666,8 +1633,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
++n_done;
// Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
}
i0 = i1 - 1;
......@@ -1679,29 +1645,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
LOG("\n");
LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n");
LOG_INF("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
static void kl_divergence(llama_context * ctx, const common_params & params) {
if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
......@@ -1709,39 +1676,40 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
int n_vocab, n_chunk;
int n_vocab;
int n_chunk;
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
logits.reserve(size_t(n_ctx) * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
......@@ -1775,13 +1743,15 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
......@@ -1794,9 +1764,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
common_batch_clear(batch);
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
}
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
llama_batch_free(batch);
return;
}
......@@ -1805,105 +1780,105 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
LOG("%.2f minutes\n", total_seconds / 60.0);
}
LOG("\n");
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
printf("%4d", i+1);
LOG("%4d", i+1);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
printf("\n");
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
fflush(stdout);
LOG("\n");
logits.clear();
}
printf("\n");
LOG("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end());
printf("====== Perplexity statistics ======\n");
LOG("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
// LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
printf("\n");
LOG("\n");
printf("====== KL divergence statistics ======\n");
LOG("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
......@@ -1915,67 +1890,68 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
};
printf("Maximum KLD: %10.6f\n", kld_values.back());
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("Median KLD: %10.6f\n", kld_median);
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
printf("Minimum KLD: %10.6f\n", kld_values.front());
LOG("Maximum KLD: %10.6f\n", kld_values.back());
LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
LOG("Median KLD: %10.6f\n", kld_median);
LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
LOG("Minimum KLD: %10.6f\n", kld_values.front());
printf("\n");
LOG("\n");
printf("====== Token probability statistics ======\n");
LOG("====== Token probability statistics ======\n");
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2];
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
// LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count;
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
}
int main(int argc, char ** argv) {
gpt_params params;
common_params params;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
return 1;
}
common_init();
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
......@@ -2000,45 +1976,35 @@ int main(int argc, char ** argv) {
}
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
llama_backend_init();
llama_numa_init(params.numa);
// load the model and apply lora adapter, if any
llama_init_result llama_init = llama_init_from_gpt_params(params);
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
struct results_perplexity results;
......@@ -2054,8 +2020,8 @@ int main(int argc, char ** argv) {
results = perplexity(ctx, params, n_ctx);
}
llama_print_timings(ctx);
write_logfile(ctx, params, model, results);
LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
llama_free_model(model);
......
#define LLAMA_API_INTERNAL
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "llama-impl.h"
#include <algorithm>
#include <cassert>
......@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
}
static void test_roundtrip_on_chunk(
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
if (layer->type == GGML_TYPE_F16) {
......@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
if (use_reference) {
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
}
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
......@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
// Run quantization function for a single layer and update error stats
static void test_roundtrip_on_layer(
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
......@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
int num_chunks = (nelements + chunk_size - 1)/chunk_size;
if (num_chunks < 2 || max_thread < 2) {
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
output_scratch.data(), print_layer_stats ? layer_error : total_error);
} else {
auto & stats = print_layer_stats ? layer_error : total_error;
std::mutex mutex;
uint64_t counter = 0;
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
&quantized_scratch, &output_scratch, chunk_size] () {
error_stats local_stats {};
while (true) {
......@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
}
lock.unlock();
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
}
};
......@@ -320,7 +320,6 @@ int main(int argc, char ** argv) {
auto cparams = llama_context_default_params();
cparams.n_ctx = 256;
cparams.seed = 1;
ctx = llama_new_context_with_model(model, cparams);
......@@ -372,8 +371,9 @@ int main(int argc, char ** argv) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
if (qfns.from_float && qfns.to_float) {
const auto * qfns = ggml_get_type_traits(type);
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
if (qfns_cpu->from_float && qfns->to_float) {
if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type));
}
......@@ -394,7 +394,7 @@ int main(int argc, char ** argv) {
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
qfns,
*qfns, *qfns_cpu,
params.reference,
kv_tensor.second,
input_scratch,
......
set(TARGET llama-quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
......@@ -34,7 +34,7 @@ Run the quantized model:
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
......@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
......
......@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
......@@ -61,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
static bool striequals(const char * a, const char * b) {
while (*a && *b) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
a++; b++;
}
return *a == *b;
}
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
......@@ -68,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
ftype_str.push_back(std::toupper(ch));
}
for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) {
if (striequals(it.name.c_str(), ftype_str.c_str())) {
ftype = it.ftype;
ftype_str_out = it.name;
return true;
......@@ -91,7 +103,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
}
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
......@@ -104,7 +116,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
......@@ -223,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
}
static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT;
for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
auto type = ggml_type(j);
for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
auto type = (ggml_type)i;
const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) {
result = type; break;
if (name && striequals(name, arg)) {
return type;
}
}
return result;
fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
return GGML_TYPE_COUNT;
}
int main(int argc, char ** argv) {
......@@ -252,12 +264,18 @@ int main(int argc, char ** argv) {
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
if (params.output_tensor_type == GGML_TYPE_COUNT) {
usage(argv[0]);
}
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
if (params.token_embedding_type == GGML_TYPE_COUNT) {
usage(argv[0]);
}
} else {
usage(argv[0]);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment