Commit 97ef6ff8 authored by xuxzh1's avatar xuxzh1 🎱
Browse files

update

parent 4cc1a614
Pipeline #2023 canceled with stages
This diff is collapsed.
This diff is collapsed.
......@@ -94,6 +94,9 @@ namespace console {
simple_io = true;
}
}
if (simple_io) {
_setmode(_fileno(stdin), _O_U8TEXT);
}
#else
// POSIX-specific console initialization
if (!simple_io) {
......
......@@ -611,7 +611,7 @@ private:
}
return join_seq();
};
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
}
/*
......
#include "log.h"
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
// colors
enum common_log_col : int {
COMMON_LOG_COL_DEFAULT = 0,
COMMON_LOG_COL_BOLD,
COMMON_LOG_COL_RED,
COMMON_LOG_COL_GREEN,
COMMON_LOG_COL_YELLOW,
COMMON_LOG_COL_BLUE,
COMMON_LOG_COL_MAGENTA,
COMMON_LOG_COL_CYAN,
COMMON_LOG_COL_WHITE,
};
// disable colors by default
static std::vector<const char *> g_col = {
"",
"",
"",
"",
"",
"",
"",
"",
"",
};
struct common_log_entry {
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
std::vector<char> msg;
// signals the worker thread to stop
bool is_end;
void print(FILE * file = nullptr) const {
FILE * fcur = file;
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
fcur = stdout;
if (level != GGML_LOG_LEVEL_NONE) {
fcur = stderr;
}
}
if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
g_col[COMMON_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
g_col[COMMON_LOG_COL_DEFAULT]);
}
switch (level) {
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break;
default:
break;
}
}
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
struct common_log {
// default capacity - will be expanded if needed
common_log() : common_log(256) {}
common_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
head = 0;
tail = 0;
resume();
}
~common_log() {
pause();
if (file) {
fclose(file);
}
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
FILE * file;
bool prefix;
bool timestamps;
bool running;
int64_t t_start;
// ring buffer of entries
std::vector<common_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
common_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = entries[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
va_list args_copy;
va_copy(args_copy, args);
#if 1
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
}
#else
// hack for bolding arguments
std::stringstream ss;
for (int i = 0; fmt[i] != 0; i++) {
if (fmt[i] == '%') {
ss << LOG_COL_BOLD;
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
ss << LOG_COL_DEFAULT;
if (fmt[i] == 0) break;
}
ss << fmt[i];
}
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
}
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<common_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
}
void resume() {
std::lock_guard<std::mutex> lock(mtx);
if (running) {
return;
}
running = true;
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
head = (head + 1) % entries.size();
}
if (cur.is_end) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
void pause() {
{
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
return;
}
running = false;
// push an entry to signal the worker thread to stop
{
auto & entry = entries[tail];
entry.is_end = true;
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
thrd.join();
}
void set_file(const char * path) {
pause();
if (file) {
fclose(file);
}
if (path) {
file = fopen(path, "w");
} else {
file = nullptr;
}
resume();
}
void set_colors(bool colors) {
pause();
if (colors) {
g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[COMMON_LOG_COL_RED] = LOG_COL_RED;
g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
}
}
resume();
}
void set_prefix(bool prefix) {
std::lock_guard<std::mutex> lock(mtx);
this->prefix = prefix;
}
void set_timestamps(bool timestamps) {
std::lock_guard<std::mutex> lock(mtx);
this->timestamps = timestamps;
}
};
//
// public API
//
struct common_log * common_log_init() {
return new common_log;
}
struct common_log * common_log_main() {
static struct common_log log;
return &log;
}
void common_log_pause(struct common_log * log) {
log->pause();
}
void common_log_resume(struct common_log * log) {
log->resume();
}
void common_log_free(struct common_log * log) {
delete log;
}
void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}
void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
}
void common_log_set_prefix(struct common_log * log, bool prefix) {
log->set_prefix(prefix);
}
void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
This diff is collapsed.
......@@ -2,10 +2,13 @@
#include "common.h"
#include "log.h"
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();
......@@ -17,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
llama_ngram ngram(&inp[ngram_start], ngram_size);
common_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
llama_ngram_cache_part part;
common_ngram_cache_part part;
part.emplace(token, 1);
ngram_cache.emplace(ngram, part);
} else {
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
if (token_count_it == part_it->second.end()) {
part_it->second.emplace(token, 1);
} else {
......@@ -59,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
return -1;
}
const llama_ngram_cache_part part_static = part_static_it->second;
const common_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0;
int sum_count_static = 0;
......@@ -92,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const llama_ngram ngram_primary = ngrams_primary[i];
const common_ngram ngram_primary = ngrams_primary[i];
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
if (part_primary_it == nc_primary.end()) {
continue;
}
const llama_ngram_cache_part part_primary = part_primary_it->second;
const common_ngram_cache_part part_primary = part_primary_it->second;
int max_count_primary = 0;
int max_count_static = 0;
......@@ -114,7 +117,7 @@ static llama_token try_draft(
for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first;
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
const int32_t count_primary = token_count_primary.second;
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
......@@ -139,9 +142,9 @@ static llama_token try_draft(
return drafted_token;
}
void llama_ngram_cache_draft(
void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
) {
GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size();
......@@ -154,21 +157,21 @@ void llama_ngram_cache_draft(
llama_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
llama_ngram ngram_static;
common_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
}
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
llama_ngram_cache_part part_static;
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
common_ngram_cache_part part_static;
if (part_static_it != nc_static.end()) {
part_static = part_static_it->second;
}
// cd = context + dynamic
std::vector<llama_ngram> ngrams_cd;
std::vector<common_ngram> ngrams_cd;
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
llama_ngram ngram_cd;
common_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
}
......@@ -193,16 +196,16 @@ void llama_ngram_cache_draft(
}
}
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
const llama_ngram ngram = item.first;
llama_ngram_cache_part token_counts = item.second;
for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
const common_ngram ngram = item.first;
common_ngram_cache_part token_counts = item.second;
GGML_ASSERT(!token_counts.empty());
const int32_t ntokens = token_counts.size();
GGML_ASSERT(ntokens > 0);
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first;
......@@ -216,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
}
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
common_ngram_cache common_ngram_cache_load(std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
}
llama_ngram_cache ngram_cache;
common_ngram_cache ngram_cache;
llama_ngram ngram;
common_ngram ngram;
int32_t ntokens;
llama_token token;
int32_t count;
......@@ -232,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
char * ntokensc = reinterpret_cast<char*>(&ntokens);
char * tokenc = reinterpret_cast<char*>(&token);
char * countc = reinterpret_cast<char*>(&count);
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
GGML_ASSERT(ntokens > 0);
llama_ngram_cache_part token_counts;
common_ngram_cache_part token_counts;
for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof());
......@@ -254,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
return ngram_cache;
}
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
const llama_ngram ngram = ngram_part.first;
llama_ngram_cache_part part = ngram_part.second;
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
const common_ngram ngram = ngram_part.first;
common_ngram_cache_part part = ngram_part.second;
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
if (part_merged_it == ngram_cache_target.end()) {
ngram_cache_target.emplace(ngram, part);
continue;
......@@ -270,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
const int32_t count = token_count.second;
GGML_ASSERT(count > 0);
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
if (token_count_merged_it == part_merged_it->second.end()) {
part_merged_it->second.emplace(token, count);
continue;
......
......@@ -12,22 +12,22 @@
// Data structures to map n-grams to empirical token probabilities:
struct llama_ngram {
struct common_ngram {
llama_token tokens[LLAMA_NGRAM_MAX];
llama_ngram() {
common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = -1;
}
}
llama_ngram(const llama_token * input, const int ngram_size) {
common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1;
}
}
bool operator==(const llama_ngram & other) const {
bool operator==(const common_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) {
return false;
......@@ -37,28 +37,28 @@ struct llama_ngram {
}
};
struct llama_token_hash_function {
struct common_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
struct common_ngram_hash_function {
size_t operator()(const common_ngram & ngram) const {
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
hash ^= common_token_hash_function{}(ngram.tokens[i]);
}
return hash;
}
};
// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
// Update an ngram cache with tokens.
......@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
void common_ngram_cache_update(
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
......@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
// Load an ngram cache saved with llama_ngram_cache_save.
// Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
common_ngram_cache common_ngram_cache_load(std::string & filename);
// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
This diff is collapsed.
......@@ -2,159 +2,103 @@
#include "llama.h"
#include "grammar-parser.h"
#include "common.h"
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
// sampler types
enum class llama_sampler_type : char {
TOP_K = 'k',
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
TYPICAL_P = 'y',
TEMPERATURE = 't'
};
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
llama_sampler_type::TFS_Z,
llama_sampler_type::TYPICAL_P,
llama_sampler_type::TOP_P,
llama_sampler_type::MIN_P,
llama_sampler_type::TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // how strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
std::vector<llama_token> penalty_prompt_tokens;
bool use_penalty_prompt_tokens = false;
} llama_sampling_params;
// general sampler context
// TODO: move to llama.h
struct llama_sampling_context {
// parameters that will be used for sampling
llama_sampling_params params;
// mirostat sampler state
float mirostat_mu;
llama_grammar * grammar;
// internal
grammar_parser::parse_state parsed_grammar;
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
size_t n_valid; // Number of correct top tokens with correct probabilities.
std::mt19937 rng;
};
// common_sampler extends llama_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters
// - history of the last accepted tokens
// - performance metrics
//
// This goal is to have a common implementation of the sampling logic shared across the examples.
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
// complex (top-k, top-p, etc).
//
// Another example is related to the grammar. In general, the grammar constraints applied on the full
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//
#include "common.h"
struct common_sampler;
// Create a new sampling context instance.
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
// llama_sampler API overloads
void llama_sampling_free(struct llama_sampling_context * ctx);
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
// Reset the sampler context
// - clear prev tokens
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
void common_sampler_free(struct common_sampler * gsmpl);
// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);
// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
// generalized version of common_sampler_sample
//
// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
//
// common_sampler_sample_n(gsmpl, ctx, { idx }, {});
//
// is equivalent to
//
// common_sampler_sample(gsmpl, ctx, idx);
// common_sampler_accept(gsmpl, token, true);
//
// requires: idxs.size() == draft.size() + 1
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
// helpers
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
// llama_sampling_reset when a sequence ends
//
// required:
// - ctx_main: context to use for sampling
// - ctx_sampling: sampling-specific context
//
// optional:
// - ctx_cfg: context to use for classifier-free guidance
// - idx: sample from llama_get_logits_ith(ctx, idx)
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = -1);
// get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl);
// print the sampler chain into a string
std::string common_sampler_print(const struct common_sampler * gsmpl);
// get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0,
bool apply_grammar = true,
std::vector<float> * original_logits = nullptr);
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
This diff is collapsed.
#pragma once
#include "llama.h"
#include "common.h"
struct common_speculative;
struct common_speculative_params {
int n_draft = 16; // max drafted tokens
int n_reuse = 256;
float p_min = 0.9f; // min probabiliy required to accept a token in the draft
};
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
void common_speculative_free(struct common_speculative * spec);
bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft);
// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,
struct common_speculative_params params,
const llama_tokens & prompt,
llama_token id_last);
This diff is collapsed.
This diff is collapsed.
......@@ -31,6 +31,7 @@ import re
import requests
import sys
import json
import shutil
from hashlib import sha256
from enum import IntEnum, auto
......@@ -71,6 +72,7 @@ models = [
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
......@@ -80,6 +82,7 @@ models = [
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
......@@ -94,6 +97,11 @@ models = [
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
]
......@@ -122,6 +130,21 @@ def download_model(model):
if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model")
if os.path.isdir(repo):
# If repo is a path on the file system, copy the directory
for file in files:
src_path = os.path.join(repo, file)
dst_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(dst_path):
logger.info(f"{name}: File {dst_path} already exists - skipping")
continue
if os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
logger.info(f"{name}: Copied {src_path} to {dst_path}")
else:
logger.warning(f"{name}: Source file {src_path} does not exist")
else:
# If repo is a URL, download the files
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
......
......@@ -116,7 +116,7 @@ class Tensor:
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment