Commit 97ef6ff8 authored by xuxzh1

update

parent 4cc1a614
Pipeline #2023 canceled with stages
@@ -94,6 +94,9 @@ namespace console {
             simple_io = true;
         }
     }
+    if (simple_io) {
+        _setmode(_fileno(stdin), _O_U8TEXT);
+    }
 #else
     // POSIX-specific console initialization
     if (!simple_io) {
......
@@ -611,7 +611,7 @@ private:
            }
            return join_seq();
        };
-       return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+       return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
    }
    /*
......
#include "log.h"
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
// colors
enum common_log_col : int {
COMMON_LOG_COL_DEFAULT = 0,
COMMON_LOG_COL_BOLD,
COMMON_LOG_COL_RED,
COMMON_LOG_COL_GREEN,
COMMON_LOG_COL_YELLOW,
COMMON_LOG_COL_BLUE,
COMMON_LOG_COL_MAGENTA,
COMMON_LOG_COL_CYAN,
COMMON_LOG_COL_WHITE,
};
// disable colors by default
static std::vector<const char *> g_col = {
"",
"",
"",
"",
"",
"",
"",
"",
"",
};
struct common_log_entry {
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
std::vector<char> msg;
// signals the worker thread to stop
bool is_end;
void print(FILE * file = nullptr) const {
FILE * fcur = file;
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
fcur = stdout;
if (level != GGML_LOG_LEVEL_NONE) {
fcur = stderr;
}
}
if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
g_col[COMMON_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
g_col[COMMON_LOG_COL_DEFAULT]);
}
switch (level) {
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break;
default:
break;
}
}
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
struct common_log {
// default capacity - will be expanded if needed
common_log() : common_log(256) {}
common_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
head = 0;
tail = 0;
resume();
}
~common_log() {
pause();
if (file) {
fclose(file);
}
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
FILE * file;
bool prefix;
bool timestamps;
bool running;
int64_t t_start;
// ring buffer of entries
std::vector<common_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
common_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = entries[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
va_list args_copy;
va_copy(args_copy, args);
#if 1
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
}
#else
// hack for bolding arguments
std::stringstream ss;
for (int i = 0; fmt[i] != 0; i++) {
if (fmt[i] == '%') {
ss << LOG_COL_BOLD;
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
ss << LOG_COL_DEFAULT;
if (fmt[i] == 0) break;
}
ss << fmt[i];
}
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
}
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<common_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
}
void resume() {
std::lock_guard<std::mutex> lock(mtx);
if (running) {
return;
}
running = true;
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
head = (head + 1) % entries.size();
}
if (cur.is_end) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
void pause() {
{
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
return;
}
running = false;
// push an entry to signal the worker thread to stop
{
auto & entry = entries[tail];
entry.is_end = true;
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
thrd.join();
}
void set_file(const char * path) {
pause();
if (file) {
fclose(file);
}
if (path) {
file = fopen(path, "w");
} else {
file = nullptr;
}
resume();
}
void set_colors(bool colors) {
pause();
if (colors) {
g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[COMMON_LOG_COL_RED] = LOG_COL_RED;
g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
}
}
resume();
}
void set_prefix(bool prefix) {
std::lock_guard<std::mutex> lock(mtx);
this->prefix = prefix;
}
void set_timestamps(bool timestamps) {
std::lock_guard<std::mutex> lock(mtx);
this->timestamps = timestamps;
}
};
//
// public API
//
struct common_log * common_log_init() {
return new common_log;
}
struct common_log * common_log_main() {
static struct common_log log;
return &log;
}
void common_log_pause(struct common_log * log) {
log->pause();
}
void common_log_resume(struct common_log * log) {
log->resume();
}
void common_log_free(struct common_log * log) {
delete log;
}
void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}
void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
}
void common_log_set_prefix(struct common_log * log, bool prefix) {
log->set_prefix(prefix);
}
void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
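Reviewer note: for orientation, the following is a minimal, hypothetical sketch of how the public API at the end of this new file could be driven. It only uses the functions defined above, assumes `log.h` declares them together with `ggml_log_level`, and the file name and message values are made up.

```cpp
// Hypothetical usage sketch of the common_log API defined above.
#include "log.h"

int main() {
    struct common_log * log = common_log_main();   // process-wide instance (static, created on first use)

    common_log_set_colors(log, true);              // enable ANSI colors
    common_log_set_prefix(log, true);              // print the I/W/E/D level prefix
    common_log_set_timestamps(log, true);          // print [M.s.ms.us] timestamps
    common_log_set_file(log, "session.log");       // also mirror output to a file (made-up name)

    common_log_add(log, GGML_LOG_LEVEL_INFO,  "loaded %d tensors\n", 291);
    common_log_add(log, GGML_LOG_LEVEL_WARN,  "context size is small: %d\n", 512);
    common_log_add(log, GGML_LOG_LEVEL_ERROR, "failed to open %s\n", "model.gguf");

    common_log_pause(log);                         // stop the worker thread before exit
    return 0;
}
```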
@@ -2,10 +2,13 @@
 #include "common.h"
 #include "log.h"
+#include <cinttypes>
 #include <cstdint>
+#include <cstdio>
 #include <fstream>
+#include <thread>
-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                               std::vector<llama_token> & inp, int nnew, bool print_progress) {
     const int64_t t_start_ms = ggml_time_ms();
     const int64_t inp_size = inp.size();
@@ -17,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
         const int64_t i_start = std::max(inp_size - nnew, ngram_size);
         for (int64_t i = i_start; i < inp_size; ++i) {
             const int64_t ngram_start = i - ngram_size;
-            llama_ngram ngram(&inp[ngram_start], ngram_size);
+            common_ngram ngram(&inp[ngram_start], ngram_size);
             const llama_token token = inp[i];
-            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
             if (part_it == ngram_cache.end()) {
-                llama_ngram_cache_part part;
+                common_ngram_cache_part part;
                 part.emplace(token, 1);
                 ngram_cache.emplace(ngram, part);
             } else {
-                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                 if (token_count_it == part_it->second.end()) {
                     part_it->second.emplace(token, 1);
                 } else {
@@ -59,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
 constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
         return -1;
     }
-    const llama_ngram_cache_part part_static = part_static_it->second;
+    const common_ngram_cache_part part_static = part_static_it->second;
     int max_count_static = 0;
     int sum_count_static = 0;
@@ -92,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {
     llama_token drafted_token = -1;
     for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
+        const common_ngram ngram_primary = ngrams_primary[i];
-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
         if (part_primary_it == nc_primary.end()) {
             continue;
         }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
+        const common_ngram_cache_part part_primary = part_primary_it->second;
         int max_count_primary = 0;
         int max_count_static = 0;
@@ -114,7 +117,7 @@ static llama_token try_draft(
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
             const int32_t count_primary = token_count_primary.second;
             const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -139,9 +142,9 @@ static llama_token try_draft(
     return drafted_token;
 }
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
 ) {
     GGML_ASSERT(draft.size() == 1);
     const int inp_size = inp.size();
@@ -154,21 +157,21 @@ void llama_ngram_cache_draft(
         llama_token drafted_token = -1;
         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static;
+        common_ngram ngram_static;
         for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
             ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
         }
-        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        llama_ngram_cache_part part_static;
+        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        common_ngram_cache_part part_static;
         if (part_static_it != nc_static.end()) {
             part_static = part_static_it->second;
         }
         // cd = context + dynamic
-        std::vector<llama_ngram> ngrams_cd;
+        std::vector<common_ngram> ngrams_cd;
         for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
             const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd;
+            common_ngram ngram_cd;
             for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                 ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
             }
@@ -193,16 +196,16 @@ void llama_ngram_cache_draft(
     }
 }
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
     std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram ngram = item.first;
-        llama_ngram_cache_part token_counts = item.second;
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram ngram = item.first;
+        common_ngram_cache_part token_counts = item.second;
         GGML_ASSERT(!token_counts.empty());
         const int32_t ntokens = token_counts.size();
         GGML_ASSERT(ntokens > 0);
-        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
         file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
         for (std::pair<llama_token, int32_t> item2 : token_counts) {
             const llama_token token = item2.first;
@@ -216,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 }
-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
         throw std::ifstream::failure("Unable to open file " + filename);
     }
-    llama_ngram_cache ngram_cache;
-    llama_ngram ngram;
+    common_ngram_cache ngram_cache;
+    common_ngram ngram;
     int32_t ntokens;
     llama_token token;
     int32_t count;
@@ -232,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     char * ntokensc = reinterpret_cast<char*>(&ntokens);
     char * tokenc = reinterpret_cast<char*>(&token);
     char * countc = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
         GGML_ASSERT(!hashmap_file.eof());
         GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
         GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
+        common_ngram_cache_part token_counts;
         for (int i = 0; i < ntokens; ++i) {
             GGML_ASSERT(!hashmap_file.eof());
@@ -254,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     return ngram_cache;
 }
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram ngram = ngram_part.first;
-        llama_ngram_cache_part part = ngram_part.second;
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram ngram = ngram_part.first;
+        common_ngram_cache_part part = ngram_part.second;
-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
         if (part_merged_it == ngram_cache_target.end()) {
             ngram_cache_target.emplace(ngram, part);
             continue;
@@ -270,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
             const int32_t count = token_count.second;
             GGML_ASSERT(count > 0);
-            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
             if (token_count_merged_it == part_merged_it->second.end()) {
                 part_merged_it->second.emplace(token, count);
                 continue;
......
@@ -12,22 +12,22 @@
 // Data structures to map n-grams to empirical token probabilities:
-struct llama_ngram {
+struct common_ngram {
     llama_token tokens[LLAMA_NGRAM_MAX];
-    llama_ngram() {
+    common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = -1;
         }
     }
-    llama_ngram(const llama_token * input, const int ngram_size) {
+    common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = i < ngram_size ? input[i] : -1;
         }
     }
-    bool operator==(const llama_ngram & other) const {
+    bool operator==(const common_ngram & other) const {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             if (tokens[i] != other.tokens[i]) {
                 return false;
@@ -37,28 +37,28 @@ struct llama_ngram {
     }
 };
-struct llama_token_hash_function {
+struct common_token_hash_function {
     size_t operator()(const llama_token token) const {
         // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
         return token * 11400714819323198485llu;
     }
 };
-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
         for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
         }
         return hash;
     }
 };
 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
 // Try to draft tokens from ngram caches.
 // inp: the tokens generated so far.
@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
 // nc_context: ngram cache based on current context.
 // nc_dynamic: ngram cache based on previous user generations.
 // nc_static: ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename: the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
-// Load an ngram cache saved with llama_ngram_cache_save.
+// Load an ngram cache saved with common_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+common_ngram_cache common_ngram_cache_load(std::string & filename);
 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add: the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
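Reviewer note: a short, hypothetical usage sketch of the renamed n-gram cache API, built only from the declarations above; the cache file name, the n-gram size range, and the draft length are illustrative values, not something this header mandates.

```cpp
// Hypothetical usage sketch of the common_ngram_cache API declared above.
#include "ngram-cache.h"

#include <string>
#include <vector>

void example(std::vector<llama_token> & tokens_so_far) {
    common_ngram_cache nc_context;

    // index the whole context at once (nnew == all tokens), without a progress bar
    common_ngram_cache_update(nc_context, 1, 4, tokens_so_far, (int) tokens_so_far.size(), false);

    // draft up to 8 tokens; `draft` must start with exactly one token (the last sampled one)
    // (assumes tokens_so_far already holds at least LLAMA_NGRAM_STATIC tokens)
    std::vector<llama_token> draft = { tokens_so_far.back() };
    common_ngram_cache nc_dynamic, nc_static; // empty caches, just for this sketch
    common_ngram_cache_draft(tokens_so_far, draft, 8, 1, 4, nc_context, nc_dynamic, nc_static);

    // persist the context cache for later loading/merging (illustrative file name)
    std::string path = "ngrams.bin";
    common_ngram_cache_save(nc_context, path);
}
```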
@@ -2,159 +2,103 @@
 #include "llama.h"
-#include "grammar-parser.h"
+#include "common.h"
-#include <random>
 #include <string>
-#include <unordered_map>
 #include <vector>
-// sampler types
-enum class llama_sampler_type : char {
-    TOP_K       = 'k',
-    TOP_P       = 'p',
-    MIN_P       = 'm',
-    TFS_Z       = 'f',
-    TYPICAL_P   = 'y',
-    TEMPERATURE = 't'
-};
-// sampling parameters
-typedef struct llama_sampling_params {
-    int32_t n_prev = 64;                // number of previous tokens to remember
-    int32_t n_probs = 0;                // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0;               // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k = 40;                 // <= 0 to use vocab size
-    float   top_p = 0.95f;              // 1.0 = disabled
-    float   min_p = 0.05f;              // 0.0 = disabled
-    float   tfs_z = 1.00f;              // 1.0 = disabled
-    float   typical_p = 1.00f;          // 1.0 = disabled
-    float   temp = 0.80f;               // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range = 0.00f;     // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f;  // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n = 64;        // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat = 1.00f;     // 1.0 = disabled
-    float   penalty_freq = 0.00f;       // 0.0 = disabled
-    float   penalty_present = 0.00f;    // 0.0 = disabled
-    int32_t mirostat = 0;               // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau = 5.00f;       // target entropy
-    float   mirostat_eta = 0.10f;       // learning rate
-    bool    penalize_nl = false;        // consider newlines as a repeatable token
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
-    std::vector<llama_sampler_type> samplers_sequence = {
-        llama_sampler_type::TOP_K,
-        llama_sampler_type::TFS_Z,
-        llama_sampler_type::TYPICAL_P,
-        llama_sampler_type::TOP_P,
-        llama_sampler_type::MIN_P,
-        llama_sampler_type::TEMPERATURE
-    };
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float cfg_scale = 1.f;           // how strong is guidance
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool use_penalty_prompt_tokens = false;
-} llama_sampling_params;
-// general sampler context
-// TODO: move to llama.h
-struct llama_sampling_context {
-    // parameters that will be used for sampling
-    llama_sampling_params params;
-    // mirostat sampler state
-    float mirostat_mu;
-    llama_grammar * grammar;
-    // internal
-    grammar_parser::parse_state parsed_grammar;
-    // TODO: replace with ring-buffer
-    std::vector<llama_token>      prev;
-    std::vector<llama_token_data> cur;
-    size_t n_valid; // Number of correct top tokens with correct probabilities.
-    std::mt19937 rng;
-};
-#include "common.h"
-// Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
-void llama_sampling_free(struct llama_sampling_context * ctx);
-// Reset the sampler context
-// - clear prev tokens
-// - reset grammar
-void llama_sampling_reset(llama_sampling_context * ctx);
-// Set the sampler seed
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
-// Copy the sampler context
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
-// Get the last sampled token
-llama_token llama_sampling_last(llama_sampling_context * ctx);
-// Get a string representation of the last sampled tokens
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
-// Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-// Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_reset when a sequence ends
-//
-// required:
-//  - ctx_main:     context to use for sampling
-//  - ctx_sampling: sampling-specific context
-//
-// optional:
-//  - ctx_cfg: context to use for classifier-free guidance
-//  - idx:     sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sampling_sample(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = -1);
-// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
-llama_token_data_array llama_sampling_prepare(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0,
-        bool apply_grammar = true,
-        std::vector<float> * original_logits = nullptr);
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar);
+// common_sampler extends llama_sampler with additional functionality:
+//
+// - grammar support
+// - custom sampler logic based on the parameters
+// - history of the last accepted tokens
+// - performance metrics
+//
+// This goal is to have a common implementation of the sampling logic shared across the examples.
+// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
+// complex (top-k, top-p, etc).
+//
+// Another example is related to the grammar. In general, the grammar constraints applied on the full
+// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
+// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
+// grammar constraints are applied to the full vocabulary and the token is resampled.
+//
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
+// be moved into the core llama library.
+//
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
+// This can be used to access the probabilities of the rest of the non-sampled tokens.
+//
+// TODO: measure grammar performance
+//
+struct common_sampler;
+// llama_sampler API overloads
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+void common_sampler_free(struct common_sampler * gsmpl);
+// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                    common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
+// arguments can be nullptr to skip printing
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+// extended sampling implementation:
+//
+// - set logits
+// - apply the configured sampler chain
+// - check if the token fits the grammar (if any)
+// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+//
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+//   is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
+// helpers
+// access the internal list of current candidate tokens
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// get the last accepted token
+llama_token common_sampler_last(const struct common_sampler * gsmpl);
+// print the sampler chain into a string
+std::string common_sampler_print(const struct common_sampler * gsmpl);
+// get a string representation of the last accepted tokens
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
#pragma once
#include "llama.h"
#include "common.h"
struct common_speculative;
struct common_speculative_params {
int n_draft = 16; // max drafted tokens
int n_reuse = 256;
float p_min = 0.9f; // min probability required to accept a token in the draft
};
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
void common_speculative_free(struct common_speculative * spec);
bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft);
// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,
struct common_speculative_params params,
const llama_tokens & prompt,
llama_token id_last);
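Reviewer note: a brief, hypothetical sketch of how this speculative-decoding helper could be wired up, using only the functions declared above; creation of the target and draft contexts and the verification of the drafted tokens are elided.

```cpp
// Hypothetical usage sketch of the common_speculative API declared above.
#include "speculative.h"

void speculate_once(struct llama_context * ctx_tgt, struct llama_context * ctx_dft,
                    const llama_tokens & prompt, llama_token id_last) {
    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        return; // the target and draft contexts cannot be used together
    }

    struct common_speculative * spec = common_speculative_init(ctx_dft);

    common_speculative_params params;
    params.n_draft = 16;   // defaults shown in the struct above
    params.p_min   = 0.9f;

    // propose up to n_draft tokens continuing `prompt` + `id_last` with the draft model
    const llama_tokens draft = common_speculative_gen_draft(spec, params, prompt, id_last);
    (void) draft; // verification against the target model (e.g. common_sampler_sample_and_accept_n) elided

    common_speculative_free(spec);
}
```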
@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil
 from hashlib import sha256
 from enum import IntEnum, auto
@@ -71,6 +72,7 @@ models = [
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
     {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
@@ -80,6 +82,7 @@ models = [
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
     {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
@@ -94,6 +97,11 @@ models = [
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
+    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
 ]
@@ -122,6 +130,21 @@ def download_model(model):
     if tokt == TOKENIZER_TYPE.UGM:
         files.append("spiece.model")
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
     for file in files:
         save_path = f"models/tokenizers/{name}/{file}"
         if os.path.isfile(save_path):
......
@@ -116,7 +116,7 @@ class Tensor:
         assert quant is not None, 'Unknown tensor type'
         (blksize, tysize) = quant
         offset += 12
-        self.dtype = dtype
+        self.dtype = gguf.GGMLQuantizationType(dtype)
         self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
......