Commit a83eaa7a authored by Michael Yang

update llama.cpp to e782c9e735f93ab4767ffc37462c523b73a17ddc

parent 5156e48c
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
......
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
......
 // +build darwin
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                     } break;
                 case GGML_TYPE_Q5_K:
@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                     } break;
                 case GGML_TYPE_Q6_K:
@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
                         GGML_ASSERT(ne02 == 1);
                         GGML_ASSERT(ne12 == 1);
-                        nth0 = 4;
-                        nth1 = 16;
+                        nth0 = 2;
+                        nth1 = 32;
                         [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                     } break;
                 default:
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
             [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
-            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                src0t == GGML_TYPE_Q4_K) {
+                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+            }
+            else if (src0t == GGML_TYPE_Q5_K) {
+                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+            }
+            else if (src0t == GGML_TYPE_Q6_K) {
+                [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
             else if (src0t == GGML_TYPE_Q2_K ||
-                     src0t == GGML_TYPE_Q3_K ||
-                     src0t == GGML_TYPE_Q4_K ||
-                     src0t == GGML_TYPE_Q5_K ||
-                     src0t == GGML_TYPE_Q6_K) {
+                     src0t == GGML_TYPE_Q3_K) {
                 [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             } else {
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(
             const float eps = 1e-6f;
-            const int nth = 256;
+            const int nth = 512;
             [encoder setComputePipelineState:ctx->pipeline_rms_norm];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
             [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
             [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
             const int64_t nrows = ggml_nrows(src0);
@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(
             const int n_past = ((int32_t *)(src1->data))[0];
+            float freq_base;
+            float freq_scale;
+            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
             [encoder setComputePipelineState:ctx->pipeline_rope];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
             [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
             [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
             [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
             [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
             [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
             [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
             [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
             [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
             [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
             [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
             [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
             [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
             [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
             [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
             [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
             [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
             [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
             [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         } break;
......
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -227,8 +227,13 @@
 #define GGML_MAX_NAME          48
 #define GGML_DEFAULT_N_THREADS 4
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
 #define GGML_UNUSED(x) (void)(x)
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -389,6 +394,8 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -468,6 +475,10 @@ extern "C" {
         // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
         int n_tasks[GGML_MAX_NODES];
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
     };
     // computation graph
@@ -1136,6 +1147,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 freq_base,
+            float                 freq_scale,
+            int                   n_ctx);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
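The pre-existing ggml_rope_inplace() is, in effect, the special case of the new call with the stock LLaMA frequencies; a minimal equivalence sketch (illustrative, not part of this diff):

    // assuming the defaults the loader also uses: freq_base = 10000.0f, freq_scale = 1.0f
    struct ggml_tensor * r0 = ggml_rope_inplace       (ctx, a, n_past, n_dims, mode, n_ctx);
    struct ggml_tensor * r1 = ggml_rope_custom_inplace(ctx, a, n_past, n_dims, mode,
                                                        10000.0f /* freq_base  */,
                                                        1.0f     /* freq_scale */,
                                                        n_ctx);  // r0 and r1 describe the same rotation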
@@ -1190,6 +1212,31 @@ extern "C" {
             int                   s,
             int                   d);
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor* ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0, // kernel size
+            int                   s0, // stride
+            int                   p0); // padding
+
+    GGML_API struct ggml_tensor* ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
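A minimal usage sketch of the new 1D pooling op (not from this commit; it assumes the ggml_build_forward / ggml_graph_compute_with_ctx helpers already present in ggml.h):

    #include "ggml.h"

    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx0 = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
    for (int i = 0; i < 8; ++i) ggml_set_f32_1d(a, i, (float) i);

    // kernel 2, stride 2, padding 0 -> output length (8 - 2)/2 + 1 = 4
    struct ggml_tensor * p = ggml_pool_1d(ctx0, a, GGML_OP_POOL_AVG, 2, 2, 0);

    struct ggml_cgraph gf = ggml_build_forward(p);
    ggml_graph_compute_with_ctx(ctx0, &gf, 1);
    // expected: ggml_get_f32_1d(p, 0..3) == 0.5, 2.5, 4.5, 6.5

    ggml_free(ctx0);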
@@ -1329,7 +1376,7 @@ extern "C" {
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API void              ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     GGML_API void              ggml_graph_reset  (struct ggml_cgraph * cgraph);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
......
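Together with the GGML_EXIT_* codes and the abort_callback field added to ggml_cplan above, a caller can now cancel a long graph evaluation mid-flight. A rough sketch of the intended pattern (illustrative; assumes a graph gf has already been built):

    #include "ggml.h"
    #include <cstdio>
    #include <vector>

    static volatile bool g_should_abort = false;   // e.g. flipped by a signal handler or another thread

    static bool my_abort_cb(void * /*data*/) {
        return g_should_abort;                     // returning true makes ggml_graph_compute stop early
    }

    // ...
    struct ggml_cplan plan = ggml_graph_plan(&gf, GGML_DEFAULT_N_THREADS);
    std::vector<uint8_t> work(plan.work_size);     // caller owns the work buffer when work_size > 0
    plan.work_data           = work.data();
    plan.abort_callback      = my_abort_cb;
    plan.abort_callback_data = NULL;

    if (ggml_graph_compute(&gf, &plan) == GGML_EXIT_ABORTED) {
        fprintf(stderr, "graph evaluation aborted\n");
    }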
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
......
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -41,6 +41,14 @@
 #define K_SCALE_SIZE 12
 #endif
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+
 //
 // Super-block quantization structures
 //
......
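The shim keeps the C11 static_assert keyword usable (and degrades it to a harmless no-op declaration on pre-C11 compilers), so the block-layout checks in this header keep compiling everywhere. The kind of check it protects looks like this (illustrative, not part of the hunk):

    static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");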
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -201,13 +201,13 @@ struct llama_mmap {
     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
         int fd = fileno(file->fp);
-        int flags = MAP_PRIVATE;
+        int flags = MAP_SHARED;
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
         if (prefetch) { flags |= MAP_POPULATE; }
 #endif
-        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
@@ -249,7 +249,7 @@ struct llama_mmap {
             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }
-        addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
         error = GetLastError();
         CloseHandle(hMapping);
......
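Both mappings are now strictly read-only: the weights are only ever read through the mmap, so a private copy-on-write view (MAP_PRIVATE / FILE_MAP_COPY with PROT_READ | PROT_WRITE) buys nothing, while MAP_SHARED / FILE_MAP_READ lets the pages stay shared with the page cache. A stripped-down sketch of the POSIX side (illustrative; fd and len stand in for the loader's file handle and size):

    #include <sys/mman.h>

    int flags = MAP_SHARED;
    #ifdef __linux__
    flags |= MAP_POPULATE;                 // optional readahead; the loader skips it on NUMA systems
    #endif
    void * addr = mmap(NULL, len, PROT_READ, flags, fd, 0);
    if (addr == MAP_FAILED) { /* report strerror(errno) */ }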
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   512ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
     };
     return k_sizes;
 }
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   768ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
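For a concrete sense of the new formulas (illustrative arithmetic, not from the diff): at the old default n_ctx = 512 the 7B scratch0 entry becomes 512/16 + 256 = 288 MB instead of the fixed 512 MB, while at n_ctx = 8192 it grows to 8192/16 + 256 = 768 MB; the eval buffer only gains n_ctx/256 MB, e.g. 768 + 32 = 800 MB for 7B at n_ctx = 8192.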
@@ -215,6 +216,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
     bool operator!=(const llama_hparams & other) const {
@@ -329,7 +334,7 @@ struct llama_model {
 };
 struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -350,7 +355,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
     const llama_model & model;
-    const llama_vocab & vocab;
     bool model_owner = false;
@@ -577,7 +581,9 @@ struct llama_file_loader {
         }
         // skip to the next multiple of 32 bytes
-        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        }
         tensor.file_off = file.tell();
         tensor.name = name;
@@ -674,7 +680,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.rope_freq_base              =*/ 10000.0f,
+        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
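These two defaults are what llama_eval_internal() further down feeds into ggml_rope_custom_inplace(); a caller-side sketch (illustrative; the model path is a placeholder) of the linear RoPE-scaling scheme referenced in llama.h below:

    #include "llama.h"

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 4096;
    cparams.rope_freq_base  = 10000.0f;   // unchanged base frequency
    cparams.rope_freq_scale = 0.5f;       // compress positions 2x, so a 2048-token model can address ~4096

    struct llama_model   * model = llama_load_model_from_file("model.bin", cparams);
    struct llama_context * lctx  = llama_new_context_with_model(model, cparams);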
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
     }
         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base  = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
     {
         fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
         fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",  __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
                 offload_func_kq(tmpq);
                 ggml_set_name(tmpq, "tmpq");
-                struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+                struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                 offload_func_kq(Kcur);
                 ggml_set_name(Kcur, "Kcur");
-                struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+                struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                 offload_func_kq(Qcur);
                 ggml_set_name(Qcur, "Qcur");
@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
     float cum_sum = 0.0f;
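In the degenerate case where every second derivative is zero (a perfectly flat tail), the old code divided by zero; with the guard, the sum falls below 1e-6 and each entry is set to 1.0f / second_derivatives.size() instead, i.e. a uniform weighting over the candidates.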
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
                           float   scale,
                           float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
-    llama_context * ctx = new llama_context(*model, model->vocab);
+    llama_context * ctx = new llama_context(*model);
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
-int llama_tokenize(
-        struct llama_context * ctx,
+int llama_tokenize_with_model(
+    const struct llama_model * model,
                   const char * text,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos);
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3581,8 +3615,29 @@ int llama_tokenize(
     return res.size();
 }
+int llama_tokenize(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3593,19 +3648,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
-int llama_get_vocab(
-        const struct llama_context * ctx,
+int llama_get_vocab_from_model(
+         const struct llama_model * model,
                       const char * * strings,
                              float * scores,
                                int   capacity) {
-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = ctx->vocab.id_to_token[i].score;
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
     }
     return n;
 }
+int llama_get_vocab(
+        const struct llama_context * ctx,
+                      const char * * strings,
+                             float * scores,
+                               int   capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    if (token >= llama_n_vocab(ctx)) {
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
-    return ctx->vocab.id_to_token[token].tok.c_str();
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 llama_token llama_token_bos() {
......
 /**
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
  *
  * MIT License
  *
@@ -115,6 +115,11 @@ extern "C" {
         int32_t n_gpu_layers;                    // number of layers to store in VRAM
         int32_t main_gpu;                        // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES];   // how to split layers across multiple GPUs
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
@@ -174,6 +179,8 @@ extern "C" {
         int32_t n_eval;
     };
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
@@ -296,10 +303,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -308,6 +326,12 @@ extern "C" {
                              float * scores,
                                int   capacity);
+    LLAMA_API int llama_get_vocab_from_model(
+              const struct llama_model * model,
+                            const char * * strings,
+                                   float * scores,
+                                     int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -320,7 +344,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+                           llama_token   token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+              const struct llama_model * model,
+                           llama_token   token);
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
......
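The new *_with_model entry points let a tool tokenize text or inspect the vocabulary from a bare llama_model, without ever allocating a llama_context. A rough sketch (illustrative; the model path is a placeholder):

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    struct llama_context_params params = llama_context_default_params();
    params.vocab_only = true;                       // no weights needed just to tokenize

    struct llama_model * model = llama_load_model_from_file("model.bin", params);

    std::vector<llama_token> toks(64);
    int n = llama_tokenize_with_model(model, "Hello world", toks.data(), (int) toks.size(), /*add_bos=*/ true);
    for (int i = 0; i < n; ++i) {
        printf("%d -> '%s'\n", toks[i], llama_token_to_str_with_model(model, toks[i]));
    }

    llama_free_model(model);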