Commit 22885aea authored by Jeffrey Morgan's avatar Jeffrey Morgan
Browse files

update `llama.cpp` to `f64d44a`

parent ed969d2a
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......@@ -420,6 +420,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
if (parent == NULL) {
break;
}
// if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data ||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
......
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
This diff is collapsed.
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
//go:build darwin
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
//go:build darwin
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......@@ -35,6 +35,11 @@
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#ifdef GGML_METAL_NDEBUG
#define metal_printf(...)
#else
......@@ -43,6 +48,8 @@
#define UNUSED(x) (void)(x)
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
struct ggml_metal_buffer {
const char * name;
......@@ -64,7 +71,7 @@ struct ggml_metal_context {
int n_buffers;
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
int concur_list[GGML_MAX_NODES];
int concur_list[GGML_MAX_CONCUR];
int concur_list_len;
// custom kernels
......@@ -398,10 +405,10 @@ void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
int nodes_unused[GGML_MAX_NODES];
int nodes_unused[GGML_MAX_CONCUR];
for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; }
ctx->concur_list_len = 0;
int n_left = gf->n_nodes;
......@@ -414,21 +421,33 @@ void ggml_metal_graph_find_concurrency(
for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
if (nodes_unused[i]) {
// if the requirements for gf->nodes[i] are satisfied
int exe_flag=1;
int exe_flag = 1;
// scan all srcs
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
if (src_cur) {
// if is leaf nodes it's satisfied.
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
// TODO: ggml_is_leaf()
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
continue;
}
// otherwise this src should be the output from previous nodes.
int is_found = 0;
// scan 2*search_depth back because we inserted barrier.
for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
//for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
is_found = 1;
break;
}
}
if (is_found == 0) {
exe_flag = 0;
break;
}
if (is_found == 0) {exe_flag = 0; break;}
}
}
if (exe_flag) {
......@@ -444,9 +463,9 @@ void ggml_metal_graph_find_concurrency(
if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
continue;
} else {
exe_flag = 0;
}
exe_flag = 0;
}
}
}
......@@ -463,11 +482,13 @@ void ggml_metal_graph_find_concurrency(
ctx->concur_list[level_pos + concurrency] = -1;
ctx->concur_list_len++;
// jump all sorted nodes at nodes_bak
while (!nodes_unused[n_start]) {n_start++;}
while (!nodes_unused[n_start]) {
n_start++;
}
level_pos += concurrency + 1;
}
if (ctx->concur_list_len > GGML_MAX_NODES) {
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
}
}
......@@ -481,7 +502,7 @@ void ggml_metal_graph_compute(
// else fallback to serial dispatch
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
......
//go:build darwin
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
//go:build mpi
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
//go:build mpi
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
//go:build opencl
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
//go:build opencl
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
This diff is collapsed.
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......@@ -209,6 +209,15 @@
# define GGML_API
#endif
// TODO: support for clang
#ifdef __GNUC__
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define GGML_DEPRECATED(func, hint) func
#endif
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
......@@ -400,6 +409,10 @@ extern "C" {
GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,
GGML_OP_MAP_CUSTOM1_F32,
GGML_OP_MAP_CUSTOM2_F32,
GGML_OP_MAP_CUSTOM3_F32,
GGML_OP_MAP_CUSTOM1,
GGML_OP_MAP_CUSTOM2,
GGML_OP_MAP_CUSTOM3,
......@@ -596,6 +609,8 @@ extern "C" {
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
......@@ -1266,7 +1281,7 @@ extern "C" {
// conv_1d with padding = half
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
GGML_API struct ggml_tensor * ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
......@@ -1279,7 +1294,7 @@ extern "C" {
GGML_OP_POOL_COUNT,
};
GGML_API struct ggml_tensor* ggml_pool_1d(
GGML_API struct ggml_tensor * ggml_pool_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
......@@ -1287,7 +1302,7 @@ extern "C" {
int s0, // stride
int p0); // padding
GGML_API struct ggml_tensor* ggml_pool_2d(
GGML_API struct ggml_tensor * ggml_pool_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
......@@ -1341,6 +1356,16 @@ extern "C" {
int h0,
int w);
GGML_API struct ggml_tensor * ggml_unary(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
GGML_API struct ggml_tensor * ggml_unary_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
// custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
......@@ -1350,73 +1375,129 @@ extern "C" {
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
GGML_API struct ggml_tensor * ggml_unary(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
ggml_unary_op_f32_t fun),
"use ggml_map_custom1 instead");
GGML_API struct ggml_tensor * ggml_unary_inplace(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
ggml_unary_op_f32_t fun),
"use ggml_map_custom1_inplace instead");
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun),
"use ggml_map_custom2 instead");
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun),
"use ggml_map_custom2_inplace instead");
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_custom1_op_f32_t fun),
"use ggml_map_custom1 instead");
GGML_API struct ggml_tensor * ggml_map_unary_f32(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_unary_op_f32_t fun);
ggml_custom1_op_f32_t fun),
"use ggml_map_custom1_inplace instead");
GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_unary_op_f32_t fun);
struct ggml_tensor * b,
ggml_custom2_op_f32_t fun),
"use ggml_map_custom2 instead");
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_custom2_op_f32_t fun),
"use ggml_map_custom2_inplace instead");
GGML_API struct ggml_tensor * ggml_map_binary_f32(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun);
struct ggml_tensor * c,
ggml_custom3_op_f32_t fun),
"use ggml_map_custom3 instead");
GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun);
struct ggml_tensor * c,
ggml_custom3_op_f32_t fun),
"use ggml_map_custom3_inplace instead");
// custom operators v2
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
#define GGML_N_TASKS_MAX -1
GGML_API struct ggml_tensor * ggml_map_custom1_f32(
GGML_API struct ggml_tensor * ggml_map_custom1(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_custom1_op_f32_t fun);
ggml_custom1_op_t fun,
int n_tasks,
void * userdata);
GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_custom1_op_f32_t fun);
ggml_custom1_op_t fun,
int n_tasks,
void * userdata);
GGML_API struct ggml_tensor * ggml_map_custom2_f32(
GGML_API struct ggml_tensor * ggml_map_custom2(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_custom2_op_f32_t fun);
ggml_custom2_op_t fun,
int n_tasks,
void * userdata);
GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_custom2_op_f32_t fun);
ggml_custom2_op_t fun,
int n_tasks,
void * userdata);
GGML_API struct ggml_tensor * ggml_map_custom3_f32(
GGML_API struct ggml_tensor * ggml_map_custom3(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
ggml_custom3_op_f32_t fun);
ggml_custom3_op_t fun,
int n_tasks,
void * userdata);
GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
ggml_custom3_op_f32_t fun);
ggml_custom3_op_t fun,
int n_tasks,
void * userdata);
// loss function
......
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......@@ -175,6 +175,46 @@ struct llama_file {
}
};
// llama_context_data
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t* ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) : ptr(p) {}
void write(const void * src, size_t size) override {
memcpy(ptr, src, size);
ptr += size;
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_file_context : llama_data_context {
llama_file* file;
size_t size_written = 0;
llama_data_file_context(llama_file * f) : file(f) {}
void write(const void * src, size_t size) override {
file->write_raw(src, size);
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
......@@ -205,7 +245,7 @@ struct llama_mmap {
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch) { flags |= MAP_POPULATE; }
if (prefetch >= file->size) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
......
This diff is collapsed.
package llm
/*
#cgo CPPFLAGS: -O3 -Wall -Wextra -Wno-unused-function -Wno-unused-variable -DNDEBUG -DGGML_USE_K_QUANTS
#cgo CXXFLAGS: -std=gnu++11
#cgo CFLAGS: -Ofast -std=c11 -fPIC
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -DNDEBUG -DGGML_USE_K_QUANTS
#cgo CXXFLAGS: -std=c++11 -fPIC
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
......
/**
* llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
* llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
*
* MIT License
*
......@@ -112,6 +112,19 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
// Signature for logging events
// Note that text includes the new line character at the end for most events.
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists.
// It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context
......@@ -221,6 +234,10 @@ extern "C" {
int32_t n_eval;
};
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
LLAMA_API int llama_max_devices();
LLAMA_API struct llama_context_params llama_context_default_params();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment