Commit 0ce8bcfd authored by xuxzh1

init

parent b0135f4b
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"
#include "ggml.h"
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
#ifdef __cplusplus
extern "C" {
#endif
#include "xxhash/xxhash.h"
#include "sha1/sha1.h"
#include "sha256/sha256.h"
#ifdef __cplusplus
}
#endif
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
#define HASH_TYPE_SHA256_STR "sha256"
#define HASH_TYPE_SHA1_STR "sha1"
#define HASH_TYPE_XXH64_STR "xxh64"
#define HASH_TYPE_UUID_STR "uuid"
typedef enum {
HASH_EXIT_SUCCESS = 0, // All hashes have been generated or validated
HASH_EXIT_FAILURE = 1, // Generic Failure
HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash validation was attempted, but the entry is missing from the manifest
HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
} hash_exit_code_t;
typedef enum {
HASH_MANIFEST_NOT_FOUND,
HASH_MANIFEST_MISMATCH,
HASH_MANIFEST_OK,
} hash_manifest_result_t;
struct hash_params {
std::string input;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
bool no_layer = false;
bool manifest_is_usable = false;
std::string manifest_file;
};
struct manifest_check_params {
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
};
static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
switch (value) {
case HASH_MANIFEST_NOT_FOUND: return "Not Found";
case HASH_MANIFEST_MISMATCH: return "Mismatch";
case HASH_MANIFEST_OK: return "Ok";
}
return "?";
}
static char const * hash_exit_code_to_str(hash_exit_code_t value) {
switch (value) {
case HASH_EXIT_SUCCESS: return "Success";
case HASH_EXIT_FAILURE: return "Failure";
case HASH_EXIT_MISMATCH: return "Mismatch";
case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
}
return "?";
}
static void hash_print_usage(const char * executable) {
const hash_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN\n", executable);
printf("\n");
printf("Hash a GGUF file");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
printf(" --all use all hash\n");
printf(" --no-layer exclude per layer hash\n");
printf(" --uuid generate UUIDv5 ID\n");
printf(" -c, --check <manifest> verify against a manifest\n");
printf("\n");
}
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
hash_print_usage(argv[0]);
exit(0);
}
if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
arg_found = true;
params.sha1 = true;
}
if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}
if (arg == "--sha256") {
arg_found = true;
params.sha256 = true;
}
if (arg == "--all") {
arg_found = true;
params.sha256 = true;
params.sha1 = true;
params.xxh64 = true;
}
if (arg == "--no-layer") {
arg_found = true;
params.no_layer = true;
}
if (arg == "-c" || arg == "--check") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.manifest_file = argv[arg_idx];
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument:" + arg);
}
if (argc - arg_idx < 1) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
}
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
bool result = true;
try {
hash_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
hash_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
if (manifest_file.empty()) {
return false;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return false;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
// hash_type_str hash_str tensor_name
// e.g. 'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0'
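// A manifest written by this tool can mix hash types and contains both per-tensor
// entries (e.g. 'test.gguf:tensor_0') and a whole-file entry (e.g. 'test.gguf'), one entry per line.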
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
if (line_stream >> file_hash_type) {
if (file_hash_type == HASH_TYPE_SHA256_STR) {
manifest_check.sha256 = true;
} else if (file_hash_type == HASH_TYPE_SHA1_STR) {
manifest_check.sha1 = true;
} else if (file_hash_type == HASH_TYPE_XXH64_STR) {
manifest_check.xxh64 = true;
} else if (file_hash_type == HASH_TYPE_UUID_STR) {
manifest_check.uuid = true;
}
}
}
return true;
}
static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
if (manifest_file.empty()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
std::string file_hash;
std::string file_tensor_name;
if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
// Line parsed. Check hash validity
if (file_hash_type != hash_type_str) {
continue;
}
if (file_tensor_name != tensor_name) {
continue;
}
return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
}
}
return HASH_MANIFEST_NOT_FOUND;
}
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
// Assumes that digest was processed correctly with the expected namespace
for (int i = 0; i < 16; i++) {
uuid[i] = sha1_digest[i];
}
// Set bits corresponding to UUID ver 5
uuid[ 6] &= ~(0xF << 4);
uuid[ 6] |= (5 << 4);
// Set bits corresponding to UUID variant 0b10XX
uuid[ 8] &= ~(0xc << 4);
uuid[ 8] |= (0x8 << 4);
}
static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
// sha1 init
SHA1_CTX sha1_model_hash_ctx;
if (hash_params.sha1) {
SHA1Init(&sha1_model_hash_ctx);
}
// sha256 init
sha256_t sha256_model_hash_ctx;
if (hash_params.sha256) {
sha256_init(&sha256_model_hash_ctx);
}
// sha1 for uuid init
SHA1_CTX sha1_for_uuid_ctx;
if (hash_params.uuid) {
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
SHA1Init(&sha1_for_uuid_ctx);
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
bool tensor_layer_has_mismatch = false;
bool model_has_mismatch = false;
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;
if (hash_params.xxh64) {
if (!hash_params.no_layer) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
char hex_result[17];
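// render the 64-bit hash as 16 lowercase hex characters, most significant byte first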
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
if (!hash_params.no_layer) {
// Per Layer Hash
char result[21]; // sha1 outputs 20 bytes
SHA1( result, (const char *)raw_data, n_bytes);
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.sha256) {
if (!hash_params.no_layer) {
// Per Layer Hash
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.uuid) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
}
}
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha1) {
unsigned char result[21];
SHA1Final(result, &sha1_model_hash_ctx);
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha256) {
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_final( &sha256_model_hash_ctx, result);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
}
}
if (hash_params.uuid) {
unsigned char result[21];
SHA1Final(result, &sha1_for_uuid_ctx);
unsigned char uuid[16];
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],
uuid[12], uuid[13], uuid[14], uuid[15]);
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_UUID_STR, string_buffer, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
}
}
ggml_free(ctx_data);
gguf_free(ctx);
if (hash_params.manifest_is_usable) {
// In hash verification mode
if (!model_in_manifest) {
// model missing in manifest?
// Check tensor layer...
if (!tensor_layer_in_manifest) {
// Still missing? Maybe we are reading the wrong manifest.
return HASH_EXIT_MANIFEST_MISSING_ENTRY;
}
if (tensor_layer_has_mismatch) {
// Per tensor check found error
return HASH_EXIT_FAILURE;
}
// All per tensor layer checks passed? Sounds good enough.
return HASH_EXIT_SUCCESS;
}
// Overall model check passed, but check the per-layer results just in case
// If layers are missing from the manifest we don't mind, since the overall model hash matched
if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
return HASH_EXIT_FAILURE;
}
if (model_has_mismatch) {
// a hash mismatch was found somewhere in the model
return HASH_EXIT_FAILURE;
}
// All checks appear to be fine
return HASH_EXIT_SUCCESS;
}
// In hash generation mode
return HASH_EXIT_SUCCESS;
}
int main(int argc, const char ** argv) {
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);
if (!params.manifest_file.empty()) {
if (!manifest_type(params.manifest_file, manifest_check)) {
printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_FILE_ERROR;
}
if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
}
printf("manifest %s", params.manifest_file.c_str());
if (manifest_check.sha256) {
printf(" sha256");
}
if (manifest_check.sha1) {
printf(" sha1");
}
if (manifest_check.xxh64) {
printf(" xxh64");
}
if (manifest_check.uuid) {
printf(" uuid");
}
printf("\n");
// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
} else if (manifest_check.sha1) {
params.sha1 = true;
} else if (manifest_check.xxh64) {
params.xxh64 = true;
} else if (manifest_check.uuid) {
params.uuid = true;
}
}
params.manifest_is_usable = true;
}
// By default, if no hash type argument is provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}
hash_exit_code_t exit_code = gguf_hash(params);
if (params.manifest_is_usable) {
printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
}
return exit_code;
}
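A quick sketch of how the generate and verify modes above fit together; the binary name `llama-gguf-hash` and the file names are illustrative, not taken from this commit. Hashing a model prints one manifest-style line per tensor plus one for the whole file, and that output can be fed back in via `--check`:

```bash
# generate a sha256 manifest for a model (illustrative paths)
./llama-gguf-hash --sha256 model.gguf > model.gguf.manifest

# later, verify the same file against the stored manifest
./llama-gguf-hash --check model.gguf.manifest model.gguf
```

When `--check` is given without an explicit hash flag, the strongest hash type found in the manifest is auto-selected, and the process exit code reflects the verification result.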
set(TARGET gguf-split)
set(TARGET llama-gguf-split)
add_executable(${TARGET} gguf-split.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
......
......@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024; // megabytes
n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
......@@ -284,7 +284,7 @@ struct split_strategy {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
total_size = total_size / 1024 / 1024; // convert to megabytes
total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
......
......@@ -18,8 +18,8 @@ fi
set -x
SPLIT=$1/gguf-split
MAIN=$1/main
SPLIT=$1/llama-gguf-split
MAIN=$1/llama-cli
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)
......@@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
......@@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
......@@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
......@@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
......@@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo
......
set(TARGET gguf)
set(TARGET llama-gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
......
......@@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
......
set(TARGET gritlm)
set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
......
......@@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
Run the example using the downloaded model:
```console
$ ./gritlm -m models/gritlm-7b_q4_1.gguf
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
......
......@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// run model
......@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
llama_token eos_token = llama_token_eos(mdl);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
......@@ -153,7 +156,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
......@@ -164,8 +169,7 @@ int main(int argc, char * argv[]) {
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
// create new context - set to embedding mode
cparams.embeddings = true;
// create generation context
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
// ### Embedding/Representation ###
......
set(TARGET imatrix)
set(TARGET llama-imatrix)
add_executable(${TARGET} imatrix.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
......
# llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
## Usage
```
./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
[-ofreq num_chunks] [-ow <0 or 1>] [other common params]
./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
```
Here `-m` with a model name and `-f` with a file containing training data (e.g. `wiki.train.raw`) are mandatory.
The parameters in square brackets are optional and have the following meaning:
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing, `imatrix.dat` is used.
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example
```bash
LLAMA_CUDA=1 make -j
GGML_CUDA=1 make -j
# generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
# use the imatrix to perform a Q4_K_M quantization
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
```
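Previously saved imatrix files can also be merged and used as a starting point via `--in-file`; a minimal sketch, with illustrative file names:

```bash
# continue from (and combine) two earlier importance matrices
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 \
    --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat -o imatrix.dat
```

When more than one `--in-file` is given, the combined matrix is saved before collection continues on the new data.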
......@@ -17,39 +17,37 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
}
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false;
};
class IMatrixCollector {
public:
IMatrixCollector() = default;
void set_parameters(StatParams&& params) { m_params = std::move(params); }
void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix() const;
bool load_imatrix(const char * file_name, bool add);
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
StatParams m_params;
gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const;
};
// remove any prefix and suffixes from the name
......@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true;
}
......@@ -129,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false);
exit(1); //GGML_ABORT("fatal error");
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
......@@ -153,28 +151,32 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) {
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
} else {
auto& e = m_stats[wname];
auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
if (m_params.verbosity > 1) {
......@@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) {
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
......@@ -201,33 +207,75 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true;
}
void IMatrixCollector::save_imatrix() const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
}
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.dat";
}
void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
}
if (ncall > 0) {
fname += ".at_";
fname += std::to_string(ncall);
}
// avoid writing imatrix entries that do not have full data
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
int n_entries = 0;
std::vector<std::string> to_store;
bool is_first = true; // for printing
for (const auto & kv : m_stats) {
const int n_all = kv.second.counts.size();
if (n_all == 0) {
continue;
}
int n_zeros = 0;
for (const int c : kv.second.counts) {
if (c == 0) {
n_zeros++;
}
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
n_entries++;
to_store.push_back(kv.first);
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & p : m_stats) {
int len = p.first.size();
for (const auto & name : to_store) {
const auto & stat = m_stats.at(name);
int len = name.size();
out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len);
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write(name.c_str(), len);
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
int nval = stat.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
......@@ -236,26 +284,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of calls the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);
// Write the input filename at the end of the file so it can later be referenced by quantize
{
int len = m_params.prompt_file.size();
out.write((const char *) &len, sizeof(len));
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
}
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
std::ifstream in(imatrix_file, std::ios::binary);
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file);
printf("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file);
printf("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
......@@ -263,23 +313,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)];
auto & e = m_stats[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {};
m_stats = {};
return false;
}
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
......@@ -289,7 +338,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
m_stats = {};
return false;
}
......@@ -304,13 +353,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true;
}
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
if (!add) {
m_stats.clear();
}
return load_imatrix(file_name, m_stats);
}
static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
......@@ -324,7 +366,7 @@ struct results_log_softmax {
float prob;
};
static std::vector<float> softmax(const std::vector<float>& logits) {
static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) {
......@@ -358,8 +400,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
......@@ -391,8 +432,7 @@ static void process_logits(
}
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
......@@ -405,13 +445,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
......@@ -424,7 +464,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history;
std::vector<float> prob_history;
if (compute_ppl) {
if (params.compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
......@@ -446,7 +486,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
......@@ -482,7 +522,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) {
if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
......@@ -501,7 +541,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
if (compute_ppl) {
if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
......@@ -516,7 +556,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
printf("\n");
if (compute_ppl) {
if (params.compute_ppl) {
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
......@@ -533,111 +573,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
int main(int argc, char ** argv) {
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
bool compute_ppl = true;
int from_chunk = 0;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else if (arg == "--continue-from") {
prev_result_file = argv[++iarg];
} else if (arg == "--combine") {
combine_files = argv[++iarg];
}
else if (arg == "--from-chunk") {
from_chunk = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
}
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
return 1;
}
params.n_ctx = 512;
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
params.verbosity = 1;
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
params.n_batch = std::min(params.n_batch, params.n_ctx);
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
g_collector.set_params(params);
if (!combine_files.empty()) {
std::vector<std::string> files;
size_t pos = 0;
while (true) {
auto new_pos = combine_files.find(',', pos);
if (new_pos != std::string::npos) {
files.emplace_back(combine_files.substr(pos, new_pos - pos));
pos = new_pos + 1;
} else {
files.emplace_back(combine_files.substr(pos));
break;
}
}
if (files.size() < 2) {
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
printf("Combining the following %d files\n", int(files.size()));
for (auto& file : files) {
printf(" %s\n", file.c_str());
if (!g_collector.load_imatrix(file.c_str(), true)) {
fprintf(stderr, "Failed to load %s\n", file.c_str());
return 1;
}
}
g_collector.save_imatrix();
return 0;
}
if (!prev_result_file.empty()) {
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
return 1;
}
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
}
llama_backend_init();
......@@ -650,9 +611,10 @@ int main(int argc, char ** argv) {
params.warmup = false;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
......@@ -670,8 +632,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
if (!OK) {
if (!compute_imatrix(ctx, params)) {
return 1;
}
......
set(TARGET infill)
set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
......
......@@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
## Input Prompts
......@@ -42,5 +43,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
```
```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```
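To try the Suffix/Prefix/Middle ordering described above, the same invocation can be run with `--spm-infill` added (a sketch based on the command above):

```bash
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --spm-infill --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```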
......@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
......@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.instruct) {
printf("\n************\n");
printf("%s: please use the 'main' tool for instruct mode\n", __func__);
printf("************\n\n");
return 0;
}
if (params.chatml) {
printf("\n************\n");
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.antiprompt.empty()) {
printf("\n************\n");
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
......@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.random_prompt) {
printf("\n************\n");
printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.path_prompt_cache.empty()) {
printf("\n************\n");
printf("%s: infill does not support prompt caching\n", __func__);
printf("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
......@@ -207,17 +173,16 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
......@@ -242,26 +207,28 @@ int main(int argc, char ** argv) {
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
GGML_ASSERT(llama_token_prefix(model) >= 0);
GGML_ASSERT(llama_token_suffix(model) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_middle(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
......@@ -273,25 +240,6 @@ int main(int argc, char ** argv) {
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
......@@ -319,15 +267,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
......@@ -395,12 +334,11 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}
bool input_echo = true;
bool input_echo = true;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_past_guidance = 0;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
......@@ -410,7 +348,6 @@ int main(int argc, char ** argv) {
console::set_display(console::prompt);
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
......@@ -436,7 +373,7 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
......@@ -453,11 +390,7 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
if (ctx_guidance) {
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
......@@ -465,45 +398,6 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
......@@ -525,11 +419,9 @@ int main(int argc, char ** argv) {
}
embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
llama_sampling_accept(ctx_sampling, ctx, id, true);
......@@ -583,7 +475,6 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
......@@ -624,27 +515,26 @@ int main(int argc, char ** argv) {
string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
embd.clear();
embd_guidance.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
......@@ -751,7 +641,6 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
......@@ -764,4 +653,3 @@ int main(int argc, char ** argv) {
return 0;
}
......@@ -21,7 +21,7 @@ counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
......
# Usage:
#! ./llama-server -m some-model.gguf &
#! pip install pydantic
#! python json_schema_pydantic_example.py
from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen
from typing import Annotated, List, Optional
import json, requests
if True:
def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
'''
Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
(llama.cpp server, llama-cpp-python, Anyscale / Together...)
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
'''
response_format = None
type_adapter = None
if response_model:
type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema()
messages = [{
"role": "system",
"content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
}] + messages
response_format={"type": "json_object", "schema": schema}
data = requests.post(endpoint, headers={"Content-Type": "application/json"},
json=dict(messages=messages, response_format=response_format, **kwargs)).json()
if 'error' in data:
raise Exception(data['error']['message'])
content = data["choices"][0]["message"]["content"]
return type_adapter.validate_json(content) if type_adapter else content
else:
# This alternative branch uses Instructor + OpenAI client lib.
# Instructor supports streamed iterable responses, retries & more.
# (see https://python.useinstructor.com/)
#! pip install instructor openai
import instructor, openai
client = instructor.patch(
openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
mode=instructor.Mode.JSON_SCHEMA)
create_completion = client.chat.completions.create
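# Hedged usage sketch (added for clarity, not part of the original example; it
# assumes a llama.cpp server is already listening on http://localhost:8080, as
# in the Usage notes above). Without a response_model the helper returns the
# raw assistant text instead of a parsed object:
#
#   answer = create_completion(
#       model="...",
#       messages=[{"role": "user", "content": "Name one planet."}])
#   print(answer)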
if __name__ == '__main__':
class QAPair(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
question: str
concise_answer: str
justification: str
stars: Annotated[int, Field(ge=1, le=5)]
class PyramidalSummary(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
title: str
summary: str
question_answers: Annotated[List[QAPair], MinLen(2)]
sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]
print("# Summary\n", create_completion(
model="...",
response_model=PyramidalSummary,
messages=[{
"role": "user",
"content": f"""
You are a highly efficient corporate document summarizer.
Create a pyramidal summary of an imaginary internal document about our company processes
(starting high-level, going down to each sub-section).
Keep questions short, and answers even shorter (trivia / quiz style).
"""
}]))
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import itertools
import json
import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
from typing import Any, List, Optional, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule:
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
elif min_items == 1 and max_items is None:
return f'{item_rule}+'
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
result = ''
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
def opt_repetitions(up_to_n, prefix_with_sep=False):
'''
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
if not separator_rule:
if min_items == 1 and max_items is None:
return f'{item_rule}+'
elif min_items == 0 and max_items is None:
return f'{item_rule}*'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
if min_items > 0 and max_items != min_items:
result += ' '
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
return f'({result})?' if min_items == 0 else result
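# Hedged illustration of the rewritten helper (added for clarity, not part of
# the diff). With GBNF's bounded-repetition syntax these calls should yield:
#   _build_repetition('[0-9]', 0, 15)        -> '[0-9]{0,15}'
#   _build_repetition('[0-9]', 1, None)      -> '[0-9]+'
#   _build_repetition('value', 0, None, separator_rule='"," space')
#                                            -> '(value ("," space value)*)?'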
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
def digit_range(from_char: str, to_char: str):
out.append("[")
if from_char == to_char:
out.append(from_char)
else:
result += f'{item_operator}*'
return result
out.append(from_char)
out.append("-")
out.append(to_char)
out.append("]")
def more_digits(min_digits: int, max_digits: int):
out.append("[0-9]")
if min_digits == max_digits and min_digits == 1:
return
out.append("{")
out.append(str(min_digits))
if max_digits != min_digits:
out.append(",")
if max_digits != sys.maxsize:
out.append(str(max_digits))
out.append("}")
def uniform_range(from_str: str, to_str: str):
i = 0
while i < len(from_str) and from_str[i] == to_str[i]:
i += 1
if i > 0:
out.append("\"")
out.append(from_str[:i])
out.append("\"")
if i < len(from_str):
if i > 0:
out.append(" ")
sub_len = len(from_str) - i - 1
if sub_len > 0:
from_sub = from_str[i+1:]
to_sub = to_str[i+1:]
sub_zeros = "0" * sub_len
sub_nines = "9" * sub_len
to_reached = False
out.append("(")
if from_sub == sub_zeros:
digit_range(from_str[i], chr(ord(to_str[i]) - 1))
out.append(" ")
more_digits(sub_len, sub_len)
else:
out.append("[")
out.append(from_str[i])
out.append("] ")
out.append("(")
uniform_range(from_sub, sub_nines)
out.append(")")
if ord(from_str[i]) < ord(to_str[i]) - 1:
out.append(" | ")
if to_sub == sub_nines:
digit_range(chr(ord(from_str[i]) + 1), to_str[i])
to_reached = True
else:
digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
out.append(" ")
more_digits(sub_len, sub_len)
if not to_reached:
out.append(" | ")
digit_range(to_str[i], to_str[i])
out.append(" ")
uniform_range(sub_zeros, to_sub)
out.append(")")
else:
out.append("[")
out.append(from_str[i])
out.append("-")
out.append(to_str[i])
out.append("]")
if has_min and has_max:
if min_value < 0 and max_value < 0:
out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
out.append(")")
return
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
out.append(") | ")
min_value = 0
min_s = str(min_value)
max_s = str(max_value)
min_digits = len(min_s)
max_digits = len(max_s)
for digits in range(min_digits, max_digits):
uniform_range(min_s, "9" * digits)
min_s = "1" + "0" * digits
out.append(" | ")
uniform_range(min_s, max_s)
return
less_decimals = max(decimals_left - 1, 1)
if has_min:
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
out.append(") | [0] | [1-9] ")
more_digits(0, decimals_left - 1)
elif min_value == 0:
if top_level:
out.append("[0] | [1-9] ")
more_digits(0, less_decimals)
else:
more_digits(1, decimals_left)
elif min_value <= 9:
c = str(min_value)
range_start = '1' if top_level else '0'
if c > range_start:
digit_range(range_start, chr(ord(c) - 1))
out.append(" ")
more_digits(1, less_decimals)
out.append(" | ")
digit_range(c, "9")
out.append(" ")
more_digits(0, less_decimals)
else:
min_s = str(min_value)
length = len(min_s)
c = min_s[0]
if c > "1":
digit_range("1" if top_level else "0", chr(ord(c) - 1))
out.append(" ")
more_digits(length, less_decimals)
out.append(" | ")
digit_range(c, c)
out.append(" (")
_generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
out.append(")")
if c < "9":
out.append(" | ")
digit_range(chr(ord(c) + 1), "9")
out.append(" ")
more_digits(length - 1, less_decimals)
return
if has_max:
if max_value >= 0:
if top_level:
out.append("\"-\" [1-9] ")
more_digits(0, less_decimals)
out.append(" | ")
_generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
else:
out.append("\"-\" (")
_generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
out.append(")")
return
raise RuntimeError("At least one of min_value or max_value must be set")
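# Hedged example (added for clarity, not part of the diff): the helper emits a
# GBNF alternation covering the requested integer range, one digit-length band
# at a time. For an assumed range of 1..12 it should produce:
#
#   out = []
#   _generate_min_max_int(1, 12, out)
#   ''.join(out)   # -> '[1-9] | "1" [0-2]'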
class BuiltinRule:
def __init__(self, content: str, deps: list = None):
def __init__(self, content: str, deps: list | None = None):
self.content = content
self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
# Constraining spaces to prevent model "running away".
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
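# Hedged sanity check (added for clarity, not part of the diff): the tightened
# 'integral-part' character classes behave like this regex, so multi-digit
# integers can no longer start with a leading zero:
#
#   import re
#   integral_part = re.compile(r'^(?:0|[1-9][0-9]{0,15})$')
#   assert integral_part.match('0') and integral_part.match('1234')
#   assert not integral_part.match('007')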
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
......@@ -101,7 +233,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
class SchemaConverter:
......@@ -118,7 +250,7 @@ class SchemaConverter:
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
)
return f'"{escaped}"'
......@@ -145,6 +277,51 @@ class SchemaConverter:
return ''.join(('(', *recurse(0), ')'))
def _not_strings(self, strings):
class TrieNode:
def __init__(self):
self.children = {}
self.is_end_of_string = False
def insert(self, string):
node = self
for c in string:
node = node.children.setdefault(c, TrieNode())
node.is_end_of_string = True
trie = TrieNode()
for s in strings:
trie.insert(s)
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
out = ['["] ( ']
def visit(node):
rejects = []
first = True
for c in sorted(node.children.keys()):
child = node.children[c]
rejects.append(c)
if first:
first = False
else:
out.append(' | ')
out.append(f'[{c}]')
if child.children:
out.append(f' (')
visit(child)
out.append(')')
elif child.is_end_of_string:
out.append(f' {char_rule}+')
if node.children:
if not first:
out.append(' | ')
out.append(f'[^"{"".join(rejects)}] {char_rule}*')
visit(trie)
out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
return ''.join(out)
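# Hedged example (added for clarity, not part of the diff): the trie-based
# helper emits a rule matching any JSON string except the given keys, e.g.
# _not_strings(['id']) should produce roughly:
#
#   ["] ( [i] ([d] char+ | [^"d] char*) | [^"i] char* )? ["] space
#
# i.e. anything that deviates from "id" at some position or continues past it
# (plus the empty string).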
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
......@@ -228,11 +405,11 @@ class SchemaConverter:
i = 0
length = len(pattern)
def to_rule(s: Tuple[str, bool]) -> str:
def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]:
def transform() -> tuple[str, bool]:
'''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
'''
......@@ -245,7 +422,7 @@ class SchemaConverter:
# We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!)
seq: list[Tuple[str, bool]] = []
seq: list[tuple[str, bool]] = []
def get_dot():
if self._dotall:
......@@ -333,7 +510,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id
sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else:
literal = ''
while i < length:
......@@ -390,13 +567,13 @@ class SchemaConverter:
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
elif isinstance(schema_type, list):
return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
elif 'const' in schema:
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
elif 'enum' in schema:
rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'object') and \
......@@ -427,7 +604,7 @@ class SchemaConverter:
else:
add_component(t, is_required=True)
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
items = schema.get('items') or schema['prefixItems']
......@@ -465,6 +642,24 @@ class SchemaConverter:
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif schema_type in (None, 'integer') and \
('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
min_value = None
max_value = None
if 'minimum' in schema:
min_value = schema['minimum']
elif 'exclusiveMinimum' in schema:
min_value = schema['exclusiveMinimum'] + 1
if 'maximum' in schema:
max_value = schema['maximum']
elif 'exclusiveMaximum' in schema:
max_value = schema['exclusiveMaximum'] - 1
out = ["("]
_generate_min_max_int(min_value, max_value, out)
out.append(") space")
return self._add_rule(rule_name, ''.join(out))
elif (schema_type == 'object') or (len(schema) == 0):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
......@@ -483,7 +678,7 @@ class SchemaConverter:
self._add_primitive(dep, dep_rule)
return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
prop_order = self._prop_order
# sort by position in prop_order (if specified) then by original order
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
......@@ -498,12 +693,16 @@ class SchemaConverter:
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]
if additional_properties == True or isinstance(additional_properties, dict):
if additional_properties is not None and additional_properties != False:
sub_name = f'{name}{"-" if name else ""}additional'
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
self._add_primitive('value', PRIMITIVE_RULES['value'])
key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
f'{key_rule} ":" space {value_rule}'
)
optional_props.append("*")
......@@ -518,15 +717,11 @@ class SchemaConverter:
def get_recursive_refs(ks, first_is_optional):
[k, *rest] = ks
kv_rule_name = prop_kv_rule_names[k]
if k == '*':
res = self._add_rule(
f'{name}{"-" if name else ""}additional-kvs',
f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
)
elif first_is_optional:
res = f'( "," space {kv_rule_name} )?'
comma_ref = f'( "," space {kv_rule_name} )'
if first_is_optional:
res = comma_ref + ('*' if k == '*' else '?')
else:
res = kv_rule_name
res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
if len(rest) > 0:
res += ' ' + self._add_rule(
f'{name}{"-" if name else ""}{k}-rest',
......@@ -556,7 +751,7 @@ class SchemaConverter:
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',
......