init

0ce8bcfd · xuxzh1 · b0135f4b · 0ce8bcfd · 0ce8bcfd · 0ce8bcfd
Commit 0ce8bcfd authored Nov 12, 2024 by xuxzh1 🎱
20 changed files
--- a/llm/llama.cpp/examples/llama-bench/README.md
+++ b/llm/llama.cpp/examples/llama-bench/README.md
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench

 Performance testing tool for llama.cpp.

@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
 ```

 ```csv
-build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
 ```
@@ -179,7 +179,6 @@ $ ./llama-bench -o json
    "build_commit": "3469684",
    "build_number": 1275,
    "cuda": true,
-    "opencl": false,
    "metal": false,
    "gpu_blas": true,
    "blas": true,
@@ -210,7 +209,6 @@ $ ./llama-bench -o json
    "build_commit": "3469684",
    "build_number": 1275,
    "cuda": true,
-    "opencl": false,
    "metal": false,
    "gpu_blas": true,
    "blas": true,
@@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
  build_commit TEXT,
  build_number INTEGER,
  cuda INTEGER,
-  opencl INTEGER,
  metal INTEGER,
  gpu_blas INTEGER,
  blas INTEGER,
@@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
  stddev_ts REAL
 );

-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
 ```
--- a/llm/llama.cpp/examples/llama-bench/llama-bench.cpp
+++ b/llm/llama.cpp/examples/llama-bench/llama-bench.cpp
@@ -23,6 +23,10 @@
 #include "ggml-cuda.h"
 #include "ggml-sycl.h"

+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 // utils
 static uint64_t get_time_ns() {
    using clock = std::chrono::high_resolution_clock;
@@ -41,20 +45,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
    return str.str();
 }

-template<class T>
-static std::vector<T> split(const std::string & str, char delim) {
-    std::vector<T> values;
-    std::istringstream str_stream(str);
-    std::string token;
-    while (std::getline(str_stream, token, delim)) {
-        T value;
-        std::istringstream token_stream(token);
-        token_stream >> value;
-        values.push_back(value);
-    }
-    return values;
-}
-
 template<typename T, typename F>
 static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> str_values;
@@ -134,30 +124,59 @@ static std::string get_gpu_info() {
            id += "/";
        }
    }
+#endif
+#ifdef GGML_USE_CANN
+    uint32_t count = ggml_backend_cann_get_device_count();
+    for (uint32_t i = 0; i < count; i++) {
+        char buf[128];
+        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
 #endif
    // TODO: other backends
    return id;
 }

 // command line params
-enum output_formats {CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};

 static const char * output_format_str(output_formats format) {
    switch (format) {
+        case NONE:     return "none";
        case CSV:      return "csv";
        case JSON:     return "json";
        case MARKDOWN: return "md";
        case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
    }
 }

+static bool output_format_from_str(const std::string & s, output_formats & format) {
+    if (s == "none") {
+        format = NONE;
+    } else if (s == "csv") {
+        format = CSV;
+    } else if (s == "json") {
+        format = JSON;
+    } else if (s == "md") {
+        format = MARKDOWN;
+    } else if (s == "sql") {
+        format = SQL;
+    } else {
+        return false;
+    }
+    return true;
+}
+
 static const char * split_mode_str(llama_split_mode mode) {
    switch (mode) {
        case LLAMA_SPLIT_MODE_NONE:  return "none";
        case LLAMA_SPLIT_MODE_LAYER: return "layer";
        case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
    }
 }

@@ -190,31 +209,33 @@ struct cmd_params {
    int reps;
    bool verbose;
    output_formats output_format;
+    output_formats output_format_stderr;
 };

 static const cmd_params cmd_params_defaults = {
-    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
-    /* n_prompt      */ {512},
-    /* n_gen         */ {128},
-    /* n_pg          */ {},
-    /* n_batch       */ {2048},
-    /* n_ubatch      */ {512},
-    /* type_k        */ {GGML_TYPE_F16},
-    /* type_v        */ {GGML_TYPE_F16},
-    /* n_threads     */ {cpu_get_num_math()},
-    /* n_gpu_layers  */ {99},
-    /* rpc_servers   */ {""},
-    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
-    /* main_gpu      */ {0},
-    /* no_kv_offload */ {false},
-    /* flash_attn    */ {false},
-    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
-    /* use_mmap      */ {true},
-    /* embeddings    */ {false},
-    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
-    /* reps          */ 5,
-    /* verbose       */ false,
-    /* output_format */ MARKDOWN
+    /* model                */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt             */ {512},
+    /* n_gen                */ {128},
+    /* n_pg                 */ {},
+    /* n_batch              */ {2048},
+    /* n_ubatch             */ {512},
+    /* type_k               */ {GGML_TYPE_F16},
+    /* type_v               */ {GGML_TYPE_F16},
+    /* n_threads            */ {cpu_get_num_math()},
+    /* n_gpu_layers         */ {99},
+    /* rpc_servers          */ {""},
+    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
+    /* main_gpu             */ {0},
+    /* no_kv_offload        */ {false},
+    /* flash_attn           */ {false},
+    /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap             */ {true},
+    /* embeddings           */ {false},
+    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
+    /* reps                 */ 5,
+    /* verbose              */ false,
+    /* output_format        */ MARKDOWN,
+    /* output_format_stderr */ NONE,
 };

 static void print_usage(int /* argc */, char ** argv) {
@@ -243,6 +264,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -284,7 +306,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {

    params.verbose = cmd_params_defaults.verbose;
    params.output_format = cmd_params_defaults.output_format;
+    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
    params.reps = cmd_params_defaults.reps;
+    params.numa = cmd_params_defaults.numa;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -300,28 +324,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
            params.model.insert(params.model.end(), p.begin(), p.end());
        } else if (arg == "-p" || arg == "--n-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
        } else if (arg == "-n" || arg == "--n-gen") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
        } else if (arg == "-pg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], ',');
+            auto p = string_split<std::string>(argv[i], ',');
            if (p.size() != 2) {
                invalid_param = true;
                break;
@@ -332,21 +356,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
        } else if (arg == "-ub" || arg == "--ubatch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
@@ -362,7 +386,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
@@ -378,14 +402,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
        } else if (arg == "-rpc" || arg == "--rpc") {
            if (++i >= argc) {
@@ -398,7 +422,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
            std::vector<llama_split_mode> modes;
            for (const auto & m : p) {
                llama_split_mode mode;
@@ -420,13 +444,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            params.main_gpu = split<int>(argv[i], split_delim);
+            params.main_gpu = string_split<int>(argv[i], split_delim);
        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
        } else if (arg == "--numa") {
            if (++i >= argc) {
@@ -444,28 +468,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
        } else if (arg == "-embd" || arg == "--embeddings") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            for (auto ts : split<std::string>(argv[i], split_delim)) {
+            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
                // split string by ; and /
                const std::regex regex{R"([;/]+)"};
                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@@ -493,18 +517,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            if (argv[i] == std::string("csv")) {
-                params.output_format = CSV;
-            } else if (argv[i] == std::string("json")) {
-                params.output_format = JSON;
-            } else if (argv[i] == std::string("md")) {
-                params.output_format = MARKDOWN;
-            } else if (argv[i] == std::string("sql")) {
-                params.output_format = SQL;
-            } else {
+            invalid_param = !output_format_from_str(argv[i], params.output_format);
+        } else if (arg == "-oe" || arg == "--output-err") {
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
+            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else {
@@ -706,12 +725,10 @@ struct test {
    static const std::string build_commit;
    static const int build_number;
    static const bool cuda;
-    static const bool opencl;
    static const bool vulkan;
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
-    static const bool rpc;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@@ -723,6 +740,7 @@ struct test {
    int n_batch;
    int n_ubatch;
    int n_threads;
+    bool has_rpc;
    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
@@ -748,6 +766,7 @@ struct test {
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
+        has_rpc = !inst.rpc_servers.empty();
        type_k = inst.type_k;
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
@@ -795,9 +814,6 @@ struct test {
        if (cuda) {
            return GGML_CUDA_NAME;
        }
-        if (opencl) {
-            return "OpenCL";
-        }
        if (vulkan) {
            return "Vulkan";
        }
@@ -810,9 +826,6 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
-        if (rpc) {
-            return "RPC";
-        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@@ -826,7 +839,7 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
+            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
@@ -852,7 +865,7 @@ struct test {
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
-        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
+        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
            return BOOL;
@@ -881,8 +894,8 @@ struct test {
        }
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
+            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
@@ -910,14 +923,12 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cuda();
-const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::sycl         = !!ggml_cpu_has_sycl();
-const bool        test::rpc          = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();

@@ -1035,6 +1046,27 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return 3;
        }
+        if (field == "n_threads") {
+            return 7;
+        }
+        if (field == "n_batch") {
+            return 7;
+        }
+        if (field == "n_ubatch") {
+            return 8;
+        }
+        if (field == "type_k" || field == "type_v") {
+            return 6;
+        }
+        if (field == "split_mode") {
+            return 5;
+        }
+        if (field == "flash_attn") {
+            return 2;
+        }
+        if (field == "use_mmap") {
+            return 4;
+        }
        if (field == "test") {
            return 13;
        }
@@ -1162,6 +1194,9 @@ struct markdown_printer : public printer {
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
+                if (t.has_rpc) {
+                    value += "+RPC";
+                }
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
@@ -1278,6 +1313,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
    (void) user_data;
 }

+static std::unique_ptr<printer> create_printer(output_formats format) {
+    switch (format) {
+        case NONE:
+            return nullptr;
+        case CSV:
+            return std::unique_ptr<printer>(new csv_printer());
+        case JSON:
+            return std::unique_ptr<printer>(new json_printer());
+        case MARKDOWN:
+            return std::unique_ptr<printer>(new markdown_printer());
+        case SQL:
+            return std::unique_ptr<printer>(new sql_printer());
+    }
+    GGML_ABORT("fatal error");
+}
+
 int main(int argc, char ** argv) {
    // try to set locale for unicode characters in markdown
    setlocale(LC_CTYPE, ".UTF-8");
@@ -1304,26 +1355,18 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // initialize printer
-    std::unique_ptr<printer> p;
-    switch (params.output_format) {
-        case CSV:
-            p.reset(new csv_printer());
-            break;
-        case JSON:
-            p.reset(new json_printer());
-            break;
-        case MARKDOWN:
-            p.reset(new markdown_printer());
-            break;
-        case SQL:
-            p.reset(new sql_printer());
-            break;
-        default:
-            assert(false);
-            exit(1);
+    std::unique_ptr<printer> p = create_printer(params.output_format);
+    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+
+    if (p) {
+        p->fout = stdout;
+        p->print_header(params);
+    }
+
+    if (p_err) {
+        p_err->fout = stderr;
+        p_err->print_header(params);
    }
-    p->fout = stdout;
-    p->print_header(params);

    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

@@ -1381,7 +1424,15 @@ int main(int argc, char ** argv) {
            t.samples_ns.push_back(t_ns);
        }

-        p->print_test(t);
+        if (p) {
+            p->print_test(t);
+            fflush(p->fout);
+        }
+
+        if (p_err) {
+            p_err->print_test(t);
+            fflush(p_err->fout);
+        }

        llama_print_timings(ctx);

@@ -1390,7 +1441,13 @@ int main(int argc, char ** argv) {

    llama_free_model(lmodel);

-    p->print_footer();
+    if (p) {
+        p->print_footer();
+    }
+
+    if (p_err) {
+        p_err->print_footer();
+    }

    llama_backend_free();


--- a/llm/llama.cpp/examples/llama.android/app/build.gradle.kts
+++ b/llm/llama.cpp/examples/llama.android/app/build.gradle.kts
+plugins {
+    id("com.android.application")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "com.example.llama"
+    compileSdk = 34
+
+    defaultConfig {
+        applicationId = "com.example.llama"
+        minSdk = 33
+        targetSdk = 34
+        versionCode = 1
+        versionName = "1.0"
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        vectorDrawables {
+            useSupportLibrary = true
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+    buildFeatures {
+        compose = true
+    }
+    composeOptions {
+        kotlinCompilerExtensionVersion = "1.5.1"
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
+    implementation("androidx.activity:activity-compose:1.8.2")
+    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    implementation("androidx.compose.ui:ui")
+    implementation("androidx.compose.ui:ui-graphics")
+    implementation("androidx.compose.ui:ui-tooling-preview")
+    implementation("androidx.compose.material3:material3")
+    implementation(project(":llama"))
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
+    debugImplementation("androidx.compose.ui:ui-tooling")
+    debugImplementation("androidx.compose.ui:ui-test-manifest")
+}
--- a/llm/llama.cpp/examples/llama.android/build.gradle.kts
+++ b/llm/llama.cpp/examples/llama.android/build.gradle.kts
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+plugins {
+    id("com.android.application") version "8.2.0" apply false
+    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+    id("com.android.library") version "8.2.0" apply false
+}
--- a/llm/llama.cpp/examples/llama.android/llama/build.gradle.kts
+++ b/llm/llama.cpp/examples/llama.android/llama/build.gradle.kts
+plugins {
+    id("com.android.library")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "android.llama.cpp"
+    compileSdk = 34
+
+    defaultConfig {
+        minSdk = 33
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        consumerProguardFiles("consumer-rules.pro")
+        ndk {
+            // Add NDK properties if wanted, e.g.
+            // abiFilters += listOf("arm64-v8a")
+        }
+        externalNativeBuild {
+            cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
+                cppFlags += listOf()
+                arguments += listOf()
+
+                cppFlags("")
+            }
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.appcompat:appcompat:1.6.1")
+    implementation("com.google.android.material:material:1.11.0")
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+}
--- a/llm/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/llm/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")

-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)

 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)

 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be

--- a/llm/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/llm/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -5,7 +5,7 @@
 #include <string>
 #include <unistd.h>
 #include "llama.h"
-#include "common/common.h"
+#include "common.h"

 // Write C++ code here.
 //
@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
    }

    auto new_token_chars = llama_token_to_piece(context, new_token_id);

--- a/llm/llama.cpp/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/llm/llama.cpp/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -26,11 +26,12 @@ actor LlamaContext {
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
+    var is_done: Bool = false

    /// This variable is used to store temporarily invalid cchars
    private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
    var n_cur: Int32 = 0

    var n_decode: Int32 = 0
@@ -160,6 +161,7 @@ actor LlamaContext {

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
+            is_done = true
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
            return new_token_str
@@ -322,7 +324,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +332,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {

--- a/llm/llama.cpp/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/llm/llama.cpp/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

        messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
-        }
+        Task.detached {
+            while await !llamaContext.is_done {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }

-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation

-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
+        }
    }

    func bench() async {

--- a/llm/llama.cpp/examples/llava/CMakeLists.txt
+++ b/llm/llama.cpp/examples/llava/CMakeLists.txt
@@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
    add_dependencies(llava BUILD_INFO)
 endif()

-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llama-llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/llm/llama.cpp/examples/llava/MobileVLM-README.md
+++ b/llm/llama.cpp/examples/llava/MobileVLM-README.md
@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
 Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.

 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.

-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:

 ```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
    --image path/to/an/image.jpg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
@@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```

-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
+python ./examples/llava/convert_image_encoder_to_gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,22 +47,22 @@ python ./examples/llava/convert-image-encoder-to-gguf \
 ```

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
+python ./examples/llava/convert_image_encoder_to_gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
    --projector-type ldpv2
 ```

-4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```

 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
@@ -102,7 +102,7 @@ llama_print_timings:       total time =   34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
@@ -126,7 +126,7 @@ llama_print_timings:       total time =   34570.79 ms
 #### llava-cli release-b2005
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
@@ -194,13 +194,13 @@ llama_print_timings:       total time =   44411.01 ms /   377 tokens
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
 ```
 ### run on Orin
 ### case 1
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    --image /data/local/tmp/demo.jpeg \
@@ -224,7 +224,7 @@ llama_print_timings:       total time =    1352.63 ms /   252 tokens
 ### case 2
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \

--- a/llm/llama.cpp/examples/llava/README.md
+++ b/llm/llama.cpp/examples/llava/README.md
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.

 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.

-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:

 ```sh
-./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 pip install -r examples/llava/requirements.txt
 ```

-3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
+python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
 ```

-4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 pip install -r examples/llava/requirements.txt
 ```

-3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
-python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
+python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory

@@ -86,18 +86,18 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso

 5) Create the visual gguf model:
 ```console
-python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

 6) Then convert the model to gguf format:
 ```console
-python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

-7) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```

 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)

--- a/llm/llama.cpp/examples/llava/android/adb_run.sh
+++ b/llm/llama.cpp/examples/llava/android/adb_run.sh
@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
 # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"

 program_dir="build_64/bin"
-binName="llava-cli"
+binName="llama-llava-cli"
 n_threads=4



--- a/llm/llama.cpp/examples/llava/android/build_64.sh
+++ b/llm/llama.cpp/examples/llava/android/build_64.sh
+#!/bin/bash
+cmake ../../../../ \
+-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+-DCMAKE_BUILD_TYPE=Release \
+-DANDROID_ABI="arm64-v8a" \
+-DANDROID_PLATFORM=android-23 $1
+
+make -j4
--- a/llm/llama.cpp/examples/llava/clip.cpp
+++ b/llm/llama.cpp/examples/llava/clip.cpp
@@ -16,6 +16,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

@@ -865,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            embeddings = peg_0;
        }
        else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
        }
    }

@@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif

+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+#endif
+

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
@@ -1121,20 +1130,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            if (n < 32)
                hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
            hparams.image_grid_pinpoints[0]=0;
        }

        try {
            int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
            strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
            strcpy(hparams.mm_patch_merge_type, "flat");
        }

        try {
            hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
            hparams.image_crop_resolution = hparams.image_size;
        }

@@ -1173,7 +1182,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        try {
            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
+        } catch (const std::exception& /*e*/) {
            new_clip->has_class_embedding = false;
        }

@@ -1181,7 +1190,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_pre_norm = false;
        }

@@ -1189,21 +1198,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_post_norm = false;
        }

        try {
            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_patch_bias = false;
        }

        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

@@ -1215,26 +1224,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                // Yi-type llava
                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // missing in Yi-type llava
                vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // Yi-type llava
                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // Yi-type llava
                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
            vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));

--- a/llm/llama.cpp/examples/llava/convert_image_encoder_to_gguf.py
+++ b/llm/llama.cpp/examples/llava/convert_image_encoder_to_gguf.py
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    if "projection" in name:
+        return name
+    if "mm_projector" in name:
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
+
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+                help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
+ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
+                help="The clip model is from openclip (for ViT-SO400M type))")
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# with proper
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+    print("--text-only and --image-only arguments cannot be specified at the same time.")
+    exit(1)
+
+if args.use_f32:
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    config = json.load(f)
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+if args.clip_model_is_vision or args.clip_model_is_openclip:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)
+
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_llava_projector = False
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.llava_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_llava_projector", has_llava_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if args.text_only:
+    fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_llava_projector:
+    fout.add_description("vision-only CLIP model")
+elif has_llava_projector:
+    fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+    assert t_hparams is not None
+    assert tokens is not None
+    # text_model hparams
+    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+    fout.add_token_list(tokens)
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
+    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
+                            #     /**
+                            #      "image_grid_pinpoints": [
+                            #         [
+                            #         336,
+                            #         672
+                            #         ],
+                            #         [
+                            #         672,
+                            #         336
+                            #         ],
+                            #         [
+                            #         672,
+                            #         672
+                            #         ],
+                            #         [
+                            #         1008,
+                            #         336
+                            #         ],
+                            #         [
+                            #         336,
+                            #         1008
+                            #         ]
+                            #     ],
+                            #     Flattened:
+                            #     [
+                            #         336, 672,
+                            #         672, 336,
+                            #         672, 672,
+                            #         1008, 336,
+                            #         336, 1008
+                            #     ]
+                            #  *
+                            #  */
+    if "image_grid_pinpoints" in v_hparams:
+        # flatten it
+        image_grid_pinpoints = []
+        for pinpoint in v_hparams["image_grid_pinpoints"]:
+            for p in pinpoint:
+                image_grid_pinpoints.append(p)
+        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
+    if "image_crop_resolution" in v_hparams:
+        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
+    if "image_aspect_ratio" in v_hparams:
+        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
+    if "image_split_resolution" in v_hparams:
+        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
+    if "mm_patch_merge_type" in v_hparams:
+        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
+    if "mm_projector_type" in v_hparams:
+        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
+
+
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std  # pyright: ignore[reportAttributeAccessIssue]
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+use_gelu = v_hparams["hidden_act"] == "gelu"
+fout.add_bool("clip.use_gelu", use_gelu)
+
+
+if has_llava_projector:
+    model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
+    projector = torch.load(args.llava_projector)
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
+            data = data.squeeze().numpy().astype(np.float16)
+        else:
+            data = data.squeeze().numpy().astype(np.float32)
+
+        fout.add_tensor(name, data)
+
+    print("Projector tensors added\n")
+
+state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
--- a/llm/llama.cpp/examples/llava/llava-cli.cpp
+++ b/llm/llama.cpp/examples/llava/llava-cli.cpp
@@ -112,9 +112,12 @@ struct llava_context {
    struct llama_model * model = NULL;
 };

-static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\n example usage:\n");
+    LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
+        print_usage(argc, argv, params);
        return 1;
    }

@@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        gpt_params_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
+        print_usage(argc, argv, {});
        return 1;
    }
    auto model = llava_init(&params);

--- a/llm/llama.cpp/examples/llava/llava_surgery.py
+++ b/llm/llama.cpp/examples/llava/llava_surgery.py
+import argparse
+import glob
+import os
+import torch
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
+args = ap.parse_args()
+
+# find the model part that includes the the multimodal projector weights
+path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
+checkpoint = torch.load(path)
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/llava.projector")
+
+# BakLLaVA models contain CLIP tensors in it
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/llava.clip")
+
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+
+
+print("Done!")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/llm/llama.cpp/examples/llava/llava_surgery_v2.py
+++ b/llm/llama.cpp/examples/llava/llava_surgery_v2.py
+import argparse
+import glob
+import os
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_file
+from typing import Any, ContextManager, cast
+
+# Function to determine if file is a SafeTensor file
+def is_safetensor_file(file_path):
+    return file_path.endswith('.safetensors')
+
+
+# Unified loading function
+def load_model(file_path):
+    if is_safetensor_file(file_path):
+        tensors = {}
+        with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
+            for key in f.keys():
+                tensors[key] = f.get_tensor(key).clone()
+                # output shape
+                print(f"{key} : {tensors[key].shape}")
+        return tensors, 'safetensor'
+    else:
+        return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
+
+
+# Unified saving function
+def save_model(model, file_path, file_type):
+    if file_type == 'safetensor':
+        # safe_save(model, file_path)
+        save_file(model, file_path)
+    else:
+        torch.save(model, file_path)
+
+
+# Adapted function to clean vision tower from checkpoint
+def clean_vision_tower_from_checkpoint(checkpoint_path):
+    checkpoint, file_type = load_model(checkpoint_path)
+    # file_type = 'pytorch'
+    model_path = os.path.dirname(checkpoint_path)
+    print(f"Searching for vision tower tensors in {checkpoint_path}")
+    clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]
+
+    if len(clip_tensors) > 0:
+        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
+        # Adapted for file type
+        clip_path = os.path.join(model_path, "llava.clip")
+
+        if os.path.exists(clip_path):
+            print(f"Loading existing llava.clip from {clip_path}")
+            existing_clip, _ = load_model(clip_path)
+        else:
+            print(f"Creating new llava.clip at {clip_path}")
+            existing_clip = {}
+        # Update existing_clip with new tensors, avoid duplicates
+        for name in clip_tensors:
+            simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
+            print(f"Adding {simple_name} to llava.clip")
+            if simple_name not in existing_clip:
+                existing_clip[simple_name] = checkpoint[name]
+
+        # Save the updated clip tensors back to llava.clip
+        save_model(existing_clip, clip_path, 'pytorch')
+
+        # Remove the tensors from the original checkpoint
+        for name in clip_tensors:
+            del checkpoint[name]
+
+        checkpoint_path = checkpoint_path
+        return True
+    return False
+
+def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
+    newline_checkpoint_path = None
+    projector_checkpoint_path = None
+
+    for path in checkpoint_paths:
+        checkpoint, _ = load_model(path)
+        if newline_criteria(checkpoint) and newline_checkpoint_path is None:
+            newline_checkpoint_path = path
+        if projector(checkpoint):
+            projector_checkpoint_path = path
+
+    return newline_checkpoint_path, projector_checkpoint_path
+
+def newline_criteria(checkpoint):
+    return any(k.startswith("model.image_newline") for k in checkpoint.keys())
+
+def proj_criteria(checkpoint):
+    return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
+
+
+# Command-line interface setup
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
+ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
+args = ap.parse_args()
+
+if args.clean_vision_tower:
+    # Generalized to handle both PyTorch and SafeTensors models
+    model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+    # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
+    checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+    for projector_checkpoint_path in checkpoint_paths:
+        print(f"Cleaning {projector_checkpoint_path}")
+        if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
+            print(f"No vision tower found in {projector_checkpoint_path}")
+            # we break once none is found, so far all models append them at the end
+            # break
+    print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
+
+# Now we look for the projector in the last checkpoint
+model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
+checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
+# last_checkpoint_path = checkpoint_paths[0]
+# first_checkpoint_path = checkpoint_paths[-1]
+newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
+
+print(f"Taking projector from {projector_checkpoint_path}")
+first_mm_tensors = []
+first_checkpoint = None
+if newline_checkpoint_path is not None:
+    print(f"Taking newline from {newline_checkpoint_path}")
+    first_checkpoint, file_type = load_model(newline_checkpoint_path)
+    first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
+
+# Load the checkpoint
+mm_tensors = []
+last_checkpoint = None
+if projector_checkpoint_path is not None:
+    last_checkpoint, file_type = load_model(projector_checkpoint_path)
+    mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
+
+if len(mm_tensors) == 0:
+    if last_checkpoint is not None:
+        for k, v in last_checkpoint.items():
+            print(k)
+    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
+    print("No tensors found. Is this a LLaVA model?")
+    exit()
+
+print(f"Found {len(mm_tensors)} tensors to extract.")
+print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
+# projector = {name: checkpoint.[name].float() for name in mm_tensors}
+projector = {}
+for name in mm_tensors:
+    assert last_checkpoint is not None
+    projector[name] = last_checkpoint[name].float()
+for name in first_mm_tensors:
+    assert first_checkpoint is not None
+    projector[name] = first_checkpoint[name].float()
+
+if len(projector) > 0:
+    save_model(projector, f"{args.model}/llava.projector", 'pytorch')
+
+print("Done!")
+print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/llm/llama.cpp/examples/llava/requirements.txt
+++ b/llm/llama.cpp/examples/llava/requirements.txt
-r ../../requirements/requirements-convert-legacy-llama.txt
+-r ../../requirements/requirements-convert_legacy_llama.txt
+--extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
-torch~=2.1.1
+torch~=2.2.1