llama: remove server static assets (#3174)

e95ffc74 · Jeffrey Morgan · GitHub · 2dce1ab4 · 2dce1ab4 · 2dce1ab4
Unverified Commit e95ffc74 authored Mar 15, 2024 by Jeffrey Morgan Committed by GitHub Mar 15, 2024
6 changed files
--- a/llm/ext_server/completion.js.hpp
+++ b/llm/ext_server/completion.js.hpp
--- a/llm/ext_server/index.html.hpp
+++ b/llm/ext_server/index.html.hpp
--- a/llm/ext_server/index.js.hpp
+++ b/llm/ext_server/index.js.hpp
--- a/llm/ext_server/json-schema-to-grammar.mjs.hpp
+++ b/llm/ext_server/json-schema-to-grammar.mjs.hpp
--- a/llm/ext_server/oai.hpp
+++ b/llm/ext_server/oai.hpp
-#pragma once
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
-#include "json.hpp"
-#include "utils.hpp"
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
-using json = nlohmann::json;
-inline static json oaicompat_completion_params_parse(
-    const struct llama_model * model,
-    const json &body, /* openai api json semantics */
-    const std::string &chat_template)
-{
-    json llama_params;
-    llama_params["__oaicompat"] = true;
-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"]            = format_chat(model, chat_template, body["messages"]);
-    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
-    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
-    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
-    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
-    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
-    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
-    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
-    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
-    if (body.count("grammar") != 0) {
-        llama_params["grammar"] = json_value(body, "grammar", json::object());
-    }
-    // Handle 'stop' field
-    if (body.contains("stop") && body["stop"].is_string()) {
-        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
-    } else {
-        llama_params["stop"] = json_value(body, "stop", json::array());
-    }
-    // Ensure there is ChatML-specific end sequence among stop words
-    llama_params["stop"].push_back("<|im_end|>");
-    return llama_params;
-}
-inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
-{
-    json result = response.result_json;
-    bool stopped_word        = result.count("stopped_word") != 0;
-    bool stopped_eos         = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
-    std::string content      = json_value(result, "content", std::string(""));
-    std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    json choices =
-        streaming ? json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"delta", json::object()}}})
-                  : json::array({json{{"finish_reason", finish_reason},
-                                        {"index", 0},
-                                        {"message", json{{"content", content},
-                                                         {"role", "assistant"}}}}});
-    std::time_t t = std::time(0);
-    json res =
-        json{{"choices", choices},
-            {"created", t},
-            {"model",
-                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
-            {"usage",
-                json{{"completion_tokens", num_tokens_predicted},
-                     {"prompt_tokens",     num_prompt_tokens},
-                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
-            {"id", gen_chatcmplid()}};
-    if (server_verbose) {
-        res["__verbose"] = result;
-    }
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
-    return res;
-}
-// return value is vector as there is one case where we might need to generate two responses
-inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
-    json result = response.result_json;
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({response.result_json});
-    }
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-    bool stopped_word   = json_value(result, "stopped_word", false);
-    bool stopped_eos    = json_value(result, "stopped_eos", false);
-    bool stopped_limit  = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
-    std::string finish_reason;
-    if (stopped_word || stopped_eos) {
-        finish_reason = "stop";
-    }
-    if (stopped_limit) {
-        finish_reason = "length";
-    }
-    std::time_t t = std::time(0);
-    json choices;
-    if (!finish_reason.empty()) {
-        choices = json::array({json{{"finish_reason", finish_reason},
-                                    {"index", 0},
-                                    {"delta", json::object()}}});
-    } else {
-        if (first) {
-            if (content.empty()) {
-                choices = json::array({json{{"finish_reason", nullptr},
-                                            {"index", 0},
-                                            {"delta", json{{"role", "assistant"}}}}});
-            } else {
-                // We have to send this as two updates to conform to openai behavior
-                json initial_ret = json{{"choices", json::array({json{
-                                        {"finish_reason", nullptr},
-                                        {"index", 0},
-                                        {"delta", json{
-                                            {"role", "assistant"}
-                                        }}}})},
-                            {"created", t},
-                            {"id", gen_chatcmplid()},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-                json second_ret = json{
-                            {"choices", json::array({json{{"finish_reason", nullptr},
-                                                            {"index", 0},
-                                                            {"delta", json{
-                                                            {"content", content}}}
-                                                            }})},
-                            {"created", t},
-                            {"id", gen_chatcmplid()},
-                            {"model", modelname},
-                            {"object", "chat.completion.chunk"}};
-                return std::vector<json>({initial_ret, second_ret});
-            }
-        } else {
-            // Some idiosyncrasy in the task processing logic produces several trailing calls
-            // with empty content; we return an empty object here and ignore it at the call site.
-            if (content.empty()) {
-                return std::vector<json>({json::object()});
-            }
-            choices = json::array({json{
-                {"finish_reason", nullptr},
-                {"index", 0},
-                {"delta",
-                json{
-                    {"content", content},
-                }},
-            }});
-        }
-    }
-    json ret = json{{"choices", choices},
-                    {"created", t},
-                    {"id", gen_chatcmplid()},
-                    {"model", modelname},
-                    {"object", "chat.completion.chunk"}};
-    return std::vector<json>({ret});
-}
-inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
-{
-    json res =
-        json{
-            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", "list"},
-            {"usage",
-                json{{"prompt_tokens", 0},
-                     {"total_tokens", 0}}},
-            {"data", embeddings}
-        };
-    return res;
-}
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -2,7 +2,6 @@
 #include "llama.h"
 #include "grammar-parser.h"
 #include "utils.hpp"
-#include "oai.hpp"
 #include "../llava/clip.h"
 #include "../llava/llava.h"
@@ -18,12 +17,6 @@
 #include "httplib.h"
 #include "json.hpp"
-// auto generated files (update with ./deps.sh)
-#include "index.html.hpp"
-#include "index.js.hpp"
-#include "completion.js.hpp"
-#include "json-schema-to-grammar.mjs.hpp"
 #include <cstddef>
 #include <thread>
 #include <chrono>
@@ -129,9 +122,6 @@ struct server_slot {
    bool stopped_word = false;
    bool stopped_limit = false;
-    bool oaicompat = false;
-    std::string oaicompat_model;
    std::string stopping_word;
    // sampling
@@ -543,14 +533,6 @@ struct llama_server_context
        slot_params default_params;
        llama_sampling_params default_sparams;
-        if (data.count("__oaicompat") != 0) {
-            slot->oaicompat = true;
-            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-        } else {
-            slot->oaicompat = false;
-            slot->oaicompat_model = "";
-        }
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
        slot->params.n_predict          = json_value(data, "n_predict",         default_params.n_predict);
@@ -1148,12 +1130,6 @@ struct llama_server_context
            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
        }
-        if (slot.oaicompat)
-        {
-            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
-            res.result_json["model"] = slot.oaicompat_model;
-        }
        queue_results.send(res);
    }
@@ -1201,12 +1177,6 @@ struct llama_server_context
            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
        }
-        if (slot.oaicompat)
-        {
-            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
-            res.result_json["model"] = slot.oaicompat_model;
-        }
        queue_results.send(res);
    }
@@ -3075,41 +3045,9 @@ int _main(int argc, char **argv)
    // this is only called if no index.html is found in the public --path
    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
+                res.set_content("server running", "text/plain; charset=utf-8");
-                return false;
+                res.status = 200; // OK
-            });
+                return true;
-    // this is only called if no index.js is found in the public --path
-    svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
-                return false;
-            });
-    // this is only called if no completion.js is found in the public --path
-    svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
-                return false;
-            });
-    // this is only called if no json-schema-to-grammar.mjs is found in the public --path
-    svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
-                return false;
-            });
-    svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                json data = {
-                    { "user_name",      llama.name_user.c_str() },
-                    { "assistant_name", llama.name_assistant.c_str() },
-                    { "default_generation_settings", llama.default_generation_settings_for_props },
-                    { "total_slots",    llama.params.n_parallel }
-                };
-                res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
@@ -3189,180 +3127,6 @@ int _main(int argc, char **argv)
                }
            });
-    svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                std::time_t t = std::time(0);
-                json models = {
-                    {"object", "list"},
-                    {"data", {
-                        {
-                            {"id",       params.model_alias},
-                            {"object",   "model"},
-                            {"created",  t},
-                            {"owned_by", "llamacpp"},
-                            {"meta",     model_meta}
-                        },
-                    }}
-                };
-                res.set_content(models.dump(), "application/json; charset=utf-8");
-            });
-    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
-    {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        if (!validate_api_key(req, res)) {
-            return;
-        }
-        json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
-        const int task_id = llama.queue_tasks.get_new_id();
-        llama.queue_results.add_waiting_task_id(task_id);
-        llama.request_completion(task_id, data, false, false, -1);
-        if (!json_value(data, "stream", false)) {
-            std::string completion_text;
-            task_result result = llama.queue_results.recv(task_id);
-            if (!result.error && result.stop) {
-                json oaicompat_result = format_final_response_oaicompat(data, result);
-                res.set_content(oaicompat_result.dump(-1, ' ', false,
-                                    json::error_handler_t::replace),
-                                    "application/json; charset=utf-8");
-            } else {
-                res.status = 500;
-                res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-            }
-            llama.queue_results.remove_waiting_task_id(task_id);
-        } else {
-            const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
-                while (true) {
-                    task_result llama_result = llama.queue_results.recv(task_id);
-                    if (!llama_result.error) {
-                        std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
-                        for (auto it = result_array.begin(); it != result_array.end(); ++it)
-                        {
-                            if (!it->empty()) {
-                                const std::string str =
-                                    "data: " +
-                                    it->dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", {{"to_send", str}});
-                                if (!sink.write(str.c_str(), str.size())) {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                            }
-                        }
-                        if (llama_result.stop) {
-                            break;
-                        }
-                    } else {
-                        const std::string str =
-                            "error: " +
-                            llama_result.result_json.dump(-1, ' ', false,
-                                    json::error_handler_t::replace) +
-                            "\n\n";
-                        LOG_VERBOSE("data stream", {{"to_send", str}});
-                        if (!sink.write(str.c_str(), str.size())) {
-                            llama.queue_results.remove_waiting_task_id(task_id);
-                            return false;
-                        }
-                        break;
-                    }
-                }
-                sink.done();
-                llama.queue_results.remove_waiting_task_id(task_id);
-                return true;
-            };
-            auto on_complete = [task_id, &llama](bool) {
-                // cancel request
-                llama.request_cancel(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
-            };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-        }
-    };
-    svr.Post("/chat/completions", chat_completions);
-    svr.Post("/v1/chat/completions", chat_completions);
-    svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                if (!validate_api_key(req, res)) {
-                    return;
-                }
-                json data = json::parse(req.body);
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, data, true, false, -1);
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.queue_results.recv(task_id);
-                    if (!result.error && result.stop)
-                    {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
-                    }
-                    else
-                    {
-                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
-                    }
-                    llama.queue_results.remove_waiting_task_id(task_id);
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
-                        while (true)
-                        {
-                            task_result result = llama.queue_results.recv(task_id);
-                            if (!result.error) {
-                                const std::string str =
-                                "data: " +
-                                result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    llama.queue_results.remove_waiting_task_id(task_id);
-                                    return false;
-                                }
-                                if (result.stop)
-                                {
-                                    break;
-                                }
-                            }
-                            else
-                            {
-                                break;
-                            }
-                        }
-                        llama.queue_results.remove_waiting_task_id(task_id);
-                        sink.done();
-                        return true;
-                    };
-                    auto on_complete = [task_id, &llama] (bool)
-                    {
-                        // cancel
-                        llama.request_cancel(task_id);
-                    };
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-                }
-            });
-    svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
-                { return res.set_content("", "application/json; charset=utf-8"); });
    svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
@@ -3427,66 +3191,6 @@ int _main(int argc, char **argv)
                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
            });
-    svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-                const json body = json::parse(req.body);
-                json prompt;
-                if (body.count("input") != 0)
-                {
-                    prompt = body["input"];
-                    // batch
-                    if(prompt.is_array()) {
-                        json data = json::array();
-                        int i = 0;
-                        for (const json &elem : prompt) {
-                            const int task_id = llama.queue_tasks.get_new_id();
-                            llama.queue_results.add_waiting_task_id(task_id);
-                            llama.request_completion(task_id, { {"prompt", elem}, { "n_predict", 0} }, false, true, -1);
-                            // get the result
-                            task_result result = llama.queue_results.recv(task_id);
-                            llama.queue_results.remove_waiting_task_id(task_id);
-                            json embedding = json{
-                                {"embedding", json_value(result.result_json, "embedding", json::array())},
-                                {"index", i++},
-                                {"object", "embedding"}
-                            };
-                            data.push_back(embedding);
-                        }
-                        json result = format_embeddings_response_oaicompat(body, data);
-                        return res.set_content(result.dump(), "application/json; charset=utf-8");
-                    }
-                }
-                else
-                {
-                    prompt = "";
-                }
-                // create and queue the task
-                const int task_id = llama.queue_tasks.get_new_id();
-                llama.queue_results.add_waiting_task_id(task_id);
-                llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}}, false, true, -1);
-                // get the result
-                task_result result = llama.queue_results.recv(task_id);
-                llama.queue_results.remove_waiting_task_id(task_id);
-                json data = json::array({json{
-                        {"embedding", json_value(result.result_json, "embedding", json::array())},
-                        {"index", 0},
-                        {"object", "embedding"}
-                    }}
-                );
-                json root = format_embeddings_response_oaicompat(body, data);
-                // send the result
-                return res.set_content(root.dump(), "application/json; charset=utf-8");
-            });
    // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
    //     "Bus error: 10" - this is on macOS, it does not crash on Linux
    //std::thread t2([&]()