orangecat / ollama · Commits

Commit b80661e8 (Unverified)
authored Mar 11, 2024 by Bruce MacDonald, committed by GitHub on Mar 11, 2024

relay load model errors to the client (#3065)
parent 6d3adfbe
Showing 3 changed files with 51 additions and 11 deletions (+51 -11)
llm/dyn_ext_server.go               +1  -1
llm/ext_server/ext_server.cpp       +6  -10
llm/patches/03-load_exception.diff  +44 -0
llm/dyn_ext_server.go

@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
-	initResp := newExtServerResp(128)
+	initResp := newExtServerResp(512)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {
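The Go-side change is a single constant: the error buffer handed across the cgo boundary grows from 128 to 512 bytes. Since the C layer formats the message with snprintf, anything past msg_len is silently truncated, and growing the buffer leaves room for the longer messages that the rethrown loader exceptions can now carry. A minimal sketch of that truncation behavior, assuming a simplified ext_server_resp_t layout (only the id/msg/msg_len fields visible in this commit's diffs come from the source; everything else here is illustrative):

#include <cstdio>

// Simplified stand-in for ext_server_resp_t; the real struct layout is an
// assumption, only the id/msg/msg_len fields appear in this commit's diffs.
struct resp_sketch {
  int id;
  size_t msg_len;  // capacity of msg, i.e. the value bumped from 128 to 512
  char msg[512];
};

int main() {
  resp_sketch r;
  r.msg_len = 128;  // old capacity
  // snprintf writes at most msg_len bytes (including the NUL terminator), so
  // a long blob path plus a detailed loader error gets cut off mid-message.
  int needed = std::snprintf(r.msg, r.msg_len,
      "error loading model %s: %s",
      "/home/user/.ollama/models/blobs/sha256-0123456789abcdef",  // illustrative path
      "illustrative loader error: tensor data offset out of range");
  std::printf("needed %d bytes, kept at most %zu: %s\n", needed, r.msg_len - 1, r.msg);
  return 0;
}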
llm/ext_server/ext_server.cpp

@@ -114,16 +114,12 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    // load the model
-    if (!llama->load_model(params)) {
-      // TODO - consider modifying the logging logic or patching load_model so
-      // we can capture more detailed error messages and pass them back to the
-      // caller for better UX
-      err->id = -1;
-      snprintf(err->msg, err->msg_len, "error loading model %s",
-               params.model.c_str());
-      return;
-    }
+    if (!llama->load_model(params)) {
+      // an error occured that was not thrown
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+      return;
+    }
 
     llama->initialize();
   } catch (std::exception &e) {
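With llama.cpp patched to rethrow (third file, below), a failing load_model now reaches this function as a C++ exception rather than a bare false, so the TODO about capturing more detailed messages can be dropped: the catch block at the end of llama_server_init, visible as context in this hunk, converts e.what() into the id/msg fields that cross back over the C ABI. A rough sketch of that boundary pattern, with the function's internals reduced to illustrative stubs:

#include <cstdio>
#include <stdexcept>

struct resp_sketch {  // simplified stand-in for ext_server_resp_t (assumed layout)
  int id;
  size_t msg_len;
  char msg[512];
};

// Sketch of the try/catch boundary in llama_server_init: exceptions must not
// propagate across the C ABI back into Go, so they are flattened into id/msg.
void server_init_sketch(bool fail, resp_sketch *err) {
  err->id = 0;
  err->msg_len = sizeof(err->msg);
  err->msg[0] = '\0';
  try {
    if (fail) {
      // stands in for llama->load_model(params) rethrowing a loader exception
      throw std::runtime_error("illustrative loader failure");
    }
  } catch (std::exception &e) {
    err->id = -1;
    std::snprintf(err->msg, err->msg_len, "exception %s", e.what());
  }
}

int main() {
  resp_sketch r;
  server_init_sketch(true, &r);
  std::printf("id=%d msg=%s\n", r.id, r.msg);  // the Go caller reads these fields
  return 0;
}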
llm/patches/03-load_exception.diff (new file, 0 → 100644)

diff --git a/llama.cpp b/llama.cpp
index 4225f955..7b762f86 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
+        throw;
     }
 
     return 0;
@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
         };
     }
 
-    int status = llama_model_load(path_model, *model, params);
-    GGML_ASSERT(status <= 0);
-    if (status < 0) {
-        if (status == -1) {
-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-        } else if (status == -2) {
-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+    try {
+        int status = llama_model_load(path_model, *model, params);
+        GGML_ASSERT(status <= 0);
+        if (status < 0) {
+            if (status == -1) {
+                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+            } else if (status == -2) {
+                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+            }
+            delete model;
+            return nullptr;
         }
+    } catch (...) {
+        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
         delete model;
-        return nullptr;
+        throw;
     }
 
     return model;
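The patch changes llama.cpp's error handling at two levels: llama_model_load logs and rethrows instead of flattening the exception into -1, and llama_load_model_from_file frees its allocation before propagating instead of swallowing the failure into a nullptr return. The net effect is that the original exception text survives all the way up to the catch block in ext_server.cpp. A condensed sketch of the resulting control flow (the types and the failure are illustrative stand-ins, not the real llama.cpp API):

#include <cstdio>
#include <stdexcept>

struct model_sketch {};  // stand-in for llama_model

// After the patch: log, then rethrow so callers see the real cause.
static int model_load_sketch(bool fail) {
  try {
    if (fail) throw std::runtime_error("illustrative loader failure");
  } catch (const std::exception &err) {
    std::fprintf(stderr, "error loading model: %s\n", err.what());
    throw;  // was `return -1;` before the patch
  }
  return 0;
}

// After the patch: clean up the allocation on the exception path, then let
// the exception keep travelling up toward llama_server_init's catch block.
model_sketch *load_model_from_file_sketch(bool fail) {
  model_sketch *model = new model_sketch();
  try {
    if (model_load_sketch(fail) < 0) {
      delete model;
      return nullptr;  // the status-code path remains for non-throwing failures
    }
  } catch (...) {
    delete model;  // added by the patch so the rethrow does not leak the model
    throw;
  }
  return model;
}

int main() {
  try {
    load_model_from_file_sketch(true);
  } catch (const std::exception &e) {
    std::printf("caller saw: %s\n", e.what());  // detailed message reaches the client layer
  }
  return 0;
}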