Commit fc39a6cd authored by Daniel Hiltgen

Fix cuda leaks

This should resolve the problem where we don't fully unload from the GPU
when we go idle.
parent 88622847
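For context: the resources this commit targets are the per-device cuBLAS handles and the CUDA memory pool that ggml allocates lazily and, before this change, never released; the new ggml_free_cublas() in the diffs below tears both down. A minimal, self-contained sketch of that teardown pattern, assuming illustrative container names rather than the actual ggml globals:

#include <cstddef>
#include <vector>
#include <cublas_v2.h>
#include <cuda.h>

// Illustrative sketch only: destroy per-device cuBLAS handles and unmap a
// CUDA virtual-memory pool so the driver can actually return GPU memory.
static void free_device_resources(std::vector<cublasHandle_t> &handles,
                                  std::vector<CUdeviceptr>    &pool_addr,
                                  std::vector<size_t>         &pool_size) {
    for (size_t id = 0; id < handles.size(); ++id) {
        if (pool_addr[id] != 0) {
            cuMemUnmap(pool_addr[id], pool_size[id]); // release the pool's backing allocation
            pool_addr[id] = 0;
            pool_size[id] = 0;
        }
        if (handles[id] != nullptr) {
            cublasDestroy(handles[id]);               // frees the handle's device-side workspace
            handles[id] = nullptr;
        }
    }
}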
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
...
 using json = nlohmann::json;
@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
...
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
             std::placeholders::_2,
             std::placeholders::_3
     ));
...
+    shutdown_handler = [&](int) {
+        llama.queue_tasks.terminate();
+    };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    llama.queue_tasks.start_loop();
+    svr.stop();
     t.join();
     llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
...
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }
...
     while (true) {
         // new task arrived
         LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
         {
             std::unique_lock<std::mutex> lock(mutex_tasks);
             if (queue_tasks.empty()) {
...
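The server changes above keep the signal handler itself minimal and do the real shutdown on the main thread: the handler only asks the task queue to terminate, start_loop() then returns, and cleanup (svr.stop(), t.join(), llama_backend_free()) runs after it. A rough standalone sketch of that pattern, where the task_queue type is illustrative rather than the patched llama_server_queue:

#include <signal.h>
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>

// Illustrative queue whose loop can be stopped from a signal handler.
struct task_queue {
    std::mutex mutex;
    std::condition_variable cv;
    std::deque<std::function<void()>> tasks;
    bool running = true;

    void terminate() {
        { std::lock_guard<std::mutex> lock(mutex); running = false; }
        cv.notify_all();
    }

    void start_loop() {
        while (true) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex);
                cv.wait(lock, [&] { return !running || !tasks.empty(); });
                if (!running) return;            // unblock so main() can clean up
                task = std::move(tasks.front());
                tasks.pop_front();
            }
            task();
        }
    }
};

static std::function<void(int)> shutdown_handler;
static void signal_handler(int sig) { shutdown_handler(sig); }

int main() {
    task_queue queue;
    shutdown_handler = [&](int) { queue.terminate(); };

    struct sigaction sigint_action {};
    sigint_action.sa_handler = signal_handler;
    sigemptyset(&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, nullptr);

    queue.start_loop();                          // blocks until terminate() is called
    // ... free model / backend / GPU resources here ...
    return 0;
}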
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}
bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
- if (!initialized) {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#endif
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -20,6 +20,9 @@ extern "C" {
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
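A rough idea of how a host application would pair the two entry points once this header change is in place (hypothetical caller, not code from this commit):

#include "ggml-cuda.h"

// Hypothetical caller: initialize lazily before use, free explicitly when idle.
void unload_example() {
    ggml_init_cublas();   // sets up per-device cuBLAS handles and pools (no-op if already initialized)
    // ... load a model and run inference ...
    ggml_free_cublas();   // new in this patch: destroys the handles and unmaps the pools so GPU memory is released
}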