Commit fc39a6cd authored by Daniel Hiltgen

Fix cuda leaks

This should resolve the problem where we don't fully unload from the GPU
when we go idle.
parent 88622847
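For context: the resources this commit targets are the per-device cuBLAS handles and the CUDA memory pool that ggml allocates lazily and, before this change, never released; the new ggml_free_cublas() in the diffs below tears both down. A minimal, self-contained sketch of that teardown pattern, assuming illustrative container names rather than the actual ggml globals:

#include <cstddef>
#include <vector>
#include <cublas_v2.h>
#include <cuda.h>

// Illustrative sketch only: destroy per-device cuBLAS handles and unmap a
// CUDA virtual-memory pool so the driver can actually return GPU memory.
static void free_device_resources(std::vector<cublasHandle_t> &handles,
                                  std::vector<CUdeviceptr>    &pool_addr,
                                  std::vector<size_t>         &pool_size) {
    for (size_t id = 0; id < handles.size(); ++id) {
        if (pool_addr[id] != 0) {
            cuMemUnmap(pool_addr[id], pool_size[id]); // release the pool's backing allocation
            pool_addr[id] = 0;
            pool_size[id] = 0;
        }
        if (handles[id] != nullptr) {
            cublasDestroy(handles[id]);               // frees the handle's device-side workspace
            handles[id] = nullptr;
        }
    }
}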
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
...
 using json = nlohmann::json;
@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
...
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
             std::placeholders::_2,
             std::placeholders::_3
     ));
...
+    shutdown_handler = [&](int) {
+        llama.queue_tasks.terminate();
+    };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    llama.queue_tasks.start_loop();
+    svr.stop();
     t.join();
     llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
...
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }
...
     while (true) {
         // new task arrived
         LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
         {
             std::unique_lock<std::mutex> lock(mutex_tasks);
             if (queue_tasks.empty()) {
...
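The server changes above keep the signal handler itself minimal and do the real shutdown on the main thread: the handler only asks the task queue to terminate, start_loop() then returns, and cleanup (svr.stop(), t.join(), llama_backend_free()) runs after it. A rough standalone sketch of that pattern, where the task_queue type is illustrative rather than the patched llama_server_queue:

#include <signal.h>
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>

// Illustrative queue whose loop can be stopped from a signal handler.
struct task_queue {
    std::mutex mutex;
    std::condition_variable cv;
    std::deque<std::function<void()>> tasks;
    bool running = true;

    void terminate() {
        { std::lock_guard<std::mutex> lock(mutex); running = false; }
        cv.notify_all();
    }

    void start_loop() {
        while (true) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex);
                cv.wait(lock, [&] { return !running || !tasks.empty(); });
                if (!running) return;            // unblock so main() can clean up
                task = std::move(tasks.front());
                tasks.pop_front();
            }
            task();
        }
    }
};

static std::function<void(int)> shutdown_handler;
static void signal_handler(int sig) { shutdown_handler(sig); }

int main() {
    task_queue queue;
    shutdown_handler = [&](int) { queue.terminate(); };

    struct sigaction sigint_action {};
    sigint_action.sa_handler = signal_handler;
    sigemptyset(&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, nullptr);

    queue.start_loop();                          // blocks until terminate() is called
    // ... free model / backend / GPU resources here ...
    return 0;
}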
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}
bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
- if (!initialized) {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#endif
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -20,6 +20,9 @@ extern "C" {
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
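A rough idea of how a host application would pair the two entry points once this header change is in place (hypothetical caller, not code from this commit):

#include "ggml-cuda.h"

// Hypothetical caller: initialize lazily before use, free explicitly when idle.
void unload_example() {
    ggml_init_cublas();   // sets up per-device cuBLAS handles and pools (no-op if already initialized)
    // ... load a model and run inference ...
    ggml_free_cublas();   // new in this patch: destroys the handles and unmaps the pools so GPU memory is released
}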