OpenDAS / ollama · Commits · 9220b4fa

Unverified commit 9220b4fa, authored Feb 19, 2024 by Daniel Hiltgen, committed by GitHub on Feb 19, 2024

Merge pull request #2585 from dhiltgen/cuda_leaks

Fix cuda leaks

Parents: 1e23e823, fc39a6cd
Changes: 2 files changed, 136 additions and 9 deletions (+136 −9)

llm/patches/02-shutdown.diff   +20 −9
llm/patches/03-cudaleaks.diff  +116 −0
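Taken together, the two patches make the bundled llama.cpp server shut down cleanly so its CUDA allocations are actually released: the reworked 02-shutdown.diff installs a signal-driven shutdown handler (sigaction on POSIX, a console control handler on Windows) that terminates the task queue, and the new 03-cudaleaks.diff routes SIGUSR1 to that same handler and adds ggml_free_cublas(), which destroys the per-device cuBLAS handles and unmaps the pooled CUDA memory during teardown. As an illustrative sketch only (how the supervising Ollama process obtains the pid and signals the runner is not part of this commit), a parent process could request that graceful teardown like so:

// Illustrative only: a supervising process asking the patched server to shut
// down gracefully (and therefore run ggml_free_cublas) by sending the signal
// the new patch wires into signal_handler. POSIX-only sketch; not part of
// this commit.
#include <signal.h>     // kill(), SIGUSR1
#include <sys/types.h>  // pid_t

bool request_gpu_cleanup(pid_t server_pid) {
    return kill(server_pid, SIGUSR1) == 0;  // handled by the patched signal_handler
}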
llm/patches/02-shutdown.diff
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 11dd82c3..311495a8 100644
+index a0b46970..7800c6e7 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 @@ -28,6 +28,7 @@
@@ -10,7 +10,7 @@ index 11dd82c3..311495a8 100644
  using json = nlohmann::json;
-@@ -2394,6 +2395,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
+@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
      }
  }
@@ -20,7 +20,7 @@ index 11dd82c3..311495a8 100644
  int main(int argc, char **argv)
  {
  #if SERVER_VERBOSE != 1
-@@ -3014,8 +3018,14 @@ int main(int argc, char **argv)
+@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
          std::placeholders::_2,
          std::placeholders::_3
      ));
@@ -29,18 +29,29 @@ index 11dd82c3..311495a8 100644
 +    shutdown_handler = [&](int) {
 +        llama.queue_tasks.terminate();
 +    };
-+    signal(SIGTERM, signal_handler);
-+
-+    signal(SIGINT, signal_handler);
++#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
++    struct sigaction sigint_action;
++    sigint_action.sa_handler = signal_handler;
++    sigemptyset (&sigint_action.sa_mask);
++    sigint_action.sa_flags = 0;
++    sigaction(SIGINT, &sigint_action, NULL);
++#elif defined (_WIN32)
++    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
++        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
++    };
++    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
++#endif
 +    llama.queue_tasks.start_loop();
 +    svr.stop();
      t.join();
      llama_backend_free();
  diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 70cce072..9124869a 100644
+index 54854896..0ee670db 100644
  --- a/examples/server/utils.hpp
  +++ b/examples/server/utils.hpp
-@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
+@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
  struct llama_server_queue {
      int id = 0;
      std::mutex mutex_tasks;
@@ -48,7 +59,7 @@ index 70cce072..9124869a 100644
      // queues
      std::vector<task_server> queue_tasks;
      std::vector<task_server> queue_tasks_deferred;
-@@ -248,9 +249,18 @@ struct llama_server_queue {
+@@ -278,9 +279,18 @@ struct llama_server_queue {
          queue_tasks_deferred.clear();
      }
@@ -69,7 +80,7 @@ index 70cce072..9124869a 100644
      while (true) {
          // new task arrived
          LOG_VERBOSE("have new task", {});
-@@ -294,8 +304,12 @@ struct llama_server_queue {
+@@ -324,8 +334,12 @@ struct llama_server_queue {
      {
          std::unique_lock<std::mutex> lock(mutex_tasks);
          if (queue_tasks.empty()) {
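For readers who want the shutdown wiring in isolation, the sketch below is a minimal, self-contained rendering of the pattern the updated patch installs. It is not the patched server: llama.queue_tasks and the HTTP server are replaced by an atomic flag and a sleep loop, and only the signal registration mirrors the lines added above.

// Minimal standalone sketch of the shutdown wiring the updated patch uses.
// The real code terminates llama.queue_tasks; an atomic flag stands in here
// so the example compiles and runs on its own.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <functional>
#include <signal.h>
#include <thread>
#if defined(_WIN32)
#include <windows.h>
#endif

static std::function<void(int)> shutdown_handler;
static std::atomic<bool> running{true};

static void signal_handler(int signal) { shutdown_handler(signal); }

int main() {
    shutdown_handler = [](int) { running = false; };  // patch: llama.queue_tasks.terminate();

#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset(&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);          // 03-cudaleaks.diff also registers SIGUSR1
#elif defined(_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif

    while (running) {                                 // patch: llama.queue_tasks.start_loop();
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
    std::puts("shutting down cleanly");               // patch continues: svr.stop(); t.join(); llama_backend_free();
    return 0;
}

A side benefit of moving from signal() to sigaction() is well-defined, portable semantics: with plain signal(), whether the handler stays installed after delivery varies across platforms, whereas sigaction() behaves consistently.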
llm/patches/03-cudaleaks.diff (new file, 0 → 100644)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
}
};
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
struct llama_server_context
{
llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
+
}
bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
+static bool g_cublas_initialized = false;
+
GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
- if (!initialized) {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
return;
}
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+#endif
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -20,6 +20,9 @@ extern "C" {
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
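The ggml-cuda.cu change follows a simple pattern: the function-local `static bool initialized` in ggml_init_cublas() is promoted to a file-scope g_cublas_initialized flag so that the new ggml_free_cublas() can tear everything down (cublasDestroy for each device's handle, plus cuMemUnmap of the pooled device memory on non-HIP builds) and then reset the flag. The stripped-down sketch below shows only the shape of that init/free pairing; the CUDA/cuBLAS calls are stubbed with plain allocations and the names merely mirror the patch, so it is not the real backend code.

// Stripped-down sketch of the init/free pairing introduced by 03-cudaleaks.diff.
// CUDA/cuBLAS calls are replaced by stand-ins so the example is self-contained.
#include <cstdio>

constexpr int kMaxDevices = 16;
static bool  g_cublas_initialized = false;        // was a function-local static before the patch
static int   g_device_count = 0;
static void *g_cublas_handles[kMaxDevices] = {};  // stand-ins for cublasHandle_t

void init_cublas_sketch() {
    if (g_cublas_initialized) {
        return;                                   // idempotent, as before
    }
    g_device_count = 2;                           // stand-in for cudaGetDeviceCount()
    for (int id = 0; id < g_device_count; ++id) {
        g_cublas_handles[id] = new int(id);       // stand-in for cublasCreate()
    }
    g_cublas_initialized = true;
}

void free_cublas_sketch() {
    for (int id = 0; id < g_device_count; ++id) {
        // real patch: cuMemUnmap() the device pool, then cublasDestroy() the handle
        delete static_cast<int *>(g_cublas_handles[id]);
        g_cublas_handles[id] = nullptr;
    }
    g_cublas_initialized = false;                 // a later init can run again
}

int main() {
    init_cublas_sketch();
    free_cublas_sketch();
    std::puts("per-device handles and pools released");
    return 0;
}

Resetting the flag instead of leaving a sticky local static matters if the backend is ever brought back up in the same process after a free; with the old local static, a second ggml_init_cublas() call would have been a silent no-op.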