OpenDAS / ollama: commit 560be5e0 (Unverified)
Authored Mar 25, 2024 by Daniel Hiltgen; committed by GitHub on Mar 25, 2024

Merge pull request #3308 from dhiltgen/bump_more

Bump llama.cpp to b2510

Parents: 4a1c76b3, 3e30c75f
Showing 5 changed files with 7 additions and 184 deletions (+7 -184):

llm/ext_server/server.cpp          +6   -0
llm/llama.cpp                      +1   -1
llm/patches/01-cache.diff          +0   -21
llm/patches/02-cudaleaks.diff      +0   -117
llm/patches/05-fix-clip-free.diff  +0   -45
llm/ext_server/server.cpp

@@ -343,6 +343,12 @@ struct llama_server_context
     ~llama_server_context()
     {
+        if (clp_ctx)
+        {
+            LOG_INFO("freeing clip model", {});
+            clip_free(clp_ctx);
+            clp_ctx = nullptr;
+        }
         if (ctx)
         {
             llama_free(ctx);
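This destructor change folds the former 05-fix-clip-free patch (deleted below) directly into ollama's vendored server.cpp: the CLIP context used for multimodal models is checked, freed, and nulled before the llama context is released. A minimal sketch of that guarded-free pattern follows; Resource, acquire, release, and Holder are illustrative names, not llama.cpp API.

// Guarded-free pattern: check, release, null.
struct Resource { int id; };

Resource *acquire(int id) { return new Resource{id}; }
void release(Resource *r) { delete r; }

struct Holder {
    Resource *res = nullptr;

    ~Holder() {
        if (res) {          // guard: only free what was actually acquired
            release(res);
            res = nullptr;  // null out so any later pass is a no-op
        }
    }
};

int main() {
    Holder h;
    h.res = acquire(42);
    return 0;               // ~Holder releases the resource exactly once
}

Nulling the pointer after release keeps the teardown idempotent, which matters when a destructor can run after a partially failed model load.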
llm/llama.cpp @ 1b26aebe (submodule updated: ceca1aef → 1b26aebe)

-Subproject commit ceca1aef0738b57951cd12c603c3477e75312dec
+Subproject commit 1b26aebe4de4f048ac99996efd8a2c9af150904d
llm/patches/01-cache.diff deleted (100644 → 0)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..3e82acb9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -997,13 +997,15 @@ struct llama_server_context
                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
-            slot.add_token_string(result);
+
             if (slot.params.stream)
             {
                 send_partial_response(slot, result);
             }
         }
 
+        slot.add_token_string(result);
+
         if (incomplete)
         {
             slot.has_next_token = true;
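This deleted patch had reordered upstream's token handling so that send_partial_response runs before the token is appended to the slot's cache; with the bump to b2510 ollama no longer carries it. Below is a hedged sketch of the reordered flow, where Slot, Token, and send_partial_response are simplified stand-ins rather than the real llama.cpp server types.

#include <iostream>
#include <string>
#include <vector>

struct Token { std::string text; };

struct Slot {
    bool stream = true;
    std::vector<Token> cache;
    void add_token_string(const Token &t) { cache.push_back(t); }
};

void send_partial_response(const Slot &, const Token &t) {
    std::cout << t.text << std::flush;   // emit the chunk to the client
}

void process_token(Slot &slot, const Token &result) {
    if (slot.stream) {
        send_partial_response(slot, result);  // client sees the token first
    }
    slot.add_token_string(result);            // then it is cached (patched order)
}

int main() {
    Slot slot;
    process_token(slot, Token{"hello"});
    std::cout << '\n';
    return 0;
}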
llm/patches/02-cudaleaks.diff deleted (100644 → 0)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..53bf39c1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,6 +31,10 @@
#include <atomic>
#include <signal.h>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
using json = nlohmann::json;
struct server_params {
@@ -363,6 +367,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
}
bool load_model(const gpt_params &params_)
@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 72bcec8c..6c934e8c 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,6 +43,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
-GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
+static bool g_cublas_initialized = false;
- if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ if (g_device_caps[id].vmm) {
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+ }
+#endif
+ // TODO: free legacy non-vmm memory
+ // destroy cublas handle
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -23,6 +23,9 @@
GGML_API GGML_CALL void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
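The dropped 02-cudaleaks patch added an explicit teardown path: ggml_free_cublas() unmaps the VMM memory pools and destroys the per-device cuBLAS handles, and the static `initialized` flag inside ggml_init_cublas() is promoted to the file-scope g_cublas_initialized so teardown can reset it and re-initialization can succeed. A minimal sketch of that init/free lifecycle follows, using plain ints in place of cuBLAS handles; none of these names are the real ggml or cuBLAS API.

// The "initialized" flag lives at file scope so teardown can reset it
// and a later init can run again.
static bool g_initialized  = false;
static int  g_handles[16]  = {0};
static int  g_device_count = 0;

void backend_init(int devices) {
    if (g_initialized) return;       // idempotent, like ggml_init_cublas
    g_device_count = devices;
    for (int id = 0; id < g_device_count; ++id)
        g_handles[id] = id + 1;      // stand-in for cublasCreate(&handle)
    g_initialized = true;
}

void backend_free(void) {
    for (int id = 0; id < g_device_count; ++id)
        g_handles[id] = 0;           // stand-in for cublasDestroy(handle)
    g_device_count = 0;
    g_initialized = false;           // a later backend_init can re-run
}

int main() {
    backend_init(2);
    backend_free();                  // e.g. on SIGUSR1, as the patch wired up
    backend_init(2);                 // re-initialization now succeeds
    return 0;
}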
llm/patches/05-fix-clip-free.diff deleted (100644 → 0)
From 9192432daf90b1bfec75577434a99b4ea70d54c8 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 14 Mar 2024 12:09:50 -0700
Subject: [PATCH] fix clip free
---
examples/llava/clip.cpp | 4 ++++
examples/server/server.cpp | 6 ++++++
2 files changed, 10 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ef9e4ba7..b4ddfe6b 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1673,6 +1673,10 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 
+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_buffer_free(ctx->compute_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..f927336b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -353,6 +353,12 @@ struct llama_server_context
     ~llama_server_context()
     {
+        if (clp_ctx)
+        {
+            LOG_INFO("freeing clip model", {});
+            clip_free(clp_ctx);
+            clp_ctx = nullptr;
+        }
         if (ctx)
         {
             llama_free(ctx);
--
2.43.2
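The clip.cpp half of this deleted patch made clip_free release the backend buffers and allocator that the context owns before deleting it; without that, each unload of a multimodal model leaked those buffers. A small sketch of the rule it enforces, with malloc standing in for backend buffer allocation; ctx_t and its fields are illustrative, not the real clip_ctx layout.

#include <cstdlib>

struct ctx_t {
    void *ctx_data;
    void *params_buffer;
    void *compute_buffer;
};

ctx_t *ctx_new() {
    ctx_t *c = new ctx_t;
    c->ctx_data       = std::malloc(64);
    c->params_buffer  = std::malloc(64);
    c->compute_buffer = std::malloc(64);
    return c;
}

void ctx_free(ctx_t *c) {
    std::free(c->ctx_data);
    std::free(c->params_buffer);   // before the fix, buffers like these leaked
    std::free(c->compute_buffer);
    delete c;                      // deleting the struct alone is not enough
}

int main() {
    ctx_free(ctx_new());
    return 0;
}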