OpenDAS / ollama · Commit 4613a080 (Unverified)

update llama.cpp submodule to `66c1968f7` (#2618)

Authored Feb 20, 2024 by Jeffrey Morgan; committed by GitHub on Feb 20, 2024
Parent: ace2cdf1
Showing 6 changed files with 39 additions and 130 deletions
llm/dyn_ext_server.go            +6   -1
llm/ext_server/ext_server.cpp    +3   -2
llm/ext_server/ext_server.h      +1   -1
llm/llama.cpp                    +1   -1
llm/patches/02-cudaleaks.diff    +28  -29
llm/patches/02-shutdown.diff     +0   -96
llm/dyn_ext_server.go

@@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	sparams.memory_f16 = C.bool(opts.F16KV)
 	sparams.use_mlock = C.bool(opts.UseMLock)
 	sparams.use_mmap = C.bool(opts.UseMMap)
-	sparams.numa = C.bool(opts.UseNUMA)
+	if opts.UseNUMA {
+		sparams.numa = C.int(1)
+	} else {
+		sparams.numa = C.int(0)
+	}
 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
llm/ext_server/ext_server.cpp

@@ -80,7 +80,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     params.main_gpu = sparams->main_gpu;
     params.use_mlock = sparams->use_mlock;
     params.use_mmap = sparams->use_mmap;
-    params.numa = sparams->numa;
+    params.numa = (ggml_numa_strategy)sparams->numa;
     params.embedding = sparams->embedding;
     if (sparams->model != NULL) {
       params.model = sparams->model;
@@ -111,7 +111,8 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     }
 #endif
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);
     // load the model
     if (!llama->load_model(params)) {
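For context on the hunk above: the llama.cpp revision this commit pins splits backend initialization from NUMA setup, and the NUMA argument becomes a numeric strategy rather than an on/off bool. Below is a minimal caller-side sketch of the new sequence, assuming the llama.h declarations around submodule 66c1968f (llama_backend_init(void), llama_numa_init(ggml_numa_strategy)); it is illustrative, not the actual ollama code.

// Sketch only: initializing the backend after the API split this hunk adapts to.
// Assumes llama.h from llama.cpp at roughly 66c1968f; other revisions may differ.
#include "llama.h"

static void backend_init(int numa_flag) {
    // Before: llama_backend_init(bool numa)
    // After:  backend init and NUMA init are separate calls, and the NUMA
    //         value is an enum strategy (0 disables NUMA optimizations).
    llama_backend_init();
    llama_numa_init((ggml_numa_strategy) numa_flag);
}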
llm/ext_server/ext_server.h

@@ -41,7 +41,7 @@ typedef struct ext_server_params {
   int32_t main_gpu;         // the GPU that is used for scratch and small tensors
   bool use_mlock;           // force system to keep model in RAM
   bool use_mmap;            // use mmap if possible
-  bool numa;                // attempt optimizations that help on some NUMA systems
+  int numa;                 // attempt optimizations that help on some NUMA systems
   bool embedding;           // get only sentence embedding
   ext_server_lora_adapter_t *lora_adapters;
   char *mmproj;
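The field widens from bool to int because it now carries a ggml_numa_strategy value rather than a plain flag, matching the cast added in ext_server.cpp above. For reference, a sketch of that enum roughly as it appears in ggml.h around this llama.cpp revision; the exact member names and values here are an assumption, so check the vendored submodule before relying on them.

// Assumed shape of the upstream enum (ggml.h, llama.cpp ~66c1968f); illustrative only.
enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,  // what the Go side passes when UseNUMA is false
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,  // what the Go side passes when UseNUMA is true
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT
};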
llm/llama.cpp @ 66c1968f (submodule)

Compare 6c00a066 ... 66c1968f
-Subproject commit 6c00a066928b0475b865a2e3e709e2166e02d548
+Subproject commit 66c1968f7a2e895675425e875b6589f1233a1b52
llm/patches/03-cudaleaks.diff → llm/patches/02-cudaleaks.diff (renamed)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3102762c..568ac1d0 100644
index 7800c6e7..be30db23 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -307,6 +307,10 @@ struct llama_client_slot
     }
 };
@@ -30,6 +30,10 @@
 #include <atomic>
 #include <signal.h>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
 struct llama_server_context
 {
     llama_model *model = nullptr;
@@ -353,6 +357,10 @@ struct llama_server_context
 using json = nlohmann::json;
 struct server_params
@@ -353,6 +357,9 @@ struct llama_server_context
     llama_free_model(model);
     model = nullptr;
 }
+#ifdef GGML_USE_CUBLAS
+    ggml_free_cublas();
+#endif
+
 }
 bool load_model(const gpt_params &params_)
@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
@@ -3143,6 +3150,7 @@ int main(int argc, char **argv)
     sigemptyset (&sigint_action.sa_mask);
     sigint_action.sa_flags = 0;
     sigaction(SIGINT, &sigint_action, NULL);
...
@@ -32,13 +31,8 @@ index 3102762c..568ac1d0 100644
 #elif defined (_WIN32)
     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
     llama_backend_free();
     return 0;
 }
+
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 96976f24..3543920e 100644
index 933ebbc4..88a4f664 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,7 @@
...
@@ -49,30 +43,30 @@ index 96976f24..3543920e 100644
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
+static bool g_cublas_initialized = false;
+
 GGML_CALL void ggml_init_cublas() {
-GGML_CALL void ggml_init_cublas() {
-    static bool initialized = false;
+static bool g_cublas_initialized = false;
-    if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+    if (!g_cublas_initialized) {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif
     if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
-        initialized = true;
+        g_cublas_initialized = true;
         g_cublas_loaded = false;
         fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
         return;
     }
@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() {
     // configure logging to stdout
     // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
...
@@ -81,25 +75,30 @@ index 96976f24..3543920e 100644
         g_cublas_loaded = true;
     }
 }
@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+    for (int id = 0; id < g_device_count; ++id) {
+#if !defined(GGML_USE_HIPBLAS)
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+        if (g_device_caps[id].vmm) {
+            CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+            g_cuda_pool_size[id] = 0;
+            g_cuda_pool_addr[id] = 0;
+        }
+#endif
+        // TODO: free legacy non-vmm memory
+        // destroy cublas handle
+        CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+        g_cublas_handles[id] = nullptr;
+    }
+
+    g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..b4c80c2c 100644
--- a/ggml-cuda.h
...
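Beyond renumbering the hunks against the updated submodule, the substance of this patch is unchanged: the one-shot local static "initialized" guard in ggml_init_cublas() becomes a file-scope g_cublas_initialized flag that ggml_free_cublas() resets, so CUDA state can be torn down and rebuilt across model loads. A self-contained sketch of that guard pattern follows; the names mirror the patch, but the bodies are placeholders so the example compiles without CUDA.

// Illustrative only: the init/teardown guard introduced by 02-cudaleaks.diff,
// with the CUDA/cuBLAS work replaced by prints.
#include <cstdio>

static bool g_cublas_initialized = false;

void init_cublas_sketch() {
    if (!g_cublas_initialized) {
        std::puts("allocating cuBLAS handles and device pools");  // placeholder
        g_cublas_initialized = true;
    }
}

void free_cublas_sketch() {
    std::puts("destroying cuBLAS handles, unmapping vmm pools");  // placeholder
    g_cublas_initialized = false;  // key change: a later init runs again
}

int main() {
    init_cublas_sketch();
    free_cublas_sketch();
    init_cublas_sketch();  // re-initializes because the guard was reset
    return 0;
}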
llm/patches/02-shutdown.diff (deleted, 100644 → 0)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
#include <chrono>
#include <condition_variable>
#include <atomic>
+#include <signal.h>
using json = nlohmann::json;
@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
std::placeholders::_2,
std::placeholders::_3
));
- llama.queue_tasks.start_loop();
+ shutdown_handler = [&](int) {
+ llama.queue_tasks.terminate();
+ };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+ struct sigaction sigint_action;
+ sigint_action.sa_handler = signal_handler;
+ sigemptyset (&sigint_action.sa_mask);
+ sigint_action.sa_flags = 0;
+ sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+ return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+ };
+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+ llama.queue_tasks.start_loop();
+ svr.stop();
t.join();
llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
+ bool running;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
queue_tasks_deferred.clear();
}
- // Start the main loop. This call is blocking
- [[noreturn]]
+ // end the start_loop routine
+ void terminate() {
+ {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ running = false;
+ }
+ condition_tasks.notify_all();
+ }
+
+ // Start the main loop.
void start_loop() {
+ running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
+ if (!running) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
condition_tasks.wait(lock, [&]{
- return !queue_tasks.empty();
+ return (!queue_tasks.empty() || !running);
});
}
}
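This patch could be deleted because the pinned llama.cpp revision now ships equivalent shutdown handling upstream. The core mechanism it had added, a running flag plus a condition-variable wakeup so the blocking start_loop() can return instead of waiting forever, is sketched standalone below with simplified names and a trivial task type; it is not the actual server code.

// Standalone sketch of the terminate()/start_loop() pattern from the deleted patch.
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct task_queue {
    std::mutex              mutex_tasks;
    std::condition_variable condition_tasks;
    std::vector<int>        queue_tasks;
    bool                    running = false;

    // End the start_loop routine.
    void terminate() {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();
    }

    // Blocking main loop; returns once terminate() was called and the queue drained.
    void start_loop() {
        running = true;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            if (queue_tasks.empty()) {
                if (!running) {
                    std::puts("ending start_loop");
                    return;
                }
                condition_tasks.wait(lock, [&] {
                    return !queue_tasks.empty() || !running;
                });
            }
            // ... pop and process queued tasks here ...
            queue_tasks.clear();
        }
    }
};

int main() {
    task_queue q;
    std::thread worker([&] { q.start_loop(); });
    std::this_thread::sleep_for(std::chrono::milliseconds(50));  // demo only: let the loop start
    q.terminate();  // analogous to the patch's SIGINT -> shutdown_handler path
    worker.join();
    return 0;
}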