OpenDAS / ollama · Commit 0e4669b0 (unverified)

update llama.cpp submodule to `6cdabe6` (#2999)

Authored Mar 08, 2024 by Jeffrey Morgan, committed by GitHub on Mar 08, 2024.
Parent: b886bec3

Showing 4 changed files with 32 additions and 33 deletions.
Files changed:
  llm/ext_server/ext_server.cpp  +12 -12
  llm/llama.cpp                  +1 -1
  llm/patches/01-cache.diff      +6 -8
  llm/patches/02-cudaleaks.diff  +13 -12
llm/ext_server/ext_server.cpp (view file @ 0e4669b0)

@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-llama_server_context *llama = NULL;
+server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;

@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new llama_server_context;
+    llama = new server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;

@@ -144,13 +144,13 @@ void llama_server_start() {
       LOG_TEE("llama server main loop starting\n");
       ggml_time_init();
       llama->queue_tasks.on_new_task(std::bind(
-          &llama_server_context::process_single_task, llama, std::placeholders::_1));
+          &server_context::process_single_task, llama, std::placeholders::_1));
       llama->queue_tasks.on_finish_multitask(std::bind(
-          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+          &server_context::on_finish_multitask, llama, std::placeholders::_1));
       llama->queue_tasks.on_run_slots(std::bind(
-          &llama_server_context::update_slots, llama));
+          &server_context::update_slots, llama));
       llama->queue_results.on_multitask_update(std::bind(
-          &llama_server_queue::update_multitask,
+          &server_queue::update_multitask,
           &llama->queue_tasks,
           std::placeholders::_1,
           std::placeholders::_2,

@@ -198,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, data, false, false, -1);
+    llama->request_completion(resp->id, -1, data, false, false);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {

@@ -216,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
+    server_task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;

@@ -363,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
   }
   const int task_id = llama->queue_tasks.get_new_id();
   llama->queue_results.add_waiting_task_id(task_id);
-  llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+  llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
   atomicRecv ar(recv_counter);
-  task_result result = llama->queue_results.recv(task_id);
-  std::string result_json = result.result_json.dump();
+  server_task_result result = llama->queue_results.recv(task_id);
+  std::string result_json = result.data.dump();
   const std::string::size_type size = result_json.size() + 1;
   *json_resp = new char[size];
   snprintf(*json_resp, size, "%s", result_json.c_str());
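For context on the churn visible in the hunks above: the updated llama.cpp server drops the `llama_` prefixes (`llama_server_context` becomes `server_context`, `llama_server_queue` becomes `server_queue`, `task_result` becomes `server_task_result`, and the result payload moves from `result_json` to `data`), and `request_completion` now takes the trailing `-1` (apparently the multitask id in upstream's server example) right after the task id instead of at the end. The stand-alone sketch below only mirrors that call-shape change; every type and parameter name in it is a stub invented for illustration, not the real llama.cpp API.

```cpp
// Self-contained sketch of the request_completion reordering seen in the
// diff above. Types and parameter names are stand-in guesses, not upstream's.
#include <cstdio>
#include <string>

using json = std::string;  // stand-in for nlohmann::json

struct server_context {  // formerly llama_server_context
    // Assumed new order: (task_id, multitask_id, data, infill, embedding).
    // The old call shape was (task_id, data, infill, embedding, multitask_id).
    void request_completion(int task_id, int multitask_id, const json &data,
                            bool infill, bool embedding) {
        std::printf("task=%d multitask=%d infill=%d embedding=%d data=%s\n",
                    task_id, multitask_id, (int)infill, (int)embedding,
                    data.c_str());
    }
};

int main() {
    server_context llama;
    json data = R"({"prompt": "hello"})";
    // -1 meaning "not part of a multitask" matches the call sites above.
    llama.request_completion(/*task_id=*/0, /*multitask_id=*/-1, data,
                             /*infill=*/false, /*embedding=*/false);
    return 0;
}
```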
llm/llama.cpp @ 6cdabe65 (compare c29af7e2...6cdabe65)

-Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160
+Subproject commit 6cdabe652695167263c8b447520987b11856f7ca
llm/patches/01-cache.diff (view file @ 0e4669b0)

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..afac49af 100644
+index f255ad76..914ecfdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -997,13 +997,15 @@ struct llama_server_context
+@@ -1101,12 +1101,13 @@ struct server_context {
          slot.n_sent_text += result.text_to_send.size();
          // add the token to slot queue and cache
      }
 -    slot.add_token_string(result);
 +
-     if (slot.params.stream)
-     {
+     if (slot.params.stream) {
          send_partial_response(slot, result);
      }
 
 +    slot.add_token_string(result);
 +
-     if (incomplete)
-     {
+     if (incomplete) {
          slot.has_next_token = true;
      }
llm/patches/02-cudaleaks.diff (view file @ 0e4669b0)

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..25857bdd 100644
+index f255ad76..5b83acb1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -31,6 +31,10 @@
- #include <atomic>
+@@ -28,6 +28,10 @@
+ #include <thread>
  #include <signal.h>
 +#ifdef GGML_USE_CUBLAS
 ...
@@ -12,18 +12,19 @@ index 2b2f4a0f..25857bdd 100644
 +
  using json = nlohmann::json;
 
  struct server_params {
-@@ -363,6 +367,9 @@ struct llama_server_context
- bool server_verbose = false;
+@@ -648,6 +652,10 @@ struct server_context {
          llama_free_model(model);
          model = nullptr;
      }
 +
 +#ifdef GGML_USE_CUBLAS
 +    ggml_free_cublas();
 +#endif
  }
- bool load_model(const gpt_params &params_)
-@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
+ bool load_model(const gpt_params &params_) {
+@@ -3339,6 +3347,7 @@ int main(int argc, char **argv) {
      sigemptyset (&sigint_action.sa_mask);
      sigint_action.sa_flags = 0;
      sigaction(SIGINT, &sigint_action, NULL);
 ...
@@ -32,7 +33,7 @@ index 2b2f4a0f..25857bdd 100644
      auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
          return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 0c6501e9..75c12723 100644
+index 72bcec8c..50a45e3d 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
 @@ -43,6 +43,7 @@
 ...
@@ -43,7 +44,7 @@ index 0c6501e9..75c12723 100644
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }
 ...
@@ -57,7 +58,7 @@ index 0c6501e9..75c12723 100644
  #ifdef __HIP_PLATFORM_AMD__
    // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
      if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
 ...
@@ -66,7 +67,7 @@ index 0c6501e9..75c12723 100644
          g_cublas_loaded = false;
          fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
          return;
-@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
      // configure logging to stdout
      // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 ...
@@ -75,7 +76,7 @@ index 0c6501e9..75c12723 100644
          g_cublas_loaded = true;
      }
  }
-@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
  }
      return device_count;
  }
 ...
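Both halves of this patch revolve around one idea: a `ggml_free_cublas()` hook (its definition appears to be the block added to ggml-cuda.cu) is called from the server teardown path, guarded by `GGML_USE_CUBLAS` so CPU-only builds compile unchanged. The snippet below is only a rough, self-contained illustration of that guarded-teardown pattern; `ggml_free_cublas()` is stubbed here and is not the patch's actual implementation.

```cpp
// Illustration of the #ifdef-guarded cleanup pattern used by 02-cudaleaks.diff.
// The stub stands in for the function the patch adds to ggml-cuda.cu, which
// presumably releases CUDA/cuBLAS state that would otherwise leak on unload.
#include <cstdio>

#ifdef GGML_USE_CUBLAS
static void ggml_free_cublas() {
    std::puts("releasing cuBLAS/CUDA resources (stub)");
}
#endif

struct server_context {
    ~server_context() {
        // ... free model, context, batch (as in the surrounding context lines) ...
#ifdef GGML_USE_CUBLAS
        ggml_free_cublas();  // compiled only into CUDA-enabled builds
#endif
    }
};

int main() {
    server_context ctx;  // guarded cleanup runs when ctx goes out of scope
    return 0;
}
```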