OpenDAS / ollama / Commits / 0e4669b0

Commit 0e4669b0 (unverified), authored Mar 08, 2024 by Jeffrey Morgan, committed by GitHub Mar 08, 2024
Parent: b886bec3

update llama.cpp submodule to `6cdabe6` (#2999)

Showing 4 changed files with 32 additions and 33 deletions (+32 -33)
llm/ext_server/ext_server.cpp   +12 -12
llm/llama.cpp                   +1  -1
llm/patches/01-cache.diff       +6  -8
llm/patches/02-cudaleaks.diff   +13 -12
llm/ext_server/ext_server.cpp

@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-llama_server_context *llama = NULL;
+server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new llama_server_context;
+    llama = new server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;
@@ -144,13 +144,13 @@ void llama_server_start() {
       LOG_TEE("llama server main loop starting\n");
       ggml_time_init();
       llama->queue_tasks.on_new_task(std::bind(
-          &llama_server_context::process_single_task, llama, std::placeholders::_1));
+          &server_context::process_single_task, llama, std::placeholders::_1));
       llama->queue_tasks.on_finish_multitask(std::bind(
-          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+          &server_context::on_finish_multitask, llama, std::placeholders::_1));
       llama->queue_tasks.on_run_slots(std::bind(
-          &llama_server_context::update_slots, llama));
+          &server_context::update_slots, llama));
       llama->queue_results.on_multitask_update(std::bind(
-          &llama_server_queue::update_multitask,
+          &server_queue::update_multitask,
           &llama->queue_tasks,
           std::placeholders::_1,
           std::placeholders::_2,
@@ -198,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, data, false, false, -1);
+    llama->request_completion(resp->id, -1, data, false, false);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {
@@ -216,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
+    server_task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;
@@ -363,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
   }
   const int task_id = llama->queue_tasks.get_new_id();
   llama->queue_results.add_waiting_task_id(task_id);
-  llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+  llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
   atomicRecv ar(recv_counter);
-  task_result result = llama->queue_results.recv(task_id);
-  std::string result_json = result.result_json.dump();
+  server_task_result result = llama->queue_results.recv(task_id);
+  std::string result_json = result.data.dump();
   const std::string::size_type size = result_json.size() + 1;
   *json_resp = new char[size];
   snprintf(*json_resp, size, "%s", result_json.c_str());
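In short, the ext_server.cpp hunks track renames in the updated llama.cpp server example: llama_server_context becomes server_context, llama_server_queue becomes server_queue, task_result becomes server_task_result, the result payload field result_json becomes data, and request_completion now takes the multitask id (-1) as its second argument instead of its last. Below is a minimal sketch of the std::bind callback wiring used above; the struct definitions are simplified stand-ins for illustration, not the real llama.cpp types.

// Sketch of binding a member function to a specific instance, as in llama_server_start().
// server_queue / server_context here are assumed, simplified types.
#include <functional>
#include <iostream>

struct task { int id; };

struct server_queue {
    std::function<void(task &)> new_task_cb;
    void on_new_task(std::function<void(task &)> cb) { new_task_cb = std::move(cb); }
    void post(task t) { if (new_task_cb) new_task_cb(t); }  // deliver a task to the registered callback
};

struct server_context {
    server_queue queue_tasks;
    void process_single_task(task &t) { std::cout << "processing task " << t.id << "\n"; }
};

int main() {
    server_context *llama = new server_context;
    // same pattern as the diff: bind the member function to the instance,
    // leaving the task argument as a placeholder
    llama->queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, llama, std::placeholders::_1));
    llama->queue_tasks.post(task{42});
    delete llama;
    return 0;
}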
llm/llama.cpp @ 6cdabe65 (compare c29af7e2...6cdabe65)

-Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160
+Subproject commit 6cdabe652695167263c8b447520987b11856f7ca
llm/patches/01-cache.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..afac49af 100644
+index f255ad76..914ecfdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -997,13 +997,15 @@ struct llama_server_context
+@@ -1101,12 +1101,13 @@ struct server_context {
                  slot.n_sent_text += result.text_to_send.size();
                  // add the token to slot queue and cache
              }
 -            slot.add_token_string(result);
-+
-             if (slot.params.stream)
-             {
+             if (slot.params.stream) {
                  send_partial_response(slot, result);
              }
          }
 
 +        slot.add_token_string(result);
 +
-         if (incomplete)
-         {
+         if (incomplete) {
              slot.has_next_token = true;
+         }
llm/patches/02-cudaleaks.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..25857bdd 100644
+index f255ad76..5b83acb1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -31,6 +31,10 @@
+@@ -28,6 +28,10 @@
- #include <atomic>
+ #include <thread>
  #include <signal.h>
 +#ifdef GGML_USE_CUBLAS
@@ -12,18 +12,19 @@ index 2b2f4a0f..25857bdd 100644
 +
  using json = nlohmann::json;
- struct server_params {
+ bool server_verbose = false;
-@@ -363,6 +367,9 @@ struct llama_server_context
+@@ -648,6 +652,10 @@ struct server_context {
      llama_free_model(model);
      model = nullptr;
  }
++
 +#ifdef GGML_USE_CUBLAS
 +    ggml_free_cublas();
 +#endif
  }
- bool load_model(const gpt_params &params_)
+ bool load_model(const gpt_params & params_) {
-@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
+@@ -3339,6 +3347,7 @@ int main(int argc, char ** argv) {
      sigemptyset (&sigint_action.sa_mask);
      sigint_action.sa_flags = 0;
      sigaction(SIGINT, &sigint_action, NULL);
@@ -32,7 +33,7 @@ index 2b2f4a0f..25857bdd 100644
  auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
      return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
  diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 0c6501e9..75c12723 100644
+index 72bcec8c..50a45e3d 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
  @@ -43,6 +43,7 @@
@@ -43,7 +44,7 @@ index 0c6501e9..75c12723 100644
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
  return g_cublas_loaded;
  }
@@ -57,7 +58,7 @@ index 0c6501e9..75c12723 100644
  #ifdef __HIP_PLATFORM_AMD__
  // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
  if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -66,7 +67,7 @@ index 0c6501e9..75c12723 100644
  g_cublas_loaded = false;
  fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
  return;
-@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
  // configure logging to stdout
  // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -75,7 +76,7 @@ index 0c6501e9..75c12723 100644
  g_cublas_loaded = true;
  }
  }
-@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
  }
  return device_count;
  }
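The intent of 02-cudaleaks.diff is unchanged by this commit: the patch still declares ggml_free_cublas() and calls it from the server context's cleanup path under GGML_USE_CUBLAS so CUDA/cuBLAS resources are released on shutdown; the changes above only rebase the patch onto the new llama.cpp line numbers and the renamed server_context. A minimal sketch of that conditional-compilation cleanup pattern, using stubs rather than the real ggml implementation:

// Sketch of the #ifdef GGML_USE_CUBLAS teardown pattern the patch applies.
// ggml_free_cublas() is a stub here; in the real patch it is implemented in
// ggml-cuda.cu and releases the CUDA/cuBLAS resources held by the backend.
#include <cstdio>

#ifdef GGML_USE_CUBLAS
extern "C" void ggml_free_cublas(void) {
    std::puts("releasing cuBLAS resources");
}
#endif

struct server_context_sketch {
    ~server_context_sketch() {
        // mirrors the cleanup change: after freeing the model state, tear down
        // the CUDA backend only when it was compiled in
#ifdef GGML_USE_CUBLAS
        ggml_free_cublas();
#endif
    }
};

int main() {
    server_context_sketch ctx;  // cleanup runs when ctx goes out of scope
    return 0;
}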