OpenDAS / ollama / Commits / 0e4669b0

Unverified commit 0e4669b0, authored Mar 08, 2024 by Jeffrey Morgan, committed by GitHub on Mar 08, 2024

update llama.cpp submodule to `6cdabe6` (#2999)

Parent: b886bec3

Showing 4 changed files with 32 additions and 33 deletions (+32 -33)
llm/ext_server/ext_server.cpp   +12  -12
llm/llama.cpp                    +1   -1
llm/patches/01-cache.diff        +6   -8
llm/patches/02-cudaleaks.diff   +13  -12
llm/ext_server/ext_server.cpp  (+12 -12)

@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-llama_server_context *llama = NULL;
+server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new llama_server_context;
+    llama = new server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;
@@ -144,13 +144,13 @@ void llama_server_start() {
       LOG_TEE("llama server main loop starting\n");
       ggml_time_init();
       llama->queue_tasks.on_new_task(std::bind(
-          &llama_server_context::process_single_task, llama, std::placeholders::_1));
+          &server_context::process_single_task, llama, std::placeholders::_1));
       llama->queue_tasks.on_finish_multitask(std::bind(
-          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+          &server_context::on_finish_multitask, llama, std::placeholders::_1));
       llama->queue_tasks.on_run_slots(std::bind(
-          &llama_server_context::update_slots, llama));
+          &server_context::update_slots, llama));
       llama->queue_results.on_multitask_update(std::bind(
-          &llama_server_queue::update_multitask,
+          &server_queue::update_multitask,
           &llama->queue_tasks,
           std::placeholders::_1,
           std::placeholders::_2,
@@ -198,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, data, false, false, -1);
+    llama->request_completion(resp->id, -1, data, false, false);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {
@@ -216,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
+    server_task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;
@@ -363,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
     }
     const int task_id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(task_id);
-    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
     atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
-    std::string result_json = result.result_json.dump();
+    server_task_result result = llama->queue_results.recv(task_id);
+    std::string result_json = result.data.dump();
     const std::string::size_type size = result_json.size() + 1;
     *json_resp = new char[size];
     snprintf(*json_resp, size, "%s", result_json.c_str());
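Note on the ext_server.cpp changes above: the upstream llama.cpp bump renames llama_server_context to server_context (and llama_server_queue to server_queue, task_result to server_task_result, with the result payload moving from .result_json to .data), and it reorders request_completion so the multitask id (-1 here) follows the task id instead of coming last. The renames have to be repeated in every std::bind call because a pointer-to-member expression spells out the owning type. Below is a minimal, self-contained sketch of that wiring pattern; the types are toy stand-ins, not the real llama.cpp classes.

// Toy illustration only: hypothetical stand-in types, not the real llama.cpp API.
#include <functional>
#include <iostream>

struct server_context {                       // previously named llama_server_context upstream
    void process_single_task(int id) { std::cout << "processing task " << id << "\n"; }
};

struct task_queue {
    std::function<void(int)> on_new_task_cb;
    void on_new_task(std::function<void(int)> cb) { on_new_task_cb = std::move(cb); }
    void post(int id) { if (on_new_task_cb) on_new_task_cb(id); }
};

int main() {
    server_context ctx;
    task_queue queue;
    // The member pointer names the type, which is why renaming the struct
    // forces every std::bind(&llama_server_context::..., ...) call to change.
    queue.on_new_task(std::bind(&server_context::process_single_task, &ctx,
                                std::placeholders::_1));
    queue.post(42);   // prints "processing task 42"
    return 0;
}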
llm/llama.cpp @ 6cdabe65  (+1 -1, compare c29af7e2...6cdabe65)

-Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160
+Subproject commit 6cdabe652695167263c8b447520987b11856f7ca
llm/patches/01-cache.diff  (+6 -8)

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..afac49af 100644
+index f255ad76..914ecfdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -997,13 +997,15 @@ struct llama_server_context
+@@ -1101,12 +1101,13 @@ struct server_context {
          slot.n_sent_text += result.text_to_send.size();
          // add the token to slot queue and cache
      }
 -    slot.add_token_string(result);
 +
-     if (slot.params.stream)
-     {
+     if (slot.params.stream) {
          send_partial_response(slot, result);
      }

 +    slot.add_token_string(result);
 +
-     if (incomplete)
-     {
+     if (incomplete) {
          slot.has_next_token = true;
      }
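The patch itself appears functionally unchanged: it still defers slot.add_token_string(result), so the token is appended to the slot's cached text only after the partial response has been streamed; the hunk is simply rebased onto the new server_context line numbers and brace style. A small self-contained sketch of that ordering, using toy stand-ins rather than the real llama.cpp slot API:

// Toy stand-ins only; the real code lives in examples/server/server.cpp.
#include <iostream>
#include <string>

struct toy_slot {
    std::string cached_text;   // stands in for the slot's generated-text cache
    bool stream = true;
};

static void send_partial_response(const toy_slot &, const std::string &tok) {
    std::cout << "streamed: " << tok << "\n";
}

static void add_token_string(toy_slot &slot, const std::string &tok) {
    slot.cached_text += tok;   // corresponds to slot.add_token_string(result)
}

int main() {
    toy_slot slot;
    const std::string token = "hello";
    if (slot.stream) {
        send_partial_response(slot, token);   // patched order: stream first ...
    }
    add_token_string(slot, token);            // ... then record in the cache
    std::cout << "cached: " << slot.cached_text << "\n";
    return 0;
}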
llm/patches/02-cudaleaks.diff  (+13 -12)

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..25857bdd 100644
+index f255ad76..5b83acb1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -31,6 +31,10 @@
- #include <atomic>
+@@ -28,6 +28,10 @@
+ #include <thread>
  #include <signal.h>
 +#ifdef GGML_USE_CUBLAS
@@ -12,18 +12,19 @@ index 2b2f4a0f..25857bdd 100644
 +
  using json = nlohmann::json;
- struct server_params {
-@@ -363,6 +367,9 @@ struct llama_server_context
+ bool server_verbose = false;
+@@ -648,6 +652,10 @@ struct server_context {
          llama_free_model(model);
          model = nullptr;
      }
 +
 +#ifdef GGML_USE_CUBLAS
 +    ggml_free_cublas();
 +#endif
  }
- bool load_model(const gpt_params &params_)
-@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
+ bool load_model(const gpt_params &params_) {
+@@ -3339,6 +3347,7 @@ int main(int argc, char **argv) {
      sigemptyset (&sigint_action.sa_mask);
      sigint_action.sa_flags = 0;
      sigaction(SIGINT, &sigint_action, NULL);
@@ -32,7 +33,7 @@ index 2b2f4a0f..25857bdd 100644
  auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
      return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
  diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 0c6501e9..75c12723 100644
+index 72bcec8c..50a45e3d 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
 @@ -43,6 +43,7 @@
@@ -43,7 +44,7 @@ index 0c6501e9..75c12723 100644
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }
@@ -57,7 +58,7 @@ index 0c6501e9..75c12723 100644
  #ifdef __HIP_PLATFORM_AMD__
  // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
      if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -66,7 +67,7 @@ index 0c6501e9..75c12723 100644
          g_cublas_loaded = false;
          fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
          return;
-@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
          // configure logging to stdout
          // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -75,7 +76,7 @@ index 0c6501e9..75c12723 100644
          g_cublas_loaded = true;
      }
  }
-@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
      }
      return device_count;
  }
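As the filename suggests, this patch exists to keep CUDA memory from leaking when the embedded server is torn down: the rebased hunks keep the same GGML_USE_CUBLAS-guarded ggml_free_cublas() call in the context cleanup path, plus the matching changes in ggml-cuda.cu. A minimal sketch of that guard pattern follows; toy_server_context is hypothetical, and ggml_free_cublas() is the function the patch itself declares and implements.

// Toy sketch only: toy_server_context is hypothetical; in the real patch the
// guarded call sits in llama.cpp's server context teardown.
#include <cstdio>

#ifdef GGML_USE_CUBLAS
extern "C" void ggml_free_cublas(void);   // provided by the patched ggml-cuda.cu
#endif

struct toy_server_context {
    ~toy_server_context() {
        std::puts("freeing model and batch");
#ifdef GGML_USE_CUBLAS
        // Only compiled into CUDA builds; presumably releases cuBLAS state so
        // repeated load/unload cycles in one process do not leak GPU memory.
        ggml_free_cublas();
#endif
    }
};

int main() {
    toy_server_context ctx;   // cleanup runs when ctx goes out of scope
    return 0;
}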