OpenDAS / ollama · commit 369eda65 (unverified)

update llama.cpp submodule to `ceca1ae` (#3064)

Authored Mar 11, 2024 by Jeffrey Morgan; committed by GitHub, Mar 11, 2024.
Parent commit: f878e910
Showing 6 changed files with 36 additions and 61 deletions (+36 −61):

    llm/ext_server/ext_server.cpp    +13 −13
    llm/generate/gen_darwin.sh        +0 −15
    llm/llama.cpp                     +1 −1
    llm/patches/01-cache.diff         +8 −6
    llm/patches/02-cudaleaks.diff    +14 −13
    llm/patches/03-locale.diff        +0 −13
llm/ext_server/ext_server.cpp

@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS
 // Expose the llama server as a callable extern "C" API
-server_context *llama = NULL;
+llama_server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;

@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new server_context;
+    llama = new llama_server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;

@@ -125,7 +125,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
       return;
     }
-    llama->init();
+    llama->initialize();
   } catch (std::exception &e) {
     err->id = -1;
     snprintf(err->msg, err->msg_len, "exception %s", e.what());

@@ -144,13 +144,13 @@ void llama_server_start() {
   LOG_TEE("llama server main loop starting\n");
   ggml_time_init();
   llama->queue_tasks.on_new_task(std::bind(
-      &server_context::process_single_task, llama, std::placeholders::_1));
+      &llama_server_context::process_single_task, llama, std::placeholders::_1));
   llama->queue_tasks.on_finish_multitask(std::bind(
-      &server_context::on_finish_multitask, llama, std::placeholders::_1));
+      &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
   llama->queue_tasks.on_run_slots(std::bind(
-      &server_context::update_slots, llama));
+      &llama_server_context::update_slots, llama));
   llama->queue_results.on_multitask_update(std::bind(
-      &server_queue::update_multitask, &llama->queue_tasks,
+      &llama_server_queue::update_multitask, &llama->queue_tasks,
       std::placeholders::_1, std::placeholders::_2,

@@ -198,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, -1, data, false, false);
+    llama->request_completion(resp->id, data, false, false, -1);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {

@@ -216,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    server_task_result result = llama->queue_results.recv(task_id);
+    task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;

@@ -363,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
   }
   const int task_id = llama->queue_tasks.get_new_id();
   llama->queue_results.add_waiting_task_id(task_id);
-  llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
+  llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
   atomicRecv ar(recv_counter);
-  server_task_result result = llama->queue_results.recv(task_id);
+  task_result result = llama->queue_results.recv(task_id);
-  std::string result_json = result.data.dump();
+  std::string result_json = result.result_json.dump();
   const std::string::size_type size = result_json.size() + 1;
   *json_resp = new char[size];
   snprintf(*json_resp, size, "%s", result_json.c_str());
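Note: the @@ -144 hunk only re-points std::bind at the renamed llama_server_context and llama_server_queue types; the callback-wiring idiom itself is unchanged. A minimal, self-contained sketch of that idiom follows. TaskQueue and ServerContext here are illustrative stand-ins, not the llama.cpp classes.

#include <functional>
#include <iostream>

// Stand-in for the server's task queue: stores a callback and invokes it per task.
struct TaskQueue {
    std::function<void(int)> on_new_task;
    void submit(int task_id) { if (on_new_task) on_new_task(task_id); }
};

// Stand-in for the server context object that owns the queue.
struct ServerContext {
    TaskQueue queue_tasks;
    void process_single_task(int task_id) {
        std::cout << "processing task " << task_id << "\n";
    }
};

int main() {
    ServerContext *llama = new ServerContext;
    // Same registration idiom as ext_server.cpp: bind a member function to the
    // owning object and hand the result to the queue as its callback.
    llama->queue_tasks.on_new_task =
        std::bind(&ServerContext::process_single_task, llama, std::placeholders::_1);
    llama->queue_tasks.submit(42);
    delete llama;
    return 0;
}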
llm/generate/gen_darwin.sh

@@ -18,19 +18,6 @@ sign() {
     fi
 }
 
-# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
-bundle_metal() {
-    grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
-    echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
-    cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
-    cat "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
-    rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
-}
-
-cleanup_metal() {
-    (cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
-}
-
 COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
 
 case "${GOARCH}" in

@@ -76,11 +63,9 @@ case "${GOARCH}" in
         CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
         BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
-        bundle_metal
         build
         sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
         compress_libs
-        cleanup_metal
         ;;
     *)
         echo "GOARCH must be set"
llm/llama.cpp @ ceca1aef (compare 77d1ac7e...ceca1aef)

-Subproject commit 77d1ac7e00bf049b9f2bba1b5a310a78318c49c4
+Subproject commit ceca1aef0738b57951cd12c603c3477e75312dec
llm/patches/01-cache.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index f255ad76..914ecfdd 100644
+index 8fe5e0b1..3e82acb9 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1101,12 +1101,13 @@ struct server_context {
+@@ -997,13 +997,15 @@ struct llama_server_context
          slot.n_sent_text += result.text_to_send.size();
          // add the token to slot queue and cache
      }
 -    slot.add_token_string(result);
-     if (slot.params.stream) {
++
+     if (slot.params.stream)
+     {
          send_partial_response(slot, result);
      }
  }
 
 +    slot.add_token_string(result);
 +
-     if (incomplete) {
+     if (incomplete)
+     {
          slot.has_next_token = true;
-     }
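Note: in both the old and the updated version, the patch moves slot.add_token_string(result) so the token is appended to the slot's cached text only after any streamed partial response goes out. A small standalone sketch of that ordering follows; Token, Slot, and handle_token are hypothetical stand-ins, not the server's types.

#include <iostream>
#include <string>

struct Token {
    std::string text;
};

struct Slot {
    bool stream = true;
    std::string generated_text;                  // the slot's cached output so far
    void add_token_string(const Token &t) { generated_text += t.text; }
};

static void send_partial_response(const Slot &, const Token &t) {
    std::cout << "partial: " << t.text << "\n";  // stand-in for the streamed chunk
}

// Mirrors the ordering visible in the patch: stream the partial response first,
// then append the token to the slot's cache, then check whether to keep going.
static void handle_token(Slot &slot, const Token &t, bool incomplete) {
    if (slot.stream) {
        send_partial_response(slot, t);
    }
    slot.add_token_string(t);
    if (incomplete) {
        // the real server sets slot.has_next_token = true here; nothing to do in this sketch
    }
}

int main() {
    Slot slot;
    handle_token(slot, Token{"Hello"}, true);
    handle_token(slot, Token{", world"}, false);
    std::cout << "cached: " << slot.generated_text << "\n";
    return 0;
}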
llm/patches/02-cudaleaks.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index b14cca61..02bfd4b1 100644
+index 8fe5e0b1..53bf39c1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -29,6 +29,10 @@
+@@ -31,6 +31,10 @@
- #include <atomic>
  #include <signal.h>
+ #include <memory>
 +#ifdef GGML_USE_CUBLAS
 +extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -12,8 +12,8 @@ index b14cca61..02bfd4b1 100644
 +
  using json = nlohmann::json;
- bool server_verbose = false;
+ struct server_params {
-@@ -664,6 +668,10 @@ struct server_context {
+@@ -363,6 +367,10 @@ struct llama_server_context
      llama_free_model(model);
      model = nullptr;
  }
@@ -23,8 +23,8 @@ index b14cca61..02bfd4b1 100644
 +#endif
  }
- bool load_model(const gpt_params &params_) {
+ bool load_model(const gpt_params &params_)
-@@ -3499,6 +3507,7 @@ int main(int argc, char **argv) {
+@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
  sigemptyset (&sigint_action.sa_mask);
  sigint_action.sa_flags = 0;
  sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index b14cca61..02bfd4b1 100644
  auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
      return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index c207ff87..945708a4 100644
+index 72bcec8c..6c934e8c 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -46,6 +46,7 @@
+@@ -43,6 +43,7 @@
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -44,7 +44,7 @@ index c207ff87..945708a4 100644
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
 #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
@@ -58,7 +58,7 @@ index c207ff87..945708a4 100644
 #ifdef __HIP_PLATFORM_AMD__
 // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
 #endif
     if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
@@ -67,7 +67,7 @@ index c207ff87..945708a4 100644
         g_cublas_loaded = false;
         fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
         return;
-@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
     // configure logging to stdout
     // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
@@ -76,7 +76,7 @@ index c207ff87..945708a4 100644
         g_cublas_loaded = true;
     }
 }
-@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
     }
     return device_count;
 }
@@ -100,6 +100,7 @@ index c207ff87..945708a4 100644
 +
 +    g_cublas_initialized = false;
 +}
+\ No newline at end of file
 diff --git a/ggml-cuda.h b/ggml-cuda.h
 index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
...
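Note: this patch exposes ggml_free_cublas as an extern "C" symbol and, in its teardown code, resets g_cublas_initialized = false (visible in the @@ -100 hunk above). For context, a self-contained sketch of that guard-flag init/free pattern follows; g_initialized, g_handle, example_init, and example_free are hypothetical stand-ins, not ggml's globals or API.

#include <iostream>

static bool g_initialized = false;
static int *g_handle      = nullptr;

extern "C" void example_init(void) {
    if (g_initialized) {
        return;                        // idempotent: repeated init is a no-op
    }
    g_handle      = new int(0);        // stand-in for allocating the cuBLAS handles
    g_initialized = true;
}

extern "C" void example_free(void) {
    if (!g_initialized) {
        return;                        // nothing to release
    }
    delete g_handle;                   // stand-in for releasing the handles
    g_handle      = nullptr;
    g_initialized = false;             // mirrors the "g_cublas_initialized = false;" reset above
}

int main() {
    example_init();
    example_free();
    std::cout << "freed: " << std::boolalpha << !g_initialized << "\n";
    return 0;
}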
llm/patches/03-locale.diff (deleted, 100644 → 0)

The removed patch contained:

diff --git a/llama.cpp b/llama.cpp
index b19616e8..519b9602 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
     }
 
     uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
+        static const std::locale locale("");
 #if defined(_WIN32)
         if (code > 0xFFFF) {
             return code;
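Note: the deleted patch had swapped the hard-coded "en_US.UTF-8" locale for the environment default (""). As background, a standalone C++ snippet showing the difference: constructing a named std::locale throws std::runtime_error when that locale is not installed on the host, while std::locale("") selects the process environment's default.

#include <iostream>
#include <locale>
#include <stdexcept>

int main() {
    try {
        std::locale named("en_US.UTF-8");    // throws if this locale isn't installed
        std::cout << "named locale: " << named.name() << "\n";
    } catch (const std::runtime_error &e) {
        std::cout << "en_US.UTF-8 unavailable: " << e.what() << "\n";
    }

    std::locale fallback("");                // environment default, as in the removed patch
    std::cout << "default locale: " << fallback.name() << "\n";
    return 0;
}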