OpenDAS / ollama · Commits

Commit 369eda65 (Unverified)
update llama.cpp submodule to `ceca1ae` (#3064)
Authored Mar 11, 2024 by Jeffrey Morgan; committed by GitHub on Mar 11, 2024
Parent: f878e910
Showing 6 changed files with 36 additions and 61 deletions.

  llm/ext_server/ext_server.cpp   +13 -13
  llm/generate/gen_darwin.sh       +0 -15
  llm/llama.cpp                    +1  -1
  llm/patches/01-cache.diff        +8  -6
  llm/patches/02-cudaleaks.diff   +14 -13
  llm/patches/03-locale.diff       +0 -13
llm/ext_server/ext_server.cpp

@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-server_context *llama = NULL;
+llama_server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;

@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new server_context;
+    llama = new llama_server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;

@@ -125,7 +125,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
       return;
     }
-    llama->init();
+    llama->initialize();
   } catch (std::exception &e) {
     err->id = -1;
     snprintf(err->msg, err->msg_len, "exception %s", e.what());

@@ -144,13 +144,13 @@ void llama_server_start() {
       LOG_TEE("llama server main loop starting\n");
       ggml_time_init();
       llama->queue_tasks.on_new_task(std::bind(
-          &server_context::process_single_task, llama, std::placeholders::_1));
+          &llama_server_context::process_single_task, llama, std::placeholders::_1));
       llama->queue_tasks.on_finish_multitask(std::bind(
-          &server_context::on_finish_multitask, llama, std::placeholders::_1));
+          &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
       llama->queue_tasks.on_run_slots(std::bind(
-          &server_context::update_slots, llama));
+          &llama_server_context::update_slots, llama));
       llama->queue_results.on_multitask_update(std::bind(
-          &server_queue::update_multitask, &
+          &llama_server_queue::update_multitask, &
           llama->queue_tasks, std::placeholders::_1, std::placeholders::_2,

@@ -198,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, -1, data, false, false);
+    llama->request_completion(resp->id, data, false, false, -1);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {

@@ -216,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    server_task_result result = llama->queue_results.recv(task_id);
+    task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;

@@ -363,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
   }
   const int task_id = llama->queue_tasks.get_new_id();
   llama->queue_results.add_waiting_task_id(task_id);
-  llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
+  llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
   atomicRecv ar(recv_counter);
-  server_task_result result = llama->queue_results.recv(task_id);
-  std::string result_json = result.data.dump();
+  task_result result = llama->queue_results.recv(task_id);
+  std::string result_json = result.result_json.dump();
   const std::string::size_type size = result_json.size() + 1;
   *json_resp = new char[size];
   snprintf(*json_resp, size, "%s", result_json.c_str());
llm/generate/gen_darwin.sh

@@ -18,19 +18,6 @@ sign() {
     fi
 }

-# bundle_metal bundles ggml-common.h and ggml-metal.metal into a single file
-bundle_metal() {
-    grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
-    echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
-    cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
-    cat "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
-    rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
-}
-
-cleanup_metal() {
-    (cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
-}
-
 COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"

 case "${GOARCH}" in

@@ -76,11 +63,9 @@ case "${GOARCH}" in
         CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
         BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
-        bundle_metal
         build
         sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
         compress_libs
-        cleanup_metal
         ;;
     *)
         echo "GOARCH must be set"
llm/llama.cpp @ ceca1aef (compare 77d1ac7e...ceca1aef)

-Subproject commit 77d1ac7e00bf049b9f2bba1b5a310a78318c49c4
+Subproject commit ceca1aef0738b57951cd12c603c3477e75312dec
llm/patches/01-cache.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index f255ad76..914ecfdd 100644
+index 8fe5e0b1..3e82acb9 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1101,12 +1101,13 @@ struct server_context {
+@@ -997,13 +997,15 @@ struct llama_server_context
              slot.n_sent_text += result.text_to_send.size();
              // add the token to slot queue and cache
          }
 -        slot.add_token_string(result);
-         if (slot.params.stream) {
++
+         if (slot.params.stream)
+         {
              send_partial_response(slot, result);
          }
      }
 +    slot.add_token_string(result);
 +
-     if (incomplete) {
+     if (incomplete)
+     {
          slot.has_next_token = true;
      }
llm/patches/02-cudaleaks.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index b14cca61..02bfd4b1 100644
+index 8fe5e0b1..53bf39c1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -29,6 +29,10 @@
+@@ -31,6 +31,10 @@
  #include <atomic>
  #include <signal.h>
  #include <memory>
 +#ifdef GGML_USE_CUBLAS
 +extern "C" GGML_CALL void ggml_free_cublas(void);

@@ -12,8 +12,8 @@ index b14cca61..02bfd4b1 100644
 +
  using json = nlohmann::json;
  bool server_verbose = false;
-@@ -664,6 +668,10 @@ struct server_context {
-     struct server_params {
+@@ -363,6 +367,10 @@ struct llama_server_context
          llama_free_model(model);
          model = nullptr;
      }

@@ -23,8 +23,8 @@ index b14cca61..02bfd4b1 100644
 +#endif
  }
-     bool load_model(const gpt_params &params_) {
-@@ -3499,6 +3507,7 @@ int main(int argc, char **argv) {
+     bool load_model(const gpt_params &params_)
+@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
      sigemptyset (&sigint_action.sa_mask);
      sigint_action.sa_flags = 0;
      sigaction(SIGINT, &sigint_action, NULL);

@@ -33,10 +33,10 @@ index b14cca61..02bfd4b1 100644
      auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
          return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index c207ff87..945708a4 100644
+index 72bcec8c..6c934e8c 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -46,6 +46,7 @@
+@@ -43,6 +43,7 @@
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
  #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
  #define cublasCreate hipblasCreate

@@ -44,7 +44,7 @@ index c207ff87..945708a4 100644
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }

@@ -58,7 +58,7 @@ index c207ff87..945708a4 100644
  #ifdef __HIP_PLATFORM_AMD__
  // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
      if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {

@@ -67,7 +67,7 @@ index c207ff87..945708a4 100644
          g_cublas_loaded = false;
          fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
          return;
-@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
          // configure logging to stdout
          // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

@@ -76,7 +76,7 @@ index c207ff87..945708a4 100644
          g_cublas_loaded = true;
      }
  }
-@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
      }
      return device_count;
  }

@@ -100,6 +100,7 @@ index c207ff87..945708a4 100644
 +
 +    g_cublas_initialized = false;
 +}
 \ No newline at end of file
 diff --git a/ggml-cuda.h b/ggml-cuda.h
 index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
llm/patches/03-locale.diff (deleted, 100644 → 0)

-diff --git a/llama.cpp b/llama.cpp
-index b19616e8..519b9602 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -9938,7 +9938,7 @@ struct llm_tokenizer_wpm {
-     }
-     uint32_t to_lower(uint32_t code) {
--        static const std::locale locale("en_US.UTF-8");
-+        static const std::locale locale("");
- #if defined(_WIN32)
-         if (code > 0xFFFF) {
-             return code;