ollama · Commit 21347e1e (unverified)
Authored Mar 01, 2024 by Jeffrey Morgan; committed by GitHub on Mar 01, 2024
update llama.cpp submodule to `c29af7e` (#2868)
parent 3b4bab3d
Showing 4 changed files with 24 additions and 25 deletions (+24 -25):

  llm/ext_server/ext_server.cpp   +3  -3
  llm/llama.cpp                   +1  -1
  llm/patches/01-cache.diff       +3  -3
  llm/patches/02-cudaleaks.diff   +17 -18
llm/ext_server/ext_server.cpp
@@ -146,9 +146,9 @@ void llama_server_start() {
     llama->queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, llama, std::placeholders::_1));
     llama->queue_tasks.on_finish_multitask(std::bind(
         &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
-    llama->queue_tasks.on_all_tasks_finished(std::bind(
-        &llama_server_context::run_on_all_tasks_finished, llama));
+    llama->queue_tasks.on_run_slots(std::bind(
+        &llama_server_context::update_slots, llama));
     llama->queue_results.on_multitask_update(std::bind(
         &llama_server_queue::update_multitask, &llama->queue_tasks,
...
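A note on the pattern in this hunk: std::bind fixes a member function to an object pointer (with std::placeholders for arguments supplied later), producing a plain callable the queue can store and invoke. Below is a minimal, self-contained sketch of the same registration shape; it is illustrative only, and the names task, task_queue, and server_context are hypothetical stand-ins, not ollama code.

#include <functional>
#include <cstdio>

struct task { int id; };

// Hypothetical stand-in for the task queue: it stores a callback
// and invokes it for each task pushed.
struct task_queue {
    std::function<void(task &)> on_new_task_cb;
    void on_new_task(std::function<void(task &)> cb) { on_new_task_cb = std::move(cb); }
    void push(task t) { if (on_new_task_cb) on_new_task_cb(t); }
};

// Hypothetical stand-in for llama_server_context.
struct server_context {
    void process_single_task(task &t) { std::printf("processing task %d\n", t.id); }
};

int main() {
    server_context ctx;
    task_queue q;
    // Same shape as the hunk above: bind a member function to an instance,
    // leaving the task argument open as std::placeholders::_1.
    q.on_new_task(std::bind(&server_context::process_single_task,
                            &ctx, std::placeholders::_1));
    q.push({42});
    return 0;
}

A capturing lambda would do the same job; the server code shown here used std::bind throughout at the time.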
llm/llama.cpp @ c29af7e2 (compare 87c91c07...c29af7e2)

-Subproject commit 87c91c07663b707e831c59ec373b5e665ff9d64a
+Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160
llm/patches/01-cache.diff

 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..2694e92e 100644
+index 2b2f4a0f..afac49af 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -901,13 +901,15 @@ struct llama_server_context
+@@ -997,13 +997,15 @@ struct llama_server_context
-             slot.sent_count += result.text_to_send.size();
+             slot.n_sent_text += result.text_to_send.size();
              // add the token to slot queue and cache
          }
 -        slot.add_token_string(result);
...
llm/patches/02-cudaleaks.diff
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 7800c6e7..be30db23 100644
+index 2b2f4a0f..25857bdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -30,6 +30,10 @@
+@@ -31,6 +31,10 @@
  #include <atomic>
  #include <signal.h>
...
@@ -12,8 +12,8 @@ index 7800c6e7..be30db23 100644
 +
 using json = nlohmann::json;
 struct server_params
 {
-@@ -353,6 +357,9 @@ struct llama_server_context
+@@ -363,6 +367,9 @@ struct llama_server_context
      llama_free_model(model);
      model = nullptr;
  }
...
@@ -23,7 +23,7 @@ index 7800c6e7..be30db23 100644
  }
 bool load_model(const gpt_params &params_)
-@@ -3143,6 +3150,7 @@ int main(int argc, char **argv)
+@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
      sigemptyset (&sigint_action.sa_mask);
      sigint_action.sa_flags = 0;
      sigaction(SIGINT, &sigint_action, NULL);
...
@@ -32,10 +32,10 @@ index 7800c6e7..be30db23 100644
      auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
          return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 933ebbc4..88a4f664 100644
+index 0c6501e9..75c12723 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -39,6 +39,7 @@
+@@ -43,6 +43,7 @@
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
  #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
  #define cublasCreate hipblasCreate
...
@@ -43,7 +43,7 @@ index 933ebbc4..88a4f664 100644
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }
...
@@ -57,7 +57,7 @@ index 933ebbc4..88a4f664 100644
  #ifdef __HIP_PLATFORM_AMD__
      // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
      if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
...
@@ -66,7 +66,7 @@ index 933ebbc4..88a4f664 100644
      g_cublas_loaded = false;
      fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
      return;
-@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
      // configure logging to stdout
      // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
...
@@ -75,12 +75,11 @@ index 933ebbc4..88a4f664 100644
      g_cublas_loaded = true;
  }
 }
-@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
  }
  return device_count;
 }
 +
-+
 +extern "C" GGML_CALL void ggml_free_cublas(void);
 +GGML_CALL void ggml_free_cublas(void) {
 +    for (int id = 0; id < g_device_count; ++id) {
...
@@ -100,16 +99,16 @@ index 933ebbc4..88a4f664 100644
 +    g_cublas_initialized = false;
 +}
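The body of ggml_free_cublas is truncated above, but the visible fragments (a per-device loop ending in g_cublas_initialized = false;) indicate a flag-guarded init/teardown pattern. The sketch below illustrates that pattern in plain C++ with stand-in handles instead of real cuBLAS calls; it is an assumption-laden illustration, not the patch's actual body.

#include <cstdio>

static const int MAX_DEVICES = 16;
static bool g_initialized   = false;
static int  g_device_count  = 0;
static void *g_handles[MAX_DEVICES]; // stand-in for per-device cublasHandle_t

void fake_init(void) {
    if (g_initialized) return;       // idempotent: second call is a no-op
    g_device_count = 2;              // pretend we found two devices
    for (int id = 0; id < g_device_count; ++id) {
        g_handles[id] = new int(id); // stand-in for cublasCreate(&handle)
    }
    g_initialized = true;
}

void fake_free(void) {
    if (!g_initialized) return;      // safe to call before init, or twice
    for (int id = 0; id < g_device_count; ++id) {
        delete static_cast<int *>(g_handles[id]); // stand-in for cublasDestroy
        g_handles[id] = nullptr;
    }
    g_initialized = false;           // mirrors `g_cublas_initialized = false;`
}

int main() {
    fake_init();
    fake_free();
    fake_free();                     // no double-free thanks to the flag
    std::printf("clean shutdown\n");
    return 0;
}

The flag is what makes the teardown safe to call from any shutdown path without tracking whether initialization ever happened.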
 diff --git a/ggml-cuda.h b/ggml-cuda.h
-index b1ebd61d..b4c80c2c 100644
+index b1ebd61d..6dd58ddf 100644
 --- a/ggml-cuda.h
 +++ b/ggml-cuda.h
-@@ -20,6 +20,9 @@ extern "C" {
+@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void);
- // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
- GGML_API GGML_CALL void ggml_init_cublas(void);
+ // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+ GGML_API GGML_CALL bool ggml_cublas_loaded(void);
 +// Release CUDA resources
 +GGML_API GGML_CALL void ggml_free_cublas(void);
 +
- // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
- GGML_API GGML_CALL bool ggml_cublas_loaded(void);
+ GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+ GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
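Taken together, the patched header exposes a three-call lifecycle: init, check, free. The sketch below shows how a caller might use it; the prototypes are copied from the hunk above with the GGML_API/GGML_CALL macros elided, and linking would require a ggml build that actually carries this patch (ggml_free_cublas does not exist upstream).

#include <cstdio>

// Prototypes as declared in the patched ggml-cuda.h above
// (GGML_API / GGML_CALL macros elided for brevity).
extern "C" {
void ggml_init_cublas(void);
bool ggml_cublas_loaded(void);
void ggml_free_cublas(void);
}

int main() {
    ggml_init_cublas();            // per the header comment: always succeeds
    if (ggml_cublas_loaded()) {    // true only if CUDA devices were found
        // ... run GPU work ...
        ggml_free_cublas();        // added by this patch: release CUDA resources
    } else {
        std::printf("CUDA unavailable\n");
    }
    return 0;
}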