OpenDAS / ollama / commit 9adca7f7
Authored Dec 14, 2023 by Daniel Hiltgen

Bump llama.cpp to b1662 and set n_parallel=1

Parent: 89bbaafa
Showing 3 changed files, with 9 additions and 9 deletions:

  llm/ext_server.go                                                 +1 -1
  llm/llama.cpp/gguf                                                +1 -1
  llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch   +7 -7
llm/ext_server.go

@@ -160,7 +160,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
 	sparams.n_batch = C.uint(opts.NumBatch)
 	sparams.n_gpu_layers = C.int(numGPU)
 	sparams.main_gpu = C.int(opts.MainGPU)
-	sparams.n_parallel = 2 // TODO - wire up concurrency
+	sparams.n_parallel = 1 // TODO - wire up concurrency
 
 	// Always use the value encoded in the model
 	sparams.rope_freq_base = 0.0
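For context on the one-line change above: n_parallel sets how many request slots the embedded llama.cpp server allocates, and (an assumption drawn from the server code of that era, not from this commit) the model's context window is divided evenly across slots, so extra slots shrink the context available to each request. A minimal C++ sketch of that arithmetic, illustration only:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed b16xx-era server behavior: each slot receives
    // n_ctx / n_parallel tokens of context. With n_parallel = 1,
    // a single request keeps the whole window, which matches the
    // intent of this commit (only one request is served at a time).
    const int32_t n_ctx = 2048;
    const int32_t slot_counts[] = {1, 2, 4};
    for (int32_t n_parallel : slot_counts) {
        std::printf("n_parallel=%d -> per-slot context %d\n",
                    n_parallel, n_ctx / n_parallel);
    }
    return 0;
}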
llm/llama.cpp/gguf @ 328b83de (compare a7aee47b...328b83de)

-Subproject commit a7aee47b98e45539d491071b25778b833b77e387
+Subproject commit 328b83de23b33240e28f4e74900d1d06726f5eb1
llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch

@@ -1,4 +1,4 @@
-From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001
+From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001
 From: Daniel Hiltgen <daniel@ollama.com>
 Date: Mon, 13 Nov 2023 12:25:58 -0800
 Subject: [PATCH] Expose callable API for server
@@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
 +endif()
 \ No newline at end of file
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d0cd8e1..5f5d4c5 100644
+index 0403853..2084fd8 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 @@ -5,6 +5,9 @@
@@ -59,15 +59,15 @@ index d0cd8e1..5f5d4c5 100644
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
-@@ -2632,6 +2635,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
+@@ -2643,6 +2646,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }
 }
 +#ifndef LLAMA_SERVER_LIBRARY
 int main(int argc, char **argv)
 {
-// own arguments required by this example
+#if SERVER_VERBOSE != 1
-@@ -3066,3 +3070,273 @@ int main(int argc, char **argv)
+@@ -3123,3 +3127,273 @@ int main(int argc, char **argv)
 llama_backend_free();
 return 0;
 }
@@ -439,10 +439,10 @@ index 0000000..d22f1b6
 +#endif // LLAMA_SERVER_LIBRARY
 \ No newline at end of file
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 9e1acd3..ea64b55 100644
+index f20846f..9640cf3 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
+@@ -6757,6 +6757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 CUDA_CHECK(cudaGetDevice(&id));
 src_ptr = (char *) extra->data_device[id];
 } else {
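The renumbered hunks above just track upstream line drift in server.cpp and ggml-cuda.cu between the two llama.cpp revisions; the substance of the carried patch is unchanged. Its core trick, visible in the +#ifndef LLAMA_SERVER_LIBRARY / +#endif lines, is to fence off the server's main() so server.cpp can be compiled into a callable library (ollama's ext_server) rather than a standalone binary. A minimal sketch of that guard pattern, with a stand-in body rather than the real server code:

// When LLAMA_SERVER_LIBRARY is defined, main() is compiled out and the
// embedding program supplies its own entry point; otherwise the file
// still builds as a normal standalone executable.
#ifndef LLAMA_SERVER_LIBRARY
int main(int argc, char **argv) {
    (void)argc; // stand-in body; the real server parses args and serves here
    (void)argv;
    return 0;
}
#endif // LLAMA_SERVER_LIBRARY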