Commit 325d7498 authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Fix CPU performance on hyperthreaded systems

The default thread count logic was broken and resulted in 2x the number
of threads it should have used on a hyperthreaded CPU,
resulting in thrashing and poor performance.
parent d9cd3d96
......@@ -37,7 +37,6 @@ import (
"fmt"
"log"
"os"
"runtime"
"strings"
"sync"
"time"
......@@ -185,11 +184,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
sparams.mmproj = nil
}
if opts.NumThread > 0 {
sparams.n_threads = C.uint(opts.NumThread)
} else {
sparams.n_threads = C.uint(runtime.NumCPU())
}
sparams.n_threads = C.uint(opts.NumThread)
log.Printf("Initializing internal llama server")
resp := newExtServerResp(128)
......
From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001
From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Mon, 13 Nov 2023 12:25:58 -0800
Subject: [PATCH] Expose callable API for server
......@@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server
This adds an extern "C" interface within the example server
---
examples/server/CMakeLists.txt | 24 +++
examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++
examples/server/server.cpp | 276 +++++++++++++++++++++++++++++++++
examples/server/server.h | 89 +++++++++++
ggml-cuda.cu | 1 +
4 files changed, 388 insertions(+)
4 files changed, 390 insertions(+)
create mode 100644 examples/server/server.h
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
......@@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
+endif()
\ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0403853..2084fd8 100644
index 0403853..065420c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -5,6 +5,9 @@
......@@ -67,7 +67,7 @@ index 0403853..2084fd8 100644
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3123,3 +3127,273 @@ int main(int argc, char **argv)
@@ -3123,3 +3127,275 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
......@@ -89,7 +89,9 @@ index 0403853..2084fd8 100644
+ gpt_params params;
+ params.n_ctx = sparams->n_ctx;
+ params.n_batch = sparams->n_batch;
+ params.n_threads = sparams->n_threads;
+ if (sparams->n_threads > 0) {
+ params.n_threads = sparams->n_threads;
+ }
+ params.n_parallel = sparams->n_parallel;
+ params.rope_freq_base = sparams->rope_freq_base;
+ params.rope_freq_scale = sparams->rope_freq_scale;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment