Commit 325d7498 authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Fix CPU performance on hyperthreaded systems

The default thread count logic was broken and resulted in 2x the number
of threads it should have used on a hyperthreaded CPU,
resulting in thrashing and poor performance.
parent d9cd3d96
......@@ -37,7 +37,6 @@ import (
"fmt"
"log"
"os"
"runtime"
"strings"
"sync"
"time"
......@@ -185,11 +184,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
sparams.mmproj = nil
}
if opts.NumThread > 0 {
sparams.n_threads = C.uint(opts.NumThread)
} else {
sparams.n_threads = C.uint(runtime.NumCPU())
}
sparams.n_threads = C.uint(opts.NumThread)
log.Printf("Initializing internal llama server")
resp := newExtServerResp(128)
......
From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001
From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Mon, 13 Nov 2023 12:25:58 -0800
Subject: [PATCH] Expose callable API for server
......@@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server
This adds an extern "C" interface within the example server
---
examples/server/CMakeLists.txt | 24 +++
examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++
examples/server/server.cpp | 276 +++++++++++++++++++++++++++++++++
examples/server/server.h | 89 +++++++++++
ggml-cuda.cu | 1 +
4 files changed, 388 insertions(+)
4 files changed, 390 insertions(+)
create mode 100644 examples/server/server.h
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
......@@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
+endif()
\ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0403853..2084fd8 100644
index 0403853..065420c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -5,6 +5,9 @@
......@@ -67,7 +67,7 @@ index 0403853..2084fd8 100644
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3123,3 +3127,273 @@ int main(int argc, char **argv)
@@ -3123,3 +3127,275 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
......@@ -89,7 +89,9 @@ index 0403853..2084fd8 100644
+ gpt_params params;
+ params.n_ctx = sparams->n_ctx;
+ params.n_batch = sparams->n_batch;
+ params.n_threads = sparams->n_threads;
+ if (sparams->n_threads > 0) {
+ params.n_threads = sparams->n_threads;
+ }
+ params.n_parallel = sparams->n_parallel;
+ params.rope_freq_base = sparams->rope_freq_base;
+ params.rope_freq_scale = sparams->rope_freq_scale;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment