Commit 58d95cc9 authored by Daniel Hiltgen's avatar Daniel Hiltgen

Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by allowing
us to isolate llama.cpp in a separate process, shut it down when idle, and
gracefully restart it if it has problems. It also serves as a first step toward
running multiple copies to support multiple models concurrently.
parent 3b6a9154
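At a high level, the change replaces in-process (cgo) inference with a llama.cpp HTTP server run as a child process: ollama picks a free port, launches the bundled ollama_llama_server binary, polls its /health endpoint until the model is loaded, and simply kills the process to unload. The sketch below illustrates that lifecycle in isolation; it is a simplified stand-in rather than the code in this commit (the real implementation is NewLlamaServer further down), and it assumes a runner binary named ollama_llama_server is on the PATH.

package main

import (
	"fmt"
	"net"
	"net/http"
	"os/exec"
	"time"
)

func main() {
	// Pick a free port the same way NewLlamaServer does: bind to :0, read the port, close.
	l, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		panic(err)
	}
	port := l.Addr().(*net.TCPAddr).Port
	l.Close()

	// Start the runner as a child process (assumed binary name; the flag mirrors the commit).
	cmd := exec.Command("ollama_llama_server", "--port", fmt.Sprintf("%d", port))
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	go cmd.Wait() // reap the subprocess when it exits

	// Poll the health endpoint until the model is loaded or we give up.
	healthURL := fmt.Sprintf("http://127.0.0.1:%d/health", port)
	deadline := time.Now().Add(3 * time.Minute)
	for time.Now().Before(deadline) {
		resp, err := http.Get(healthURL)
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				break
			}
		}
		time.Sleep(50 * time.Millisecond)
	}

	// ... completions, tokenization and embeddings are plain HTTP requests from here ...

	// Unloading (or recovering from a crash) is just a matter of killing the child process.
	_ = cmd.Process.Kill()
}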
package llm

// #cgo CFLAGS: -Illama.cpp
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include "llama.h"
import "C"

// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
	return C.GoString(C.llama_print_system_info())
}

package llm

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"slices"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)
type LLM interface {
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
}
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
usedMemory += graph
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
requiredMemory := usedMemory
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
return newLlmServer(info, model, adapters, projectors, opts)
}
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
dynLibs = []string{libPath}
}
}
// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
if err != nil {
return nil, err
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
err2 = err
}
return nil, err2
}
@@ -4,5 +4,5 @@ import (
 	"embed"
 )
 
-//go:embed llama.cpp/build/linux/*/*/lib/*
+//go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS
@@ -4,5 +4,5 @@ import (
 	"embed"
 )
 
-//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
+//go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS
package llm

import (
	"embed"
)

//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
var libEmbed embed.FS

package llm

import "embed"

//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS
package llm
import "embed"
//go:embed build/windows/*/*/bin/*
var libEmbed embed.FS
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
var variants []string
for v := range availableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func availableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
}
return servers
}
// serversForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
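// For example, with payloads {"rocm_v6", "rocm_v5", "cpu_avx2", "cpu"} and a GPU
// reporting Library "rocm" / Variant "v6", this returns ["rocm_v6", "rocm_v5", "cpu_avx2"],
// assuming GetCPUVariant reports "avx2" (hypothetical values, shown for illustration only)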
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
return servers
}
// extractFiles extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
// Libraries names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 7
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
// TODO - temporary until we have multiple CPU variations for Darwin
// Short circuit on darwin with metal only
if len(availableDynLibs) == 1 {
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
return []string{availableDynLibs["metal"]}
}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if errors.Is(err, payloadMissing) {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
var mu sync.Mutex
var libs []string
var g errgroup.Group
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
continue
}
file := file
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
return libs, nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS
package llm
import (
"testing"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}
package llm
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
// LlamaServer is an instance of the llama.cpp server
type LlamaServer struct {
port int
cmd *exec.Cmd
done chan error // Channel to signal when the process exits
status *StatusWriter
options *api.Options
}
var cpuOnlyFamilies = []string{
"mamba",
}
func NewLlamaServer(model string, adapters, projectors []string, opts *api.Options) (*LlamaServer, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
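// For intuition, a worked example with hypothetical values (not taken from this diff):
// n_ctx=2048, n_layer=32, n_embd=4096, n_head=n_head_kv=32 gives
// kv = 2*2*2048*32*4096/32*32 = 1 GiB, and with GQA = 32/32 = 1 the graph
// estimate above is kv/6, roughly 171 MiB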
usedMemory += graph
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
requiredMemory := usedMemory
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
if len(adapters) > 1 {
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
}
availableServers := availableServers()
servers := serversForGpu(info)
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
servers = []string{demandLib}
}
}
if len(servers) == 0 {
return nil, fmt.Errorf("no servers found for %v", info)
}
params := []string{
"--model", model,
"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
"--embedding",
}
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--log-format", "json")
} else {
params = append(params, "--log-disable")
}
if opts.NumGPU > 0 {
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
params = append(params, "--verbose")
}
if opts.MainGPU > 0 {
params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
}
if opts.RopeFrequencyBase > 0 {
params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase))
}
if opts.RopeFrequencyScale > 0 {
params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale))
}
if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet
params = append(params, "--lora", adapters[0])
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
params = append(params, "--mmproj", projectors[0])
}
if opts.NumThread > 0 {
params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
}
if !opts.F16KV {
params = append(params, "--memory-f32")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
if !opts.UseMMap {
params = append(params, "--no-mmap")
}
if opts.UseNUMA {
params = append(params, "--numa")
}
// Loop through potential servers
var finalErr error
for i := 0; i < len(servers); i++ {
dir := availableServers[servers[i]]
// Find an available port, retrying on each iteration in case the failure was a port conflict race
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed ", "error", err)
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
finalParams := append(params, "--port", strconv.Itoa(port))
pathEnv := "LD_LIBRARY_PATH"
if runtime.GOOS == "windows" {
pathEnv = "PATH"
}
// append the server directory to LD_LIBRARY_PATH/PATH
libraryPaths := []string{dir}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// Append our runner directory to the path
// This will favor system libraries over our bundled library dependencies
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
}
server := filepath.Join(dir, "ollama_llama_server")
if runtime.GOOS == "windows" {
server = server + ".exe"
}
s := &LlamaServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
}
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
slog.Debug(libEnv)
s.cmd.Env = append(os.Environ(), libEnv)
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
slog.Info("starting llama server", "cmd", s.cmd.String())
if err = s.cmd.Start(); err != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
finalErr = err
continue
}
// reap subprocess when it exits
go func() {
// Exit status managed via getServerStatus
_ = s.cmd.Wait()
}()
if err = s.waitUntilRunning(); err != nil {
slog.Error("error starting llama server", "server", servers[i], "error", err)
s.Close()
finalErr = err
continue
}
return s, nil
}
slog.Error("unable to load any llama server", "error", finalErr)
return nil, finalErr
}
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
type ServerStatus int
const ( // iota is reset to 0
ServerStatusReady ServerStatus = iota
ServerStatusNoSlotsAvaialble
ServerStatusLoadingModel
ServerStatusNotResponding
ServerStatusError
)
type ServerStatusResp struct {
Status string `json:"status"`
SlotsIdle int `json:"slots_idle"`
SlotsProcessing int `json:"slots_processing"`
Error string `json:"error"`
}
func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
// Fail fast if it's exited
if s.cmd.ProcessState != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/health", s.port), nil)
if err != nil {
return ServerStatusError, fmt.Errorf("error creating GET request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, fmt.Errorf("server not responding")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return ServerStatusError, fmt.Errorf("read health request: %w", err)
}
var status ServerStatusResp
if err := json.Unmarshal(body, &status); err != nil {
return ServerStatusError, fmt.Errorf("health unmarshal encode response: %w", err)
}
switch status.Status {
case "ok":
return ServerStatusReady, nil
case "no slot available":
return ServerStatusNoSlotsAvaialble, nil
case "loading model":
return ServerStatusLoadingModel, nil
default:
return ServerStatusError, fmt.Errorf("server error: %+v", status)
}
}
func (s *LlamaServer) Ping(ctx context.Context) error {
_, err := s.getServerStatus(ctx)
if err != nil {
slog.Debug("server unhealthy", "error", err)
return err
}
return nil
}
func (s *LlamaServer) waitUntilRunning() error {
start := time.Now()
expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load
ticker := time.NewTicker(50 * time.Millisecond)
defer ticker.Stop()
slog.Info("waiting for llama runner to start responding")
var lastStatus ServerStatus = -1
for {
select {
case err := <-s.done:
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
case <-ticker.C:
if time.Now().After(expiresAt) {
// timeout
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
}
if s.cmd.ProcessState != nil {
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
}
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
status, err := s.getServerStatus(ctx)
if err != nil && lastStatus != status {
slog.Debug("server not yet available", "error", err)
lastStatus = status
continue
}
switch status {
case ServerStatusLoadingModel:
// TODO - this state never seems to happen with the current server.cpp code (bug?)
// it doesn't respond to the health endpoint until after the model is loaded
slog.Debug("loading model")
case ServerStatusReady:
slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
return nil
}
}
}
}
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
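// Note: since root ::= object, the grammar above only admits a top-level JSON object,
// e.g. {"answer": [1, 2.5, "x", true]}. Completion below attaches it as "grammar"
// when a request asks for format "json".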
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 3
type ImageData struct {
Data []byte `json:"data"`
ID int `json:"id"`
}
type completion struct {
Content string `json:"content"`
Model string `json:"model"`
Prompt string `json:"prompt"`
Stop bool `json:"stop"`
Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}
}
type CompletionRequest struct {
Prompt string
Format string
Images []ImageData
Options api.Options
}
type CompletionResponse struct {
Content string
Done bool
PromptEvalCount int
PromptEvalDuration time.Duration
EvalCount int
EvalDuration time.Duration
}
func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
request := map[string]any{
"prompt": req.Prompt,
"stream": true,
"n_predict": req.Options.NumPredict,
"n_keep": req.Options.NumKeep,
"main_gpu": req.Options.MainGPU,
"temperature": req.Options.Temperature,
"top_k": req.Options.TopK,
"top_p": req.Options.TopP,
"tfs_z": req.Options.TFSZ,
"typical_p": req.Options.TypicalP,
"repeat_last_n": req.Options.RepeatLastN,
"repeat_penalty": req.Options.RepeatPenalty,
"presence_penalty": req.Options.PresencePenalty,
"frequency_penalty": req.Options.FrequencyPenalty,
"mirostat": req.Options.Mirostat,
"mirostat_tau": req.Options.MirostatTau,
"mirostat_eta": req.Options.MirostatEta,
"penalize_nl": req.Options.PenalizeNewline,
"seed": req.Options.Seed,
"stop": req.Options.Stop,
"image_data": req.Images,
"cache_prompt": true,
}
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return err
} else if status != ServerStatusReady {
return fmt.Errorf("unexpected server status: %d", status)
}
if req.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(req.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %v", err)
}
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
if err != nil {
return fmt.Errorf("error creating POST request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("POST predict: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("failed reading llm error response: %w", err)
}
log.Printf("llm predict error: %s", bodyBytes)
return fmt.Errorf("%s", bodyBytes)
}
scanner := bufio.NewScanner(resp.Body)
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
for scanner.Scan() {
select {
case <-ctx.Done():
// This handles the request cancellation
return ctx.Err()
default:
line := scanner.Bytes()
if len(line) == 0 {
continue
}
// try again on slot unavailable
if bytes.Contains(line, []byte("slot unavailable")) {
retryNeeded = true
break
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
}
var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
}
switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(c.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return ctx.Err()
}
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
})
}
if c.Stop {
fn(CompletionResponse{
Done: true,
PromptEvalCount: c.Timings.PromptN,
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
EvalCount: c.Timings.PredictedN,
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
})
return nil
}
}
}
if err := scanner.Err(); err != nil {
if strings.Contains(err.Error(), "unexpected EOF") {
s.Close()
msg := ""
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
}
return fmt.Errorf("error reading llm response: %v", err)
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
return nil, fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("error creating embed request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do embedding request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("error reading embed response: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var embedding EmbeddingResponse
if err := json.Unmarshal(body, &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady {
return nil, fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(TokenizeRequest{Content: content})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("encode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do encode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read encode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}
var encoded TokenizeResponse
if err := json.Unmarshal(body, &encoded); err != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err)
}
return encoded.Tokens, nil
}
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse struct {
Content string `json:"content"`
}
func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return "", err
} else if status != ServerStatusReady {
return "", fmt.Errorf("unexpected server status: %d", status)
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return "", fmt.Errorf("decode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("do decode request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("read decode request: %w", err)
}
if resp.StatusCode >= 400 {
log.Printf("llm decode error: %s", body)
return "", fmt.Errorf("%s", body)
}
var decoded DetokenizeResponse
if err := json.Unmarshal(body, &decoded); err != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err)
}
return decoded.Content, nil
}
func (s *LlamaServer) Close() error {
if s.cmd != nil {
slog.Debug("stopping llama server")
return s.cmd.Process.Kill()
}
return nil
}
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}
package llm
import (
"bytes"
"os"
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
LastErrMsg string
out *os.File
}
func NewStatusWriter(out *os.File) *StatusWriter {
return &StatusWriter{
out: out,
}
}
// TODO - regex matching to detect errors like
// libcublasLt.so.11: cannot open shared object file: No such file or directory
var errorPrefixes = []string{
"error:",
"CUDA error",
"cudaMalloc failed",
"\"ERR\"",
}
func (w *StatusWriter) Write(b []byte) (int, error) {
var errMsg string
for _, prefix := range errorPrefixes {
if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
errMsg = prefix + string(bytes.TrimSpace(after))
}
}
if errMsg != "" {
w.LastErrMsg = errMsg
}
return w.out.Write(b)
}
package llm
import (
"fmt"
"time"
)
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}
@@ -56,12 +56,13 @@ func init() {
 var loaded struct {
 	mu sync.Mutex
 
-	runner llm.LLM
+	llama *llm.LlamaServer
 
-	expireAt    time.Time
 	expireTimer *time.Timer
 
-	*Model
+	model      string
+	adapters   []string
+	projectors []string
+
 	*api.Options
 }
@@ -69,21 +70,28 @@ var defaultSessionDuration = 5 * time.Minute
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
-	needLoad := loaded.runner == nil || // is there a model loaded?
-		loaded.ModelPath != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?
+	ctx, cancel := context.WithTimeout(c, 10*time.Second)
+	defer cancel()
+
+	needLoad := loaded.llama == nil || // is there a model loaded?
+		loaded.model != model.ModelPath || // has the base model changed?
+		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the projectors changed?
+		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
+		loaded.llama.Ping(ctx) != nil
 
 	if needLoad {
-		if loaded.runner != nil {
+		if loaded.llama != nil {
 			slog.Info("changing loaded model")
-			loaded.runner.Close()
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama.Close()
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		}
 
-		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -95,28 +103,26 @@ func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.
 			return err
 		}
 
-		loaded.Model = model
-		loaded.runner = llmRunner
+		loaded.model = model.ModelPath
+		loaded.adapters = model.AdapterPaths
+		loaded.projectors = model.ProjectorPaths
+		loaded.llama = llama
 		loaded.Options = opts
 	}
 
-	loaded.expireAt = time.Now().Add(sessionDuration)
-
 	if loaded.expireTimer == nil {
 		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
 			loaded.mu.Lock()
 			defer loaded.mu.Unlock()
 
-			if time.Now().Before(loaded.expireAt) {
-				return
+			if loaded.llama != nil {
+				loaded.llama.Close()
 			}
 
-			if loaded.runner != nil {
-				loaded.runner.Close()
-			}
-
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		})
 	}
@@ -265,7 +271,7 @@ func GenerateHandler(c *gin.Context) {
 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
+			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -286,9 +292,8 @@ func GenerateHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)
 
-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)
 
 			// Build up the full response
@@ -322,7 +327,7 @@ func GenerateHandler(c *gin.Context) {
 		}
 
 		// TODO (jmorganca): encode() should not strip special tokens
-		tokens, err := loaded.runner.Encode(c.Request.Context(), p)
+		tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
 		if err != nil {
 			ch <- gin.H{"error": err.Error()}
 			return
@@ -344,13 +349,13 @@ func GenerateHandler(c *gin.Context) {
 		}
 
 		// Start prediction
-		predictReq := llm.PredictOpts{
+		req := llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
 		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -471,7 +476,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}
 
-	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -1123,8 +1128,8 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		if loaded.runner != nil {
-			loaded.runner.Close()
+		if loaded.llama != nil {
+			loaded.llama.Close()
 		}
 		gpu.Cleanup()
 		os.Exit(0)
@@ -1196,7 +1201,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.runner.Encode(ctx, s)
+		return loaded.llama.Tokenize(ctx, s)
 	}
 
 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1326,9 +1331,8 @@ func ChatHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)
 
-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)
 
 			resp := api.ChatResponse{
@@ -1352,14 +1356,12 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}
 
-		// Start prediction
-		predictReq := llm.PredictOpts{
-			Prompt:  prompt,
-			Format:  req.Format,
-			Images:  images,
-			Options: opts,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
+			Prompt:  prompt,
+			Format:  req.Format,
+			Images:  images,
+			Options: opts,
+		}, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}
 
-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ }
 	}
 }
-
-type MockLLM struct {
-	encoding []int
-}
-
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-
-func (llm *MockLLM) Close() {
-	// do nothing
-}