Commit 58d95cc9 authored by Daniel Hiltgen

Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by allowing
us to isolate llama.cpp in a separate process, shut it down when idle, and
gracefully restart it if it has problems. This also serves as a first step toward
running multiple copies to support multiple models concurrently.
parent 3b6a9154
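The subprocess model described in the commit message — an isolated runner process that is health-checked, shut down when idle, and restarted if it misbehaves — can be sketched roughly as below. This is an illustrative sketch only, not code from this commit; the runner binary name, port, health endpoint, and idle timeout are all hypothetical.

package main

import (
    "fmt"
    "net/http"
    "os/exec"
    "time"
)

func main() {
    runnerBin := "./llama-runner" // hypothetical runner binary
    healthURL := "http://127.0.0.1:8080/health"

    // Spawn the runner as a child process so crashes and leaks stay contained.
    cmd := exec.Command(runnerBin, "--port", "8080")
    if err := cmd.Start(); err != nil {
        fmt.Println("start failed:", err)
        return
    }

    // Wait for the child to come up by polling a health endpoint.
    for i := 0; i < 20; i++ {
        if resp, err := http.Get(healthURL); err == nil {
            resp.Body.Close()
            break
        }
        time.Sleep(250 * time.Millisecond)
    }

    // Idle shutdown: kill the child after a period of no use; it can be
    // respawned on the next request, and a crash never takes down the parent.
    idleTimer := time.AfterFunc(5*time.Minute, func() {
        _ = cmd.Process.Kill()
    })
    defer idleTimer.Stop()

    _ = cmd.Wait()
}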
New version of the file (added by this commit):

package llm

// #cgo CFLAGS: -Illama.cpp
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include "llama.h"
import "C"

// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
    return C.GoString(C.llama_print_system_info())
}

Previous version of the file (removed by this commit):

package llm

import (
    "context"
    "fmt"
    "log/slog"
    "os"
    "slices"
    "strings"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
)
type LLM interface {
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
}
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from llama.cpp's graph calculations instead of
// estimating it as 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
usedMemory += graph
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
requiredMemory := usedMemory
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
return newLlmServer(info, model, adapters, projectors, opts)
}
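As a worked example of the KV-cache and graph estimate in the comments above, the sketch below plugs in hypothetical 7B-class model dimensions (these numbers are illustrative and not taken from this commit):

package main

import "fmt"

func main() {
    // Hypothetical 7B-class model parameters, for illustration only.
    var (
        numCtx    int64 = 2048
        numLayer  int64 = 32 // BlockCount
        numEmbd   int64 = 4096
        numHead   int64 = 32
        numHeadKV int64 = 32
    )

    // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
    kv := 2 * 2 * numCtx * numLayer * numEmbd / numHead * numHeadKV

    // graph overhead estimate: 1/6 * kv_cache_size * num_gqa, with GQA = n_head / n_head_kv
    gqa := numHead / numHeadKV
    graph := gqa * kv / 6

    fmt.Printf("kv cache: %d bytes (~%.2f GiB)\n", kv, float64(kv)/(1<<30))
    fmt.Printf("graph:    %d bytes (~%.2f GiB)\n", graph, float64(graph)/(1<<30))
    // Prints roughly 1.00 GiB for the KV cache and ~0.17 GiB for the graph.
}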
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
dynLibs = []string{libPath}
}
}
// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
if err != nil {
return nil, err
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
err2 = err
}
return nil, err2
}
@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/linux/*/*/lib/*
+//go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS

@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
+//go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS

 package llm

-import (
-	"embed"
-)
+import "embed"

-//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
+//go:embed build/linux/*/*/bin/*
 var libEmbed embed.FS
package llm
import "embed"
//go:embed build/windows/*/*/bin/*
var libEmbed embed.FS
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
var variants []string
for v := range availableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func availableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
}
return servers
}
// serversForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
return servers
}
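To illustrate the ordering serversForGpu produces — exact match first, then other variants of the same library in a stable sorted order, then a CPU fallback — here is a standalone sketch over a plain map of variant names. The map contents and the pickServers helper are hypothetical, and the CPU-variant handling is simplified relative to the real function, which reads the extracted payloads directory:

package main

import (
    "fmt"
    "sort"
    "strings"
)

// pickServers mirrors the selection order described above using hypothetical data.
func pickServers(available map[string]bool, library, variant string) []string {
    requested := library
    if variant != "" {
        requested += "_" + variant
    }

    var servers []string
    if available[requested] {
        servers = append(servers, requested) // exact match first
    }

    var alt []string
    for name := range available {
        if strings.Split(name, "_")[0] == library && name != requested {
            alt = append(alt, name)
        }
    }
    sort.Strings(alt) // consistent load ordering
    servers = append(servers, alt...)

    if library != "cpu" {
        servers = append(servers, "cpu") // lowest common denominator fallback
    }
    return servers
}

func main() {
    available := map[string]bool{"rocm_v5": true, "rocm_v6": true, "cpu": true}
    fmt.Println(pickServers(available, "rocm", "v6"))
    // Output: [rocm_v6 rocm_v5 cpu]
}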
// extract extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
// Library names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 7
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
// TODO - temporary until we have multiple CPU variations for Darwin
// Short circuit on darwin with metal only
if len(availableDynLibs) == 1 {
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
return []string{availableDynLibs["metal"]}
}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if errors.Is(err, payloadMissing) {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
var mu sync.Mutex
var libs []string
var g errgroup.Group
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
continue
}
file := file
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
return libs, nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS
package llm
import (
"testing"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}
This diff is collapsed.
package llm
import (
"bytes"
"os"
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
LastErrMsg string
out *os.File
}
func NewStatusWriter(out *os.File) *StatusWriter {
return &StatusWriter{
out: out,
}
}
// TODO - regex matching to detect errors like
// libcublasLt.so.11: cannot open shared object file: No such file or directory
var errorPrefixes = []string{
"error:",
"CUDA error",
"cudaMalloc failed",
"\"ERR\"",
}
func (w *StatusWriter) Write(b []byte) (int, error) {
var errMsg string
for _, prefix := range errorPrefixes {
if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
errMsg = prefix + string(bytes.TrimSpace(after))
}
}
if errMsg != "" {
w.LastErrMsg = errMsg
}
return w.out.Write(b)
}
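A minimal sketch of how a writer like this might be attached to a runner subprocess so the last error line survives a crash. The helper below and the binary it launches are placeholders, not part of this commit; it only assumes the StatusWriter shown above:

package llm

import (
    "fmt"
    "os"
    "os/exec"
)

// runRunner is a usage sketch: forward the child's stderr to ours while
// remembering the last line that looked like an error.
func runRunner(bin string, args ...string) error {
    sw := NewStatusWriter(os.Stderr)

    cmd := exec.Command(bin, args...)
    cmd.Stdout = os.Stdout
    cmd.Stderr = sw

    if err := cmd.Run(); err != nil {
        // Report the captured message rather than a bare "exit status 1".
        return fmt.Errorf("runner failed: %w (last error: %q)", err, sw.LastErrMsg)
    }
    return nil
}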
package llm
import (
"fmt"
"time"
)
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}
@@ -56,12 +56,13 @@ func init() {
 var loaded struct {
 	mu sync.Mutex

-	runner llm.LLM
+	llama *llm.LlamaServer

-	expireAt    time.Time
 	expireTimer *time.Timer

-	*Model
+	model      string
+	adapters   []string
+	projectors []string
 	*api.Options
 }
@@ -69,21 +70,28 @@ var defaultSessionDuration = 5 * time.Minute
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
-	needLoad := loaded.runner == nil || // is there a model loaded?
-		loaded.ModelPath != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?
+	ctx, cancel := context.WithTimeout(c, 10*time.Second)
+	defer cancel()
+
+	needLoad := loaded.llama == nil || // is there a model loaded?
+		loaded.model != model.ModelPath || // has the base model changed?
+		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
+		loaded.llama.Ping(ctx) != nil

 	if needLoad {
-		if loaded.runner != nil {
+		if loaded.llama != nil {
 			slog.Info("changing loaded model")
-			loaded.runner.Close()
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama.Close()
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		}

-		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -95,28 +103,26 @@ func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.
 			return err
 		}

-		loaded.Model = model
-		loaded.runner = llmRunner
+		loaded.model = model.ModelPath
+		loaded.adapters = model.AdapterPaths
+		loaded.projectors = model.ProjectorPaths
+		loaded.llama = llama
 		loaded.Options = opts
 	}

-	loaded.expireAt = time.Now().Add(sessionDuration)
-
 	if loaded.expireTimer == nil {
 		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
 			loaded.mu.Lock()
 			defer loaded.mu.Unlock()

-			if time.Now().Before(loaded.expireAt) {
-				return
-			}
-
-			if loaded.runner != nil {
-				loaded.runner.Close()
+			if loaded.llama != nil {
+				loaded.llama.Close()
 			}

-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		})
 	}
@@ -265,7 +271,7 @@ func GenerateHandler(c *gin.Context) {
 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
+			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -286,9 +292,8 @@ func GenerateHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)

-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

 			// Build up the full response
@@ -322,7 +327,7 @@ func GenerateHandler(c *gin.Context) {
 			}

 			// TODO (jmorganca): encode() should not strip special tokens
-			tokens, err := loaded.runner.Encode(c.Request.Context(), p)
+			tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
 			if err != nil {
 				ch <- gin.H{"error": err.Error()}
 				return
@@ -344,13 +349,13 @@ func GenerateHandler(c *gin.Context) {
 		}

 		// Start prediction
-		predictReq := llm.PredictOpts{
+		req := llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
 		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -471,7 +476,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -1123,8 +1128,8 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		if loaded.runner != nil {
-			loaded.runner.Close()
+		if loaded.llama != nil {
+			loaded.llama.Close()
 		}
 		gpu.Cleanup()
 		os.Exit(0)
@@ -1196,7 +1201,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.runner.Encode(ctx, s)
+		return loaded.llama.Tokenize(ctx, s)
 	}

 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1326,9 +1331,8 @@ func ChatHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)

-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

 			resp := api.ChatResponse{
@@ -1352,14 +1356,12 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}

-		// Start prediction
-		predictReq := llm.PredictOpts{
+		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		}, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}

-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ }
 	}
 }
-
-type MockLLM struct {
-	encoding []int
-}
-
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-
-func (llm *MockLLM) Close() {
-	// do nothing
-}