Commit 58d95cc9 authored by Daniel Hiltgen

Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by allowing
us to isolate llama.cpp in a separate process, shut it down when idle, and
gracefully restart it if it has problems. This also serves as a first step toward
running multiple copies to support multiple models concurrently.
parent 3b6a9154
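The subprocess model described in the commit message — an isolated runner process that is health-checked, shut down when idle, and restarted if it misbehaves — can be sketched roughly as below. This is an illustrative sketch only, not code from this commit; the runner binary name, port, health endpoint, and idle timeout are all hypothetical.

package main

import (
    "fmt"
    "net/http"
    "os/exec"
    "time"
)

func main() {
    runnerBin := "./llama-runner" // hypothetical runner binary
    healthURL := "http://127.0.0.1:8080/health"

    // Spawn the runner as a child process so crashes and leaks stay contained.
    cmd := exec.Command(runnerBin, "--port", "8080")
    if err := cmd.Start(); err != nil {
        fmt.Println("start failed:", err)
        return
    }

    // Wait for the child to come up by polling a health endpoint.
    for i := 0; i < 20; i++ {
        if resp, err := http.Get(healthURL); err == nil {
            resp.Body.Close()
            break
        }
        time.Sleep(250 * time.Millisecond)
    }

    // Idle shutdown: kill the child after a period of no use; it can be
    // respawned on the next request, and a crash never takes down the parent.
    idleTimer := time.AfterFunc(5*time.Minute, func() {
        _ = cmd.Process.Kill()
    })
    defer idleTimer.Stop()

    _ = cmd.Wait()
}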
New version of the file (added by this commit):

package llm

// #cgo CFLAGS: -Illama.cpp
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include "llama.h"
import "C"

// SystemInfo is an unused example of calling llama.cpp functions using CGo
func SystemInfo() string {
    return C.GoString(C.llama_print_system_info())
}

Previous version of the file (removed by this commit):

package llm

import (
    "context"
    "fmt"
    "log/slog"
    "os"
    "slices"
    "strings"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
)
type LLM interface {
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
}
var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx > int(ggml.KV().ContextLength()) {
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
opts.NumCtx = int(ggml.KV().ContextLength())
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from llama.cpp's graph calculations instead of
// estimating it as 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
usedMemory += graph
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
requiredMemory := usedMemory
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
}
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
return newLlmServer(info, model, adapters, projectors, opts)
}
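As a worked example of the KV-cache and graph estimate in the comments above, the sketch below plugs in hypothetical 7B-class model dimensions (these numbers are illustrative and not taken from this commit):

package main

import "fmt"

func main() {
    // Hypothetical 7B-class model parameters, for illustration only.
    var (
        numCtx    int64 = 2048
        numLayer  int64 = 32 // BlockCount
        numEmbd   int64 = 4096
        numHead   int64 = 32
        numHeadKV int64 = 32
    )

    // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
    kv := 2 * 2 * numCtx * numLayer * numEmbd / numHead * numHeadKV

    // graph overhead estimate: 1/6 * kv_cache_size * num_gqa, with GQA = n_head / n_head_kv
    gqa := numHead / numHeadKV
    graph := gqa * kv / 6

    fmt.Printf("kv cache: %d bytes (~%.2f GiB)\n", kv, float64(kv)/(1<<30))
    fmt.Printf("graph:    %d bytes (~%.2f GiB)\n", graph, float64(graph)/(1<<30))
    // Prints roughly 1.00 GiB for the KV cache and ~0.17 GiB for the graph.
}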
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
} else {
slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
dynLibs = []string{libPath}
}
}
// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
_, err := os.Stat(dynLibs[0])
if err != nil {
slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
err = nativeInit()
if err != nil {
return nil, err
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
err2 = err
}
return nil, err2
}
@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/linux/*/*/lib/*
+//go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS

@@ -4,5 +4,5 @@ import (
 	"embed"
 )

-//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
+//go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS

 package llm

-import (
-	"embed"
-)
+import "embed"

-//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
+//go:embed build/linux/*/*/bin/*
 var libEmbed embed.FS
package llm
import "embed"
//go:embed build/windows/*/*/bin/*
var libEmbed embed.FS
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
var variants []string
for v := range availableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func availableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(file)] = file
}
return servers
}
// serversForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := availableServers()
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableServers {
if cmp == "cpu_"+variant {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
return servers
}
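To illustrate the ordering serversForGpu produces — exact match first, then other variants of the same library in a stable sorted order, then a CPU fallback — here is a standalone sketch over a plain map of variant names. The map contents and the pickServers helper are hypothetical, and the CPU-variant handling is simplified relative to the real function, which reads the extracted payloads directory:

package main

import (
    "fmt"
    "sort"
    "strings"
)

// pickServers mirrors the selection order described above using hypothetical data.
func pickServers(available map[string]bool, library, variant string) []string {
    requested := library
    if variant != "" {
        requested += "_" + variant
    }

    var servers []string
    if available[requested] {
        servers = append(servers, requested) // exact match first
    }

    var alt []string
    for name := range available {
        if strings.Split(name, "_")[0] == library && name != requested {
            alt = append(alt, name)
        }
    }
    sort.Strings(alt) // consistent load ordering
    servers = append(servers, alt...)

    if library != "cpu" {
        servers = append(servers, "cpu") // lowest common denominator fallback
    }
    return servers
}

func main() {
    available := map[string]bool{"rocm_v5": true, "rocm_v6": true, "cpu": true}
    fmt.Println(pickServers(available, "rocm", "v6"))
    // Output: [rocm_v6 rocm_v5 cpu]
}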
// extract extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
// Library names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 7
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
// TODO - temporary until we have multiple CPU variations for Darwin
// Short circuit on darwin with metal only
if len(availableDynLibs) == 1 {
if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
return []string{availableDynLibs["metal"]}
}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
if err != nil {
if errors.Is(err, payloadMissing) {
slog.Info(fmt.Sprintf("%s", payloadMissing))
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
var mu sync.Mutex
var libs []string
var g errgroup.Group
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
continue
}
file := file
g.Go(func() error {
// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
}
src := io.Reader(srcFile)
filename := file
if strings.HasSuffix(file, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", file, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
destFile := filepath.Join(targetDir, filepath.Base(filename))
if strings.Contains(destFile, "server") {
mu.Lock()
libs = append(libs, destFile)
mu.Unlock()
}
destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFp.Close()
if _, err := io.Copy(destFp, src); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
return libs, nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
var libEmbed embed.FS
package llm
import (
"testing"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
variant := gpu.GetCPUVariant()
if variant != "" {
variant = "_" + variant
}
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu" + variant: "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
}
This diff is collapsed.
package llm
import (
"bytes"
"os"
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
LastErrMsg string
out *os.File
}
func NewStatusWriter(out *os.File) *StatusWriter {
return &StatusWriter{
out: out,
}
}
// TODO - regex matching to detect errors like
// libcublasLt.so.11: cannot open shared object file: No such file or directory
var errorPrefixes = []string{
"error:",
"CUDA error",
"cudaMalloc failed",
"\"ERR\"",
}
func (w *StatusWriter) Write(b []byte) (int, error) {
var errMsg string
for _, prefix := range errorPrefixes {
if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
errMsg = prefix + string(bytes.TrimSpace(after))
}
}
if errMsg != "" {
w.LastErrMsg = errMsg
}
return w.out.Write(b)
}
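A minimal sketch of how a writer like this might be attached to a runner subprocess so the last error line survives a crash. The helper below and the binary it launches are placeholders, not part of this commit; it only assumes the StatusWriter shown above:

package llm

import (
    "fmt"
    "os"
    "os/exec"
)

// runRunner is a usage sketch: forward the child's stderr to ours while
// remembering the last line that looked like an error.
func runRunner(bin string, args ...string) error {
    sw := NewStatusWriter(os.Stderr)

    cmd := exec.Command(bin, args...)
    cmd.Stdout = os.Stdout
    cmd.Stderr = sw

    if err := cmd.Run(); err != nil {
        // Report the captured message rather than a bare "exit status 1".
        return fmt.Errorf("runner failed: %w (last error: %q)", err, sw.LastErrMsg)
    }
    return nil
}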
package llm
import (
"fmt"
"time"
)
func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
return dur
}
@@ -56,12 +56,13 @@ func init() {
 var loaded struct {
 	mu sync.Mutex

-	runner llm.LLM
+	llama *llm.LlamaServer

-	expireAt    time.Time
 	expireTimer *time.Timer

-	*Model
+	model      string
+	adapters   []string
+	projectors []string
 	*api.Options
 }
@@ -69,21 +70,28 @@ var defaultSessionDuration = 5 * time.Minute
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
-	needLoad := loaded.runner == nil || // is there a model loaded?
-		loaded.ModelPath != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?
+	ctx, cancel := context.WithTimeout(c, 10*time.Second)
+	defer cancel()
+
+	needLoad := loaded.llama == nil || // is there a model loaded?
+		loaded.model != model.ModelPath || // has the base model changed?
+		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
+		loaded.llama.Ping(ctx) != nil

 	if needLoad {
-		if loaded.runner != nil {
+		if loaded.llama != nil {
 			slog.Info("changing loaded model")
-			loaded.runner.Close()
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama.Close()
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		}

-		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -95,28 +103,26 @@ func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.
 			return err
 		}

-		loaded.Model = model
-		loaded.runner = llmRunner
+		loaded.model = model.ModelPath
+		loaded.adapters = model.AdapterPaths
+		loaded.projectors = model.ProjectorPaths
+		loaded.llama = llama
 		loaded.Options = opts
 	}

-	loaded.expireAt = time.Now().Add(sessionDuration)
-
 	if loaded.expireTimer == nil {
 		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
 			loaded.mu.Lock()
 			defer loaded.mu.Unlock()

-			if time.Now().Before(loaded.expireAt) {
-				return
-			}
-
-			if loaded.runner != nil {
-				loaded.runner.Close()
+			if loaded.llama != nil {
+				loaded.llama.Close()
 			}

-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		})
 	}
@@ -265,7 +271,7 @@ func GenerateHandler(c *gin.Context) {
 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
+			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -286,9 +292,8 @@ func GenerateHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)

-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

 			// Build up the full response
@@ -322,7 +327,7 @@ func GenerateHandler(c *gin.Context) {
 			}

 			// TODO (jmorganca): encode() should not strip special tokens
-			tokens, err := loaded.runner.Encode(c.Request.Context(), p)
+			tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
 			if err != nil {
 				ch <- gin.H{"error": err.Error()}
 				return
@@ -344,13 +349,13 @@ func GenerateHandler(c *gin.Context) {
 		}

 		// Start prediction
-		predictReq := llm.PredictOpts{
+		req := llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
 		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -471,7 +476,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -1123,8 +1128,8 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		if loaded.runner != nil {
-			loaded.runner.Close()
+		if loaded.llama != nil {
+			loaded.llama.Close()
 		}
 		gpu.Cleanup()
 		os.Exit(0)
@@ -1196,7 +1201,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.runner.Encode(ctx, s)
+		return loaded.llama.Tokenize(ctx, s)
 	}

 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1326,9 +1331,8 @@ func ChatHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)

-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)

 			resp := api.ChatResponse{
@@ -1352,14 +1356,12 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}

-		// Start prediction
-		predictReq := llm.PredictOpts{
+		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		}, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}

-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ }
 	}
 }
-
-type MockLLM struct {
-	encoding []int
-}
-
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-
-func (llm *MockLLM) Close() {
-	// do nothing
-}