package llm

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"runtime"
	"slices"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/gpu"
)

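// LLM abstracts a loaded model backend: streaming token prediction,
// embedding generation, and tokenizer encode/decode.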
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}

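// model architectures that must run on the CPU; New disables GPU offload
// for these below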
var cpuOnlyFamilies = []string{
	"mamba",
}

func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, size, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx > int(ggml.KV().ContextLength()) {
		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
		opts.NumCtx = int(ggml.KV().ContextLength())
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	vram, _ := gpu.CheckVRAM()

	// fp16 k,v matrices require n_ctx * n_layer * (n_embd / n_head) * n_head_kv elements * 2 bytes each * 2 for key and value
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
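	// for example (illustrative numbers): a 7B llama-style model with 32 layers,
	// n_embd=4096 and n_head=n_head_kv=32 at NumCtx=2048 needs
	// 2*2*2048*32*4096 bytes = 1 GiB of kv cache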

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
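	// (num_gqa is n_head / n_head_kv, so the multi-head example above with
	// num_gqa=1 would reserve roughly 1 GiB / 6, about 171 MiB, for the graph)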
	graph := int64(ggml.KV().GQA()) * kv / 6

	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
		opts.NumGPU = 0
	}

	info := gpu.GetGPUInfo()
	switch runtime.GOOS {
	case "darwin":
		if opts.NumGPU == 0 {
			break
		}

		if size+kv+graph > vram {
			slog.Info("not enough vram available, setting num_gpu=0")
			opts.NumGPU = 0
			break
		}

		// TODO: implement layer splitting on macOS
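		// 999 is an "offload everything" sentinel: any value larger than the
		// model's layer count causes the backend to place all layers on the GPU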
		opts.NumGPU = 999
	default:
		if info.Library == "cpu" {
			slog.Info("GPU not available, falling back to CPU")
			opts.NumGPU = 0
			break
		}

		// don't use GPU at all if no layers are loaded
		if opts.NumGPU == 0 {
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			break
		}

		// user-defined GPU count
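		// (-1 means "auto-detect"; any other value was set explicitly by the
		// user, so honor it as-is)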
		if opts.NumGPU != -1 {
			break
		}

		// the "main" GPU needs the most memory and determines the limit
		// of how many layers can be loaded. It needs to fit:
		// 1. the full compute graph allocation for all devices (graph)
		// 2. the proportional kv cache for all devices (kv * % layers)
		// 3. the proportional model (size * % layers / # devices)
		// This estimates the number of layers
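		// by solving avg >= graph + (kv + size/devices)*layers/maxlayers for
		// layers, i.e. layers = maxlayers * (avg - graph) / (kv + size/devices)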
		maxlayers := int64(ggml.KV().BlockCount()) + 1
		devices := int64(info.DeviceCount)
		avg := vram / devices
		layers := maxlayers * (avg - graph) / (kv + size/devices)
		if layers > maxlayers {
			layers = maxlayers
		}

		// 1 + 2 must fit on the main gpu
		min := graph + kv*layers/maxlayers
		if layers <= 0 || min > avg {
			slog.Info("not enough vram available, falling back to CPU only")
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = int(layers)
	}

	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0
	return newLlmServer(info, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init() error {
	return nativeInit()
}

func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	dynLibs := getDynLibs(gpuInfo)

	// Check to see if the user has requested a specific library instead of auto-detecting
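	// the value must name a key in availableDynLibs, e.g. a CPU build such as
	// "cpu" (exact library names vary by platform and build)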
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
		libPath := availableDynLibs[demandLib]
		if libPath == "" {
			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
		} else {
			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
			dynLibs = []string{libPath}
		}
	}

	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
	_, err := os.Stat(dynLibs[0])
	if err != nil {
		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
		err = nativeInit()
		if err != nil {
			return nil, err
		}
	}

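	// try each candidate library in order, keeping the last failure in err2 so
	// the caller sees a specific error when all libraries fail to load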
	err2 := fmt.Errorf("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}
		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s: %s", dynLib, err))
		err2 = err
	}

	return nil, err2
}