llm.go 4.72 KB
Newer Older
1
2
3
package llm

import (
4
	"context"
5
	"fmt"
6
	"log/slog"
7
	"os"
8
	"slices"
Michael Yang's avatar
Michael Yang committed
9
	"strings"
10

11
	"github.com/ollama/ollama/api"
Michael Yang's avatar
Michael Yang committed
12
	"github.com/ollama/ollama/format"
13
	"github.com/ollama/ollama/gpu"
14
15
16
)

type LLM interface {
Bruce MacDonald's avatar
Bruce MacDonald committed
17
	Predict(context.Context, PredictOpts, func(PredictResult)) error
18
19
20
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
21
22
23
	Close()
}

24
25
26
27
// cpuOnlyFamilies lists model architectures that New always runs with the
// "cpu" library, regardless of how much GPU memory is available.
var cpuOnlyFamilies = []string{"mamba"}

Michael Yang's avatar
Michael Yang committed
28
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
29
30
31
32
33
34
35
36
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
Michael Yang's avatar
Michael Yang committed
37
	defer f.Close()
38

Michael Yang's avatar
Michael Yang committed
39
	ggml, _, err := DecodeGGML(f)
40
41
42
43
	if err != nil {
		return nil, err
	}

Michael Yang's avatar
Michael Yang committed
44
45
46
	if opts.NumCtx > int(ggml.KV().ContextLength()) {
		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
		opts.NumCtx = int(ggml.KV().ContextLength())
Michael Yang's avatar
Michael Yang committed
47
48
	}

49
50
51
52
	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

Michael Yang's avatar
Michael Yang committed
53
54
55
56
57
58
59
60
61
62
	availableMemory, _ := gpu.CheckVRAM()
	info := gpu.GetGPUInfo()

	usedMemory := info.MinimumMemory
	for _, projector := range projectors {
		usedMemory += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}
63

Michael Yang's avatar
Michael Yang committed
64
65
	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
66

67
	// this amount is the overhead + tensors in memory
Michael Yang's avatar
typo  
Michael Yang committed
68
	// TODO: get this from the llama.cpp's graph calculations instead of
69
	// estimating it's 1/6 * kv_cache_size * num_gqa
Michael Yang's avatar
Michael Yang committed
70
	graph := int64(ggml.KV().GQA()) * kv / 6
Michael Yang's avatar
Michael Yang committed
71
	usedMemory += graph
72

Michael Yang's avatar
Michael Yang committed
73
74
	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
		info.Library = "cpu"
75
76
	}

Michael Yang's avatar
Michael Yang committed
77
	requiredMemory := usedMemory
78

Michael Yang's avatar
Michael Yang committed
79
80
81
82
	var layers int
	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
		requiredMemory += layerMemory
83

Michael Yang's avatar
Michael Yang committed
84
85
86
		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
			usedMemory += layerMemory
			layers++
87
		}
Michael Yang's avatar
Michael Yang committed
88
	}
89

Michael Yang's avatar
Michael Yang committed
90
91
	memOutputLayer := ggml.LayerSize("output.")
	requiredMemory += memOutputLayer
92

Michael Yang's avatar
Michael Yang committed
93
94
95
96
97
	// only offload output layer if all repeating layers are offloaded
	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
		usedMemory += memOutputLayer
		layers++
	}
98

Michael Yang's avatar
Michael Yang committed
99
100
101
102
103
104
105
106
107
108
109
110
111
	slog.Info(
		"offload to gpu",
		"layers", layers,
		"required", format.HumanBytes2(requiredMemory),
		"used", format.HumanBytes2(usedMemory),
		"available", format.HumanBytes2(availableMemory),
		"kv", format.HumanBytes2(kv),
		"graph", format.HumanBytes2(graph),
	)

	if opts.NumGPU < 0 && info.Library != "cpu" {
		opts.NumGPU = layers
	}
112

Michael Yang's avatar
Michael Yang committed
113
114
	return newLlmServer(info, model, adapters, projectors, opts)
}
115

Michael Yang's avatar
Michael Yang committed
116
117
118
119
func projectorMemoryRequirements(filename string) int64 {
	file, err := os.Open(filename)
	if err != nil {
		return 0
120
	}
Michael Yang's avatar
Michael Yang committed
121
	defer file.Close()
122

Michael Yang's avatar
Michael Yang committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
	ggml, _, err := DecodeGGML(file)
	if err != nil {
		return 0
	}

	prefixes := make(map[string]struct{})
	for _, layer := range ggml.Tensors() {
		parts := strings.Split(layer.Name, ".")
		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
	}

	var ask int64
	for prefix := range prefixes {
		ask += ggml.LayerSize(prefix)
	}

	return ask
140
141
142
}

// Give any native cgo implementations an opportunity to initialize
Daniel Hiltgen's avatar
Daniel Hiltgen committed
143
144
func Init() error {
	return nativeInit()
145
}
146

Michael Yang's avatar
Michael Yang committed
147
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
148
	dynLibs := getDynLibs(gpuInfo)
149
150
151
152

	// Check to see if the user has requested a specific library instead of auto-detecting
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
153
		libPath := availableDynLibs[demandLib]
154
		if libPath == "" {
155
			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
156
		} else {
157
			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
158
			dynLibs = []string{libPath}
159
160
161
		}
	}

162
163
164
165
	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
	_, err := os.Stat(dynLibs[0])
	if err != nil {
		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
166
		err = nativeInit()
167
168
169
170
171
		if err != nil {
			return nil, err
		}
	}

172
173
174
	err2 := fmt.Errorf("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
175
176
177
		if err == nil {
			return srv, nil
		}
178
		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s  %s", dynLib, err))
179
		err2 = err
180
181
	}

182
	return nil, err2
183
}