package llm

import (
	"context"
	"log"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)

// LLM is the interface implemented by every model backend: streaming
// prediction, embeddings, and tokenizer encode/decode.
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
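
// A minimal usage sketch (hypothetical; assumes PredictOpts and
// PredictResult, defined elsewhere in this package, carry the prompt and
// the streamed output):
//
//	llm, err := New(workDir, modelPath, nil, nil, api.DefaultOptions())
//	if err != nil {
//		return err
//	}
//	defer llm.Close()
//	err = llm.Predict(ctx, opts, func(r PredictResult) { /* stream each chunk */ })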

// AvailableShims maps a GPU library name to the dynamic shim library that
// implements it, populated by the platform-specific initialization in Init.
var AvailableShims = map[string]string{}

// New prepares the model at the given path for inference: it decodes the
// GGML metadata, estimates memory requirements, decides how many layers to
// offload to the GPU, and starts the appropriate backend server.
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	vram, _ := gpu.CheckVRAM()
	size := ggml.Size

	// the fp16 k and v caches each require n_ctx * n_layer * (n_embd / n_head) * n_head_kv * 2 bytes,
	// times 2 for key and value
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
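	// e.g. a hypothetical llama2-7B-style model (n_ctx=2048, n_layer=32,
	// n_embd=4096, n_head=n_head_kv=32) works out to
	// 2 * 2 * 2048 * 32 * 4096 * 32/32 bytes = 1 GiB of cache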

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	graph := int64(ggml.NumGQA()) * kv / 6
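	// for the hypothetical 7B example above (num_gqa = 1) this estimates
	// roughly 1 GiB / 6, i.e. about 171 MiB of graph overhead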

	info := gpu.GetGPUInfo()
	library := info.Library
	switch runtime.GOOS {
	case "darwin":
		if opts.NumGPU == 0 {
			break
		}

		if size+kv+graph > vram {
			log.Println("not enough vram available, falling back to CPU only")
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = 1
	default:
		if library == "cpu" || library == "default" {
			log.Println("GPU not available, falling back to CPU")
			opts.NumGPU = 0
			break
		}

		// don't use GPU at all if no layers are loaded
		if opts.NumGPU == 0 {
			library = "cpu"
			break
		}

		// user-defined GPU count
		if opts.NumGPU != -1 {
			break
		}

		// the "main" GPU needs the most memory and determines the limit
		// of how many layers can be loaded. It needs to fit:
		// 1. the full compute graph allocation for all devices (graph)
		// 2. the proportional kv cache for all devices (kv * % layers)
		// 3. the proportional model (size * % layers / # devices)
		// This estimates the number of layers
		maxlayers := int64(ggml.NumLayers()) + 1
		devices := int64(info.DeviceCount)
		avg := vram / devices
		layers := maxlayers * (avg - graph) / (kv + size/devices)
		if layers > maxlayers {
			layers = maxlayers
		}
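		// e.g. with hypothetical numbers: a 13 GiB model with 32+1 layers,
		// kv = 1 GiB and graph ≈ 0.2 GiB on a single 8 GiB device gives
		// layers = 33 * (8 - 0.2) / (1 + 13) ≈ 18 layers offloaded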

		// 1 + 2 must fit on the main gpu
		min := graph + kv*layers/maxlayers
		if layers <= 0 || min > avg {
			log.Printf("not enough vram available, falling back to CPU only")
			library = "cpu"
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = int(layers)
	}

	// zero values defer to the RoPE parameters embedded in the model
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0
	return newLlmServer(library, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}

// newLlmServer starts the external llama.cpp server for the requested
// library, preferring a dynamically loaded GPU shim and falling back to the
// built-in CPU implementation if the shim fails to load.
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
	if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}
		log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
		// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
	}

	return newDefaultExtServer(model, adapters, projectors, opts)
}