llm.go 4.38 KB
Newer Older
1
2
3
package llm

import (
4
	"context"
5
	"fmt"
6
	"log"
7
	"os"
8
	"runtime"
9
10

	"github.com/jmorganca/ollama/api"
11
	"github.com/jmorganca/ollama/gpu"
12
13
14
)

type LLM interface {
Bruce MacDonald's avatar
Bruce MacDonald committed
15
	Predict(context.Context, PredictOpts, func(PredictResult)) error
16
17
18
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
19
20
21
	Close()
}

Michael Yang's avatar
Michael Yang committed
22
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
23
24
25
26
27
28
29
30
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
Michael Yang's avatar
Michael Yang committed
31
	defer f.Close()
32

Bruce MacDonald's avatar
Bruce MacDonald committed
33
	ggml, err := DecodeGGML(f)
34
35
36
37
	if err != nil {
		return nil, err
	}

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	fmt.Println("size", ggml.Size)
	fmt.Println("filetype", ggml.FileType())
	fmt.Println("architecture", ggml.ModelFamily())
	fmt.Println("type", ggml.ModelType())
	fmt.Println("name", ggml.Name())
	fmt.Println("embd", ggml.NumEmbed())
	fmt.Println("head", ggml.NumHead())
	fmt.Println("head_kv", ggml.NumHeadKv())
	fmt.Println("gqa", ggml.NumGQA())

	available, _ := gpu.CheckVRAM()

	// For now assume filesize = model size
	// TODO: use actual model size
	requiredModel := ggml.Size

	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
60

61
62
	// this amount is the overhead + tensors in memory
	// TODO: get this from the llama.cpp's graph calcluations instead of
63
64
	// estimating it's 1/6 * kv_cache_size * num_gqa
	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
65

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
	requiredTotal := requiredModel + requiredKv + requiredAlloc

	log.Println("system memory bytes:", available)
	log.Println("required model bytes:", requiredModel)
	log.Println("required kv bytes:", requiredKv)
	log.Println("required alloc bytes:", requiredAlloc)
	log.Println("required total bytes:", requiredTotal)

	info := gpu.GetGPUInfo()
	library := info.Library

	if opts.NumGPU == -1 {
		// default to offloading all layers
		opts.NumGPU = int(ggml.NumLayers()) + 1
	}

	// decide how many layers to put on the GPU
	if opts.NumGPU > 0 {
		switch runtime.GOOS {
		case "darwin":
			if requiredTotal > available {
				log.Println("not enough vram available, falling back to CPU only")
				opts.NumGPU = 0
			}
		default:
			if library == "cpu" || library == "default" {
				opts.NumGPU = 0
				break
			}

96
97
98
99
			// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
			// TODO: find the largest GPU and only reserve memory there
			avgAvailable := available / int64(info.DeviceCount)
			if requiredAlloc > avgAvailable {
100
101
102
103
104
105
				log.Printf("not enough vram available, falling back to CPU only")
				library = "cpu"
				opts.NumGPU = 0
				break
			}

106
107
108
109
110
111
112
113
114
115
			// we don't know which GPU will be used, so estimate
			// the scratch buffer space on all of them
			// TODO: allocate less layers to the GPU with the scratch buffer
			// and more to the others (based on their available memory)
			available -= requiredAlloc * int64(info.DeviceCount)

			// no offloading required
			if requiredModel+requiredKv <= available {
				break
			}
116
117
118
119
120
121

			// fill remaining vram with layers
			log.Println("splitting", available, "of available memory bytes into layers")
			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
			log.Println("bytes per layer:", bytesPerLayer)
			layers := available / bytesPerLayer
122
			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
123
124
125
			if layers < int64(opts.NumGPU) {
				opts.NumGPU = int(layers)
			}
126
		}
127
128
	}

Bruce MacDonald's avatar
Bruce MacDonald committed
129
130
131
	opts.NumGQA = 0
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0
132
133
	gpuInfo := gpu.GetGPUInfo()
	return newLlmServer(gpuInfo, model, adapters, projectors, opts)
134
135
136
137
138
}

// Init gives any native cgo implementations an opportunity to initialize
// with the given working directory, returning whatever error they report.
func Init(workdir string) error {
	return nativeInit(workdir)
}
140

141
142
143
144
145
146
// newLlmServer starts an extServer for the model. It tries each dynamic
// shim library reported for this GPU in order, returning the first one
// that loads; if none do (or the list reaches the "default" sentinel) it
// falls back to the built-in default server.
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
	for _, dynLib := range getShims(gpuInfo) {
		// "default" marks the end of the shim candidates
		if dynLib == "default" {
			break
		}

		srv, err := newDynamicShimExtServer(dynLib, model, adapters, projectors, opts)
		if err != nil {
			log.Printf("Failed to load dynamic library %s  %s", dynLib, err)
			continue
		}

		return srv, nil
	}

	return newDefaultExtServer(model, adapters, projectors, opts)
}