package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/pbnjay/memory"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

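// LLM is the interface implemented by a running model instance. It covers
// text generation (Predict), embeddings, tokenization (Encode/Decode),
// runtime option updates, health checks, and shutdown.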
type LLM interface {
	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	SetOptions(api.Options)
	Close()
	Ping(context.Context) error
}

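// New loads the model file at the given path and returns a runner for it.
// It decodes the file's GGML metadata, disables Metal and enforces rough
// memory requirements on macOS, then picks a runner based on the container
// format (GGUF vs. legacy GGML variants).
//
// A minimal usage sketch (the model path and options are illustrative):
//
//	llm, err := New(workDir, "/path/to/model.gguf", nil, api.DefaultOptions())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer llm.Close()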
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

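	// On macOS, guard against quantizations known to crash under Metal and
	// against models that need more memory than the machine has.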
	if runtime.GOOS == "darwin" {
		switch ggml.FileType() {
		case "Q8_0":
			if ggml.Name() != "gguf" && opts.NumGPU != 0 {
				// Q8_0 in the legacy GGML (non-GGUF) format does not support
				// the Metal API and will cause the runner to segfault, so
				// disable the GPU
				log.Printf("WARNING: GPU disabled for Q8_0 in non-GGUF format")
				opts.NumGPU = 0
			}
		case "F32", "Q5_0", "Q5_1":
			if opts.NumGPU != 0 {
				// F32, Q5_0, and Q5_1 do not support the Metal API and will
				// cause the runner to segfault, so disable the GPU
				log.Printf("WARNING: GPU disabled for F32, Q5_0, and Q5_1")
				opts.NumGPU = 0
			}
		}

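		// Rough memory requirements by parameter count, assuming a quantized
		// model; F16 weights need roughly f16Multiplier times as much.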
		var requiredMemory int64
		var f16Multiplier int64 = 2

		switch ggml.ModelType() {
		case "3B", "7B":
			requiredMemory = 8 * format.GigaByte
		case "13B":
			requiredMemory = 16 * format.GigaByte
		case "30B", "34B", "40B":
			requiredMemory = 32 * format.GigaByte
		case "65B", "70B":
			requiredMemory = 64 * format.GigaByte
		case "180B":
			requiredMemory = 128 * format.GigaByte
			f16Multiplier = 4
		}

		systemMemory := int64(memory.TotalMemory())

		if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
			return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory*f16Multiplier))
		} else if requiredMemory > systemMemory {
			return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))
		}
	}

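	// Dispatch on the container format: GGUF files use the newer llama.cpp
	// runner, legacy GGML variants use the older one.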
	switch ggml.Name() {
	case "gguf":
		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
	case "ggml", "ggmf", "ggjt", "ggla":
		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
	default:
		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
	}
}