llm.go

package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/pbnjay/memory"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

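// LLM is the interface implemented by model runners: token prediction,
// embeddings, tokenization (Encode/Decode), option updates, and
// lifecycle management (Ping/Close).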
type LLM interface {
	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	SetOptions(api.Options)
	Close()
	Ping(context.Context) error
}

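// New opens the model file at the given path, decodes its GGML metadata,
// verifies that the host has enough total memory for the model, and
// returns the matching llama.cpp-backed runner.
//
// A minimal usage sketch (hypothetical model path; api.DefaultOptions is
// assumed here, as provided by the ollama api package):
//
//	llm, err := New(workDir, "/path/to/model.bin", nil, api.DefaultOptions())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer llm.Close()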
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

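	// On macOS, quantization formats that the Metal backend cannot handle
	// must fall back to CPU-only inference.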
	if runtime.GOOS == "darwin" {
		switch ggml.FileType() {
		case "Q8_0":
			if ggml.Name() != "gguf" && opts.NumGPU != 0 {
				// GGML Q8_0 does not support the Metal API and will
				// cause the runner to segfault, so disable the GPU
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
		case "F32", "Q5_0", "Q5_1":
			if opts.NumGPU != 0 {
				// F32, Q5_0, Q5_1, and Q8_0 do not support the Metal API and will
				// cause the runner to segfault, so disable the GPU
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
		}
	}

	var requiredMemory int64
	var f16Multiplier int64 = 2

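	// Estimate memory needs from the parameter count; F16 weights require
	// a multiple of the quantized baseline.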
	switch ggml.ModelType() {
	case "3B", "7B":
		requiredMemory = 8 * format.GigaByte
	case "13B":
		requiredMemory = 16 * format.GigaByte
	case "30B", "34B", "40B":
		requiredMemory = 32 * format.GigaByte
	case "65B", "70B":
		requiredMemory = 64 * format.GigaByte
	case "180B":
		requiredMemory = 128 * format.GigaByte
		f16Multiplier = 4
	}

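	// Available memory is host RAM plus any detected VRAM; if VRAM cannot
	// be queried, assume there is none.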
	systemMemory := int64(memory.TotalMemory())

	videoMemory, err := CheckVRAM()
	if err != nil {
		videoMemory = 0
	}

	totalMemory := systemMemory + videoMemory

	if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > totalMemory {
		return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory))
	} else if requiredMemory > totalMemory {
		return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))
	}

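	// Pick the runner that matches the file's container format.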
	switch ggml.Name() {
	case "gguf":
		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
	case "ggml", "ggmf", "ggjt", "ggla":
		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
	default:
		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
	}
}