llm.go
package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/pbnjay/memory"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

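// LLM is the interface implemented by a loaded model runner. It covers
// tokenization (Encode/Decode), streaming text generation (Predict),
// embeddings, runtime option updates, liveness checks, and shutdown.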
type LLM interface {
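	// Predict generates a completion for the prompt, seeded with the prior
	// context tokens, streaming results through the callback.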
	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	SetOptions(api.Options)
	Close()
	Ping(context.Context) error
}

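// New loads the model file at path model, applies platform-specific
// workarounds, verifies the host has enough memory for the model size,
// and returns a runner matching the model's container format.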
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

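	// Certain quantization formats are known to crash the Metal backend on
	// macOS, so force CPU-only inference for them.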
	if runtime.GOOS == "darwin" {
		switch ggml.FileType() {
		case "Q8_0":
			if ggml.Name() != "gguf" && opts.NumGPU != 0 {
				// Q8_0 in pre-gguf GGML containers does not support the
				// Metal API and will cause the runner to segfault, so
				// disable the GPU.
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
		case "F32", "Q5_0", "Q5_1":
			if opts.NumGPU != 0 {
				// F32, Q5_0, Q5_1, and Q8_0 do not support the Metal API
				// and will cause the runner to segfault, so disable the GPU.
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
		}
	}

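	// Rough minimum-memory heuristic keyed on parameter count. F16 weights
	// take roughly two bytes per parameter, hence the multiplier applied to
	// unquantized models below.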
	var requiredMemory int64
	var f16Multiplier int64 = 2
	totalResidentMemory := int64(memory.TotalMemory())
	switch ggml.ModelType() {
	case "3B", "7B":
		requiredMemory = 8 * format.GigaByte
	case "13B":
		requiredMemory = 16 * format.GigaByte
	case "30B", "34B", "40B":
		requiredMemory = 32 * format.GigaByte
	case "65B", "70B":
		requiredMemory = 64 * format.GigaByte
	case "180B":
		requiredMemory = 128 * format.GigaByte
		f16Multiplier = 4
	}

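	// Unquantized (F16) models need the multiplied amount; quantized models
	// need only the base amount.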
	if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > totalResidentMemory {
		return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))
	} else if requiredMemory > totalResidentMemory {
		return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
	}

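	// Dispatch on container format: gguf models get the gguf runner; legacy
	// ggml-family containers share the ggml runner.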
	switch ggml.Name() {
	case "gguf":
		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
	case "ggml", "ggmf", "ggjt", "ggla":
		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
	default:
		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
	}
}
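
// Usage sketch, for illustration only: the model path below is a
// placeholder, and api.DefaultOptions plus the GenerateResponse.Response
// field are assumed from the api package rather than guaranteed by this
// file.
//
//	ctx := context.Background()
//	opts := api.DefaultOptions()
//
//	runner, err := New(workDir, "/path/to/model.bin", nil, opts)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer runner.Close()
//
//	// Encode/Decode round-trip the prompt through the tokenizer.
//	tokens, err := runner.Encode(ctx, "why is the sky blue?")
//	if err != nil {
//		log.Fatal(err)
//	}
//	_ = tokens
//
//	// Predict with no prior context, streaming tokens as they arrive.
//	err = runner.Predict(ctx, nil, "why is the sky blue?", func(r api.GenerateResponse) {
//		fmt.Print(r.Response)
//	})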