package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/pbnjay/memory"

	"github.com/jmorganca/ollama/api"
)

type LLM interface {
16
17
18
19
	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
20
21
	SetOptions(api.Options)
	Close()
22
	Ping(context.Context) error
23
24
}

25
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
26
27
28
29
30
31
32
33
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
Michael Yang's avatar
Michael Yang committed
34
	defer f.Close()
35

Bruce MacDonald's avatar
Bruce MacDonald committed
36
	ggml, err := DecodeGGML(f)
37
38
39
40
	if err != nil {
		return nil, err
	}

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
	if runtime.GOOS == "darwin" {
		switch ggml.FileType() {
		case "Q8_0":
			if ggml.Name() != "gguf" && opts.NumGPU != 0 {
				// GGML Q8_0 do not support Metal API and will
				// cause the runner to segmentation fault so disable GPU
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
		case "F32", "Q5_0", "Q5_1":
			if opts.NumGPU != 0 {
				// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
				// cause the runner to segmentation fault so disable GPU
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
57
58
59
		}
	}

60
	totalResidentMemory := memory.TotalMemory()
Michael Yang's avatar
Michael Yang committed
61
	switch ggml.ModelType() {
Michael Yang's avatar
Michael Yang committed
62
	case "3B", "7B":
Michael Yang's avatar
Michael Yang committed
63
64
65
66
		if ggml.FileType() == "F16" && totalResidentMemory < 16*1000*1000 {
			return nil, fmt.Errorf("F16 model requires at least 16 GB of memory")
		} else if totalResidentMemory < 8*1000*1000 {
			return nil, fmt.Errorf("model requires at least 8 GB of memory")
67
		}
Michael Yang's avatar
Michael Yang committed
68
	case "13B":
Michael Yang's avatar
Michael Yang committed
69
70
71
72
		if ggml.FileType() == "F16" && totalResidentMemory < 32*1000*1000 {
			return nil, fmt.Errorf("F16 model requires at least 32 GB of memory")
		} else if totalResidentMemory < 16*1000*1000 {
			return nil, fmt.Errorf("model requires at least 16 GB of memory")
73
		}
Michael Yang's avatar
Michael Yang committed
74
	case "30B", "34B", "40B":
Michael Yang's avatar
Michael Yang committed
75
76
77
78
		if ggml.FileType() == "F16" && totalResidentMemory < 64*1000*1000 {
			return nil, fmt.Errorf("F16 model requires at least 64 GB of memory")
		} else if totalResidentMemory < 32*1000*1000 {
			return nil, fmt.Errorf("model requires at least 32 GB of memory")
79
		}
Michael Yang's avatar
Michael Yang committed
80
	case "65B", "70B":
Michael Yang's avatar
Michael Yang committed
81
82
83
84
		if ggml.FileType() == "F16" && totalResidentMemory < 128*1000*1000 {
			return nil, fmt.Errorf("F16 model requires at least 128 GB of memory")
		} else if totalResidentMemory < 64*1000*1000 {
			return nil, fmt.Errorf("model requires at least 64 GB of memory")
85
		}
Michael Yang's avatar
Michael Yang committed
86
	case "180B":
Michael Yang's avatar
Michael Yang committed
87
		if ggml.FileType() == "F16" && totalResidentMemory < 512*1000*1000 {
Michael Yang's avatar
Michael Yang committed
88
			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
Michael Yang's avatar
Michael Yang committed
89
		} else if totalResidentMemory < 128*1000*1000 {
Michael Yang's avatar
Michael Yang committed
90
91
			return nil, fmt.Errorf("model requires at least 128GB of memory")
		}
92
93
	}

Bruce MacDonald's avatar
Bruce MacDonald committed
94
95
96
	switch ggml.Name() {
	case "gguf":
		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
97
		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
Bruce MacDonald's avatar
Bruce MacDonald committed
98
	case "ggml", "ggmf", "ggjt", "ggla":
99
		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
100
	default:
Michael Yang's avatar
Michael Yang committed
101
		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
102
103
	}
}