llm.go 3.16 KB
Newer Older
1
2
3
package llm

import (
4
	"context"
5
	"fmt"
6
	"log"
7
8
	"os"

9
10
	"github.com/pbnjay/memory"

11
12
13
14
	"github.com/jmorganca/ollama/api"
)

// LLM is the interface implemented by model runners (see newLlama). It
// covers the full lifecycle of a loaded model: generation, embeddings,
// tokenization, runtime option changes, health checks, and shutdown.
type LLM interface {
	// Predict generates a completion for the prompt string, streaming
	// results through the callback. The []int argument appears to carry
	// prior context tokens — confirm against the runner implementation.
	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
	// Embedding returns the embedding vector for the given input text.
	Embedding(context.Context, string) ([]float64, error)
	// Encode tokenizes text into model token IDs.
	Encode(context.Context, string) ([]int, error)
	// Decode converts token IDs back into text.
	Decode(context.Context, []int) (string, error)
	// SetOptions replaces the runner's generation options.
	SetOptions(api.Options)
	// Close shuts the runner down and releases its resources.
	Close()
	// Ping reports whether the underlying runner is alive and responsive.
	Ping(context.Context) error
}

24
func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
25
26
27
28
29
30
31
32
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
Michael Yang's avatar
Michael Yang committed
33
	defer f.Close()
34

Bruce MacDonald's avatar
Bruce MacDonald committed
35
	ggml, err := DecodeGGML(f)
36
37
38
39
	if err != nil {
		return nil, err
	}

Michael Yang's avatar
Michael Yang committed
40
	switch ggml.FileType() {
Bruce MacDonald's avatar
Bruce MacDonald committed
41
42
43
44
45
46
47
48
	case "Q8_0":
		if ggml.Name() != "gguf" && opts.NumGPU != 0 {
			// GGML Q8_0 do not support Metal API and will
			// cause the runner to segmentation fault so disable GPU
			log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
			opts.NumGPU = 0
		}
	case "F32", "Q5_0", "Q5_1":
49
		if opts.NumGPU != 0 {
Bruce MacDonald's avatar
Bruce MacDonald committed
50
			// F32, Q5_0, Q5_1, and Q8_0 do not support Metal API and will
51
			// cause the runner to segmentation fault so disable GPU
Michael Yang's avatar
Michael Yang committed
52
			log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
53
54
55
56
			opts.NumGPU = 0
		}
	}

57
	totalResidentMemory := memory.TotalMemory()
Michael Yang's avatar
Michael Yang committed
58
	switch ggml.ModelType() {
Michael Yang's avatar
Michael Yang committed
59
60
	case "3B", "7B":
		if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
Michael Yang's avatar
Michael Yang committed
61
62
			return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
		} else if totalResidentMemory < 8*1024*1024 {
63
64
			return nil, fmt.Errorf("model requires at least 8GB of memory")
		}
Michael Yang's avatar
Michael Yang committed
65
66
	case "13B":
		if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
Michael Yang's avatar
Michael Yang committed
67
68
			return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
		} else if totalResidentMemory < 16*1024*1024 {
69
70
			return nil, fmt.Errorf("model requires at least 16GB of memory")
		}
Michael Yang's avatar
Michael Yang committed
71
72
	case "30B", "34B", "40B":
		if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
Michael Yang's avatar
Michael Yang committed
73
74
			return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
		} else if totalResidentMemory < 32*1024*1024 {
75
76
			return nil, fmt.Errorf("model requires at least 32GB of memory")
		}
Michael Yang's avatar
Michael Yang committed
77
78
	case "65B", "70B":
		if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
Michael Yang's avatar
Michael Yang committed
79
80
			return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
		} else if totalResidentMemory < 64*1024*1024 {
81
82
			return nil, fmt.Errorf("model requires at least 64GB of memory")
		}
Michael Yang's avatar
Michael Yang committed
83
84
85
86
87
88
	case "180B":
		if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
		} else if totalResidentMemory < 128*1024*1024 {
			return nil, fmt.Errorf("model requires at least 128GB of memory")
		}
89
90
	}

Bruce MacDonald's avatar
Bruce MacDonald committed
91
92
93
	switch ggml.Name() {
	case "gguf":
		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
94
		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
Bruce MacDonald's avatar
Bruce MacDonald committed
95
	case "ggml", "ggmf", "ggjt", "ggla":
96
		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
97
	default:
Michael Yang's avatar
Michael Yang committed
98
		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
99
100
	}
}