llama.go 5.39 KB
Newer Older
1
2
3
4
5
package llm

import (
	"bytes"
	"context"
6
	_ "embed"
7
8
	"errors"
	"fmt"
9
10
11
	"io"
	"io/fs"
	"log"
12
13
	"os"
	"os/exec"
14
	"path/filepath"
15
	"sync"
16
17
18
	"time"

	"github.com/jmorganca/ollama/api"
Michael Yang's avatar
Michael Yang committed
19
	"github.com/jmorganca/ollama/format"
20
21
)

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// jsonGrammar is a GBNF (llama.cpp grammar) definition that constrains
// token sampling to syntactically valid JSON, with a JSON object at the
// root. It is presumably passed to the llama runner when the caller
// requests JSON-formatted output — confirm against the predict path.
const jsonGrammar = `
root   ::= object
value  ::= object | array | string | number | ("true" | "false" | "null") ws

object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
  )? "}" ws

array  ::=
  "[" ws (
            value
    ("," ws value)*
  )? "]" ws

string ::=
  "\"" (
    [^"\\] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws

number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws

# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
// llamaModel describes a model in the llama architecture family.
type llamaModel struct {
	// hyperparameters holds the model dimensions read from the model file.
	hyperparameters llamaHyperparameters
}
// ModelFamily returns the architecture family name for this model.
func (llm *llamaModel) ModelFamily() string {
	return "llama"
}
// llamaModelType maps a transformer layer count to the conventional
// parameter-size label for llama-family models (e.g. 32 layers -> "7B").
// Unrecognized layer counts yield "unknown".
func llamaModelType(numLayer uint32) string {
	label := "unknown"
	switch numLayer {
	case 26:
		label = "3B"
	case 32:
		label = "7B"
	case 40:
		label = "13B"
	case 48:
		label = "34B"
	case 60:
		label = "30B"
	case 80:
		label = "65B"
	}
	return label
}
// ModelType returns the parameter-size label (e.g. "7B") derived from
// the model's layer count.
func (llm *llamaModel) ModelType() string {
	return llamaModelType(llm.hyperparameters.NumLayer)
}
// FileType returns a human-readable name for the model's quantization
// level, decoded from the FileType hyperparameter (e.g. Q4_0).
func (llm *llamaModel) FileType() string {
	return fileType(llm.hyperparameters.FileType)
}
// NumLayers returns the number of transformer layers in the model.
func (llm *llamaModel) NumLayers() int64 {
	return int64(llm.hyperparameters.NumLayer)
}
type llamaHyperparameters struct {
	// NumVocab is the size of the model's vocabulary.
	NumVocab uint32

	// NumEmbd is the size of the model's embedding layer.
	NumEmbd uint32
	NumMult uint32
	NumHead uint32

	// NumLayer is the number of layers in the model.
	NumLayer uint32
	NumRot   uint32

	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
Michael Yang's avatar
Michael Yang committed
103
	FileType uint32
104
105
106
}

// Running tracks a llama runner subprocess serving a loaded model.
type Running struct {
	Port          int                // local port the runner listens on
	Cmd           *exec.Cmd          // the runner subprocess
	Cancel        context.CancelFunc // cancels the subprocess's context
	exitOnce      sync.Once          // guards one-time consumption of the exit status
	exitCh        chan error // channel to receive the exit status of the subprocess
	*StatusWriter            // captures error messages from the llama runner process
}
// ImageData is an image payload attached to a request, serialized as
// JSON for the runner.
type ImageData struct {
	Data []byte `json:"data"` // raw image bytes
	ID   int    `json:"id"`   // identifier — presumably referenced from the prompt; confirm with caller
}
var (
	// errNvidiaSMI is reported when GPU probing via nvidia-smi fails.
	errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
	// errAvailableVRAM is reported when the GPU lacks memory for the model.
	errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
	// payloadMissing is returned when the embedded runner payload is absent
	// from this build. errors.New replaces fmt.Errorf here since there is
	// no formatting to do (staticcheck S1039).
	payloadMissing = errors.New("expected payload not included in this build of ollama")
)
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
	// ErrCh delivers parsed error lines to an interested receiver.
	ErrCh chan error
	// LastErrMsg retains the most recently parsed error message.
	LastErrMsg string
}
// NewStatusWriter returns a StatusWriter whose error channel is buffered
// with capacity 1, so one error can be recorded without a waiting receiver.
func NewStatusWriter() *StatusWriter {
	return &StatusWriter{
		ErrCh: make(chan error, 1),
	}
}
func (w *StatusWriter) Write(b []byte) (int, error) {
139
	var errMsg string
140
	if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
141
142
143
		errMsg = string(bytes.TrimSpace(after))
	} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {
		errMsg = string(bytes.TrimSpace(after))
144
	}
145
146
147
148
149
150

	if errMsg != "" {
		w.LastErrMsg = errMsg
		w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg)
	}

151
152
153
	return os.Stderr.Write(b)
}

Michael Yang's avatar
Michael Yang committed
154
// prediction is a single streamed completion response decoded from the
// llama runner.
type prediction struct {
	Content string `json:"content"` // generated text for this chunk
	Model   string `json:"model"`
	Prompt  string `json:"prompt"`
	Stop    bool   `json:"stop"` // true on the final chunk

	// Timings reports token counts and elapsed milliseconds for the
	// prompt-evaluation and generation phases.
	Timings struct {
		PredictedN  int     `json:"predicted_n"`
		PredictedMS float64 `json:"predicted_ms"`
		PromptN     int     `json:"prompt_n"`
		PromptMS    float64 `json:"prompt_ms"`
	}
}
// maxBufferSize caps the buffer used when reading streamed responses
// from the runner.
const maxBufferSize = 512 * format.KiloByte

// maxRetries and retryDelay bound how often, and how quickly, a failed
// request to the runner is retried.
const maxRetries = 3
const retryDelay = 1 * time.Second
// PredictOpts are the caller-supplied options for a single completion
// request.
type PredictOpts struct {
	Prompt string          // fully rendered prompt text
	Format string          // requested response format — presumably "json" selects jsonGrammar; confirm in the predict path
	Images []api.ImageData // images for multimodal models
}
// PredictResult is one chunk of a streamed completion, with evaluation
// counts and durations populated from the runner's timing data.
type PredictResult struct {
	Content            string        // generated text
	Done               bool          // true on the final chunk
	PromptEvalCount    int           // tokens evaluated in the prompt
	PromptEvalDuration time.Duration // time spent evaluating the prompt
	EvalCount          int           // tokens generated
	EvalDuration       time.Duration // time spent generating
}
// TokenizeRequest is the JSON request body for tokenizing text.
type TokenizeRequest struct {
	Content string `json:"content"`
}

// TokenizeResponse carries the token ids produced for the submitted content.
type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

// DetokenizeRequest is the JSON request body for converting token ids
// back into text.
type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

// DetokenizeResponse carries the text reconstructed from the tokens.
type DetokenizeResponse struct {
	Content string `json:"content"`
}

// EmbeddingRequest is the JSON request body for computing an embedding.
type EmbeddingRequest struct {
	Content string `json:"content"`
}

// EmbeddingResponse carries the embedding vector for the content.
type EmbeddingResponse struct {
	Embedding []float64 `json:"embedding"`
}
// extractLib locates the embedded payload matching glob in libEmbed and
// copies it into workDir, creating the directory if necessary. A file
// already present at the destination is left untouched. It returns
// payloadMissing when no payload matches, and wraps all other failures
// with %w so callers can use errors.Is/errors.As.
func extractLib(workDir, glob string) error {
	files, err := fs.Glob(libEmbed, glob)
	if err != nil || len(files) == 0 {
		return payloadMissing
	}

	if len(files) != 1 {
		// Shouldn't happen, but just use the first one we find
		log.Printf("WARNING: multiple payloads detected - using %s", files[0])
	}

	srcFile, err := libEmbed.Open(files[0])
	if err != nil {
		return fmt.Errorf("read payload %s: %w", files[0], err)
	}
	defer srcFile.Close()
	if err := os.MkdirAll(workDir, 0o755); err != nil {
		return fmt.Errorf("create payload temp dir %s: %w", workDir, err)
	}

	destFile := filepath.Join(workDir, filepath.Base(files[0]))

	_, err = os.Stat(destFile)
	switch {
	case errors.Is(err, os.ErrNotExist):
		out, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
		if err != nil {
			return fmt.Errorf("write payload %s: %w", files[0], err)
		}
		if _, err := io.Copy(out, srcFile); err != nil {
			out.Close()
			return fmt.Errorf("copy payload %s: %w", files[0], err)
		}
		// Close explicitly and check the error: a deferred Close would
		// silently drop write errors flushed at close time.
		if err := out.Close(); err != nil {
			return fmt.Errorf("close payload %s: %w", files[0], err)
		}
	case err != nil:
		return fmt.Errorf("stat payload %s: %w", files[0], err)
	}
	return nil
}