llama.go 7.27 KB
Newer Older
Jeffrey Morgan's avatar
Jeffrey Morgan committed
1
2
package llama

Michael Yang's avatar
Michael Yang committed
3
/*
Michael Yang's avatar
Michael Yang committed
4
#cgo CPPFLAGS: -O3 -DNDEBUG=1 -DGGML_USE_K_QUANTS
Michael Yang's avatar
Michael Yang committed
5
#cgo CXXFLAGS: -std=c++11
Michael Yang's avatar
Michael Yang committed
6
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG
Michael Yang's avatar
Michael Yang committed
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#include <stdlib.h>
#include "llama.h"

// Sampling settings consumed by llama_sample below. The Go side fills one of
// these per generation and passes a pointer to every llama_sample call.
struct llama_sample_options
{
	// forwarded to llama_sample_repetition_penalty
	float repeat_penalty;
	// both forwarded to llama_sample_frequency_and_presence_penalties
	float frequency_penalty;
	float presence_penalty;
	// values <= 0 make llama_sample pick the token greedily
	float temperature;
	// forwarded to llama_sample_top_k
	int32_t top_k;
	// forwarded to llama_sample_top_p
	float top_p;
	// forwarded to llama_sample_tail_free
	float tfs_z;
	// forwarded to llama_sample_typical
	float typical_p;
	// 1 selects llama_sample_token_mirostat, 2 selects
	// llama_sample_token_mirostat_v2, any other value uses the
	// top-k / tail-free / typical / top-p path
	int mirostat;
	// tau and eta are forwarded to the mirostat samplers; tau also seeds mu
	float mirostat_tau;
	float mirostat_eta;
};

// Picks the next token from `candidates` according to `opts`.
//
// Repetition and frequency/presence penalties are always applied first, using
// the `n_last_tokens` entries of `last_tokens`. After that:
//   - temperature <= 0       -> greedy pick
//   - mirostat == 1 or 2     -> temperature scaling, then the matching
//                               mirostat sampler
//   - anything else          -> top-k, tail-free, typical, top-p, temperature,
//                               then a regular sample
//
// Returns the token chosen by the selected llama.cpp sampler.
llama_token llama_sample(
		struct llama_context *ctx,
		struct llama_token_data *candidates,
		size_t n_candidates,
		const llama_token *last_tokens,
		size_t n_last_tokens,
		struct llama_sample_options *opts)
{
	llama_token_data_array cur = { candidates, n_candidates, false };

	llama_sample_repetition_penalty(
		ctx, &cur, last_tokens, n_last_tokens, opts->repeat_penalty);

	llama_sample_frequency_and_presence_penalties(
		ctx, &cur, last_tokens, n_last_tokens,
		opts->frequency_penalty, opts->presence_penalty);

	if (opts->temperature <= 0) {
		return llama_sample_token_greedy(ctx, &cur);
	}

	switch (opts->mirostat) {
	case 1: {
		int m = 100;
		// mu is derived from tau on every call; nothing is carried over
		// between calls.
		float mu = 2.0f * opts->mirostat_tau;
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token_mirostat(
			ctx, &cur, opts->mirostat_tau, opts->mirostat_eta, m, &mu);
	}
	case 2: {
		// same per-call initialisation of mu as in the v1 branch
		float mu = 2.0f * opts->mirostat_tau;
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token_mirostat_v2(
			ctx, &cur, opts->mirostat_tau, opts->mirostat_eta, &mu);
	}
	default:
		llama_sample_top_k(ctx, &cur, opts->top_k, 1);
		llama_sample_tail_free(ctx, &cur, opts->tfs_z, 1);
		llama_sample_typical(ctx, &cur, opts->typical_p, 1);
		llama_sample_top_p(ctx, &cur, opts->top_p, 1);
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token(ctx, &cur);
	}
}
*/
import "C"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
80
import (
Michael Yang's avatar
Michael Yang committed
81
	"bytes"
Michael Yang's avatar
Michael Yang committed
82
	"errors"
83
	"fmt"
Michael Yang's avatar
Michael Yang committed
84
85
	"io"
	"os"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
86
	"strings"
87
	"time"
Michael Yang's avatar
Michael Yang committed
88
	"unicode/utf8"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
89
	"unsafe"
Michael Yang's avatar
Michael Yang committed
90
91

	"github.com/jmorganca/ollama/api"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
92
93
)

Michael Yang's avatar
Michael Yang committed
94
95
96
97
type llama struct {
	params *C.struct_llama_context_params
	model  *C.struct_llama_model
	ctx    *C.struct_llama_context
Jeffrey Morgan's avatar
Jeffrey Morgan committed
98

Michael Yang's avatar
Michael Yang committed
99
100
	api.Options
}
Jeffrey Morgan's avatar
Jeffrey Morgan committed
101

Michael Yang's avatar
Michael Yang committed
102
103
104
func New(model string, opts api.Options) (*llama, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
Jeffrey Morgan's avatar
Jeffrey Morgan committed
105
106
	}

Michael Yang's avatar
Michael Yang committed
107
108
	llm := llama{Options: opts}

Michael Yang's avatar
Michael Yang committed
109
	C.llama_backend_init(C.bool(llm.UseNUMA))
Michael Yang's avatar
Michael Yang committed
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

	params := C.llama_context_default_params()
	params.seed = C.uint(llm.Seed)
	params.n_ctx = C.int(llm.NumCtx)
	params.n_batch = C.int(llm.NumBatch)
	params.n_gpu_layers = C.int(llm.NumGPU)
	params.main_gpu = C.int(llm.MainGPU)
	params.low_vram = C.bool(llm.LowVRAM)
	params.f16_kv = C.bool(llm.F16KV)
	params.logits_all = C.bool(llm.LogitsAll)
	params.vocab_only = C.bool(llm.VocabOnly)
	params.use_mmap = C.bool(llm.UseMMap)
	params.use_mlock = C.bool(llm.UseMLock)
	params.embedding = C.bool(llm.EmbeddingOnly)
	llm.params = &params

	cModel := C.CString(model)
	defer C.free(unsafe.Pointer(cModel))

	llm.model = C.llama_load_model_from_file(cModel, params)
130
131
132
133
	if llm.model == nil {
		return nil, errors.New("failed to load model")
	}

Michael Yang's avatar
Michael Yang committed
134
	llm.ctx = C.llama_new_context_with_model(llm.model, params)
135
136
137
	if llm.ctx == nil {
		return nil, errors.New("failed to create context")
	}
Michael Yang's avatar
Michael Yang committed
138
139
140
141
142
143
144

	// warm up the model
	bos := []C.llama_token{C.llama_token_bos()}
	C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
	C.llama_reset_timings(llm.ctx)

	return &llm, nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
145
146
}

Michael Yang's avatar
Michael Yang committed
147
148
149
150
151
func (llm *llama) Close() {
	defer C.llama_free_model(llm.model)
	defer C.llama_free(llm.ctx)

	C.llama_print_timings(llm.ctx)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
152
153
}

Michael Yang's avatar
Michael Yang committed
154
155
156
157
158
159
160
161
func (llm *llama) Predict(ctx []int, prompt string, fn func(api.GenerateResponse)) error {
	if input := llm.tokenize(prompt); input != nil {
		embd := make([]C.llama_token, len(ctx))
		for i := range ctx {
			embd[i] = C.llama_token(ctx[i])
		}

		return llm.generate(append(embd, input...), fn)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
162
163
	}

Michael Yang's avatar
Michael Yang committed
164
165
	return errors.New("llama: tokenize")
}
Michael Yang's avatar
Michael Yang committed
166

Michael Yang's avatar
Michael Yang committed
167
168
169
func (llm *llama) tokenize(prompt string) []C.llama_token {
	cPrompt := C.CString(prompt)
	defer C.free(unsafe.Pointer(cPrompt))
Michael Yang's avatar
Michael Yang committed
170

171
	tokens := make([]C.llama_token, len(prompt)+1)
Michael Yang's avatar
Michael Yang committed
172
173
	if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
		return tokens[:n]
Jeffrey Morgan's avatar
Jeffrey Morgan committed
174
175
176
177
178
	}

	return nil
}

Michael Yang's avatar
Michael Yang committed
179
180
181
182
func (llm *llama) detokenize(tokens ...C.llama_token) string {
	var sb strings.Builder
	for _, token := range tokens {
		sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
Jeffrey Morgan's avatar
Jeffrey Morgan committed
183
184
	}

Michael Yang's avatar
Michael Yang committed
185
	return sb.String()
Jeffrey Morgan's avatar
Jeffrey Morgan committed
186
187
}

188
func (llm *llama) generate(input []C.llama_token, fn func(api.GenerateResponse)) error {
Michael Yang's avatar
Michael Yang committed
189
190
191
192
193
194
195
196
197
198
199
200
201
	var opts C.struct_llama_sample_options
	opts.repeat_penalty = C.float(llm.RepeatPenalty)
	opts.frequency_penalty = C.float(llm.FrequencyPenalty)
	opts.presence_penalty = C.float(llm.PresencePenalty)
	opts.temperature = C.float(llm.Temperature)
	opts.top_k = C.int(llm.TopK)
	opts.top_p = C.float(llm.TopP)
	opts.tfs_z = C.float(llm.TFSZ)
	opts.typical_p = C.float(llm.TypicalP)
	opts.mirostat = C.int(llm.Mirostat)
	opts.mirostat_tau = C.float(llm.MirostatTau)
	opts.mirostat_eta = C.float(llm.MirostatEta)

202
	output := deque[C.llama_token]{capacity: llm.NumCtx}
Michael Yang's avatar
Michael Yang committed
203

Michael Yang's avatar
Michael Yang committed
204
205
206
207
208
	context := deque[int]{capacity: llm.NumCtx / 2}
	for _, in := range input {
		context.PushLeft(int(in))
	}

Michael Yang's avatar
Michael Yang committed
209
	var b bytes.Buffer
Michael Yang's avatar
Michael Yang committed
210
	for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
211
		if retval := C.llama_eval(llm.ctx, unsafe.SliceData(input), C.int(len(input)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
Michael Yang's avatar
Michael Yang committed
212
213
214
			return errors.New("llama: eval")
		}

215
216
217
218
		token, err := llm.sample(output, &opts)
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
Michael Yang's avatar
Michael Yang committed
219
			return err
Michael Yang's avatar
Michael Yang committed
220
221
		}

Michael Yang's avatar
Michael Yang committed
222
223
224
225
226
227
		b.WriteString(llm.detokenize(token))
		if utf8.Valid(b.Bytes()) || b.Len() >= utf8.UTFMax {
			// call the callback
			fn(api.GenerateResponse{
				Response: b.String(),
			})
228

Michael Yang's avatar
Michael Yang committed
229
230
231
232
			output.PushLeft(token)
			context.PushLeft(int(token))
			b.Reset()
		}
233
234
235

		input = []C.llama_token{token}
	}
Michael Yang's avatar
Michael Yang committed
236

237
238
239
240
241
	dur := func(ms float64) time.Duration {
		d, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
		if err != nil {
			panic(err)
		}
Michael Yang's avatar
Michael Yang committed
242

243
		return d
Jeffrey Morgan's avatar
Jeffrey Morgan committed
244
245
	}

246
247
248
	timings := C.llama_get_timings(llm.ctx)
	fn(api.GenerateResponse{
		Done:               true,
Michael Yang's avatar
Michael Yang committed
249
		Context:            context.Data(),
250
251
252
253
254
255
		PromptEvalCount:    int(timings.n_p_eval),
		PromptEvalDuration: dur(float64(timings.t_p_eval_ms)),
		EvalCount:          int(timings.n_eval),
		EvalDuration:       dur(float64(timings.t_eval_ms)),
	})

Michael Yang's avatar
Michael Yang committed
256
	return nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
257
258
}

259
func (llm *llama) sample(output deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
Michael Yang's avatar
Michael Yang committed
260
261
262
	numVocab := int(C.llama_n_vocab(llm.ctx))
	logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)

263
264
265
	candidates := deque[C.struct_llama_token_data]{capacity: numVocab}
	for i := 0; i < candidates.Cap(); i++ {
		candidates.PushLeft(C.struct_llama_token_data{
Michael Yang's avatar
Michael Yang committed
266
267
268
269
270
			id:    C.int(i),
			logit: logits[i],
			p:     0,
		})
	}
Jeffrey Morgan's avatar
Jeffrey Morgan committed
271

Michael Yang's avatar
Michael Yang committed
272
273
	token := C.llama_sample(
		llm.ctx,
Michael Yang's avatar
size_t  
Michael Yang committed
274
275
		unsafe.SliceData(candidates.Data()), C.size_t(candidates.Len()),
		unsafe.SliceData(output.Data()), C.size_t(output.Len()),
Michael Yang's avatar
Michael Yang committed
276
277
278
		opts)
	if token != C.llama_token_eos() {
		return token, nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
279
	}
Michael Yang's avatar
Michael Yang committed
280
281

	return 0, io.EOF
Jeffrey Morgan's avatar
Jeffrey Morgan committed
282
}