llama.go 7.08 KB
Newer Older
Jeffrey Morgan's avatar
Jeffrey Morgan committed
1
2
package llama

Michael Yang's avatar
Michael Yang committed
3
/*
Michael Yang's avatar
Michael Yang committed
4
5
6
#cgo CPPFLAGS: -O3 -DNDEBUG=1
#cgo CXXFLAGS: -std=c++11
#cgo darwin CPPFLAGS: -DGGML_USE_METAL=1 -DGGML_METAL_NDEBUG=1
Michael Yang's avatar
Michael Yang committed
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#include <stdlib.h>
#include "llama.h"

// Sampling parameters consumed by llama_sample().
struct llama_sample_options {
	// Passed to llama_sample_repetition_penalty().
	float repeat_penalty;
	// Passed together to llama_sample_frequency_and_presence_penalties().
	float frequency_penalty;
	float presence_penalty;
	// Values <= 0 make llama_sample() pick the token greedily.
	float temperature;
	// Passed to llama_sample_top_k().
	int32_t top_k;
	// Passed to llama_sample_top_p().
	float top_p;
	// Passed to llama_sample_tail_free().
	float tfs_z;
	// Passed to llama_sample_typical().
	float typical_p;
	// 1 selects llama_sample_token_mirostat(), 2 selects
	// llama_sample_token_mirostat_v2(); any other value uses the
	// top-k / tail-free / typical / top-p pipeline.
	int mirostat;
	// Mirostat tau and eta, forwarded to the mirostat samplers.
	float mirostat_tau;
	float mirostat_eta;
};

// llama_sample picks the next token from `candidates`.
//
// Repetition and frequency/presence penalties are always applied first, using
// the `n_last_tokens` entries of `last_tokens`. After that:
//   - temperature <= 0: the greedy token is returned;
//   - mirostat == 1 or 2: temperature is applied and the matching mirostat
//     sampler is used;
//   - otherwise: top-k, tail-free, typical and top-p filters are applied in
//     that order, then temperature, then a token is sampled.
llama_token llama_sample(
		struct llama_context *ctx,
		struct llama_token_data *candidates,
		size_t n_candidates,
		const llama_token *last_tokens,
		size_t n_last_tokens,
		struct llama_sample_options *opts)
{
	// Third member is initialised to false, exactly as before.
	llama_token_data_array cur = { candidates, n_candidates, false };

	llama_sample_repetition_penalty(
		ctx, &cur, last_tokens, n_last_tokens, opts->repeat_penalty);

	llama_sample_frequency_and_presence_penalties(
		ctx, &cur, last_tokens, n_last_tokens,
		opts->frequency_penalty, opts->presence_penalty);

	if (opts->temperature <= 0) {
		return llama_sample_token_greedy(ctx, &cur);
	}

	switch (opts->mirostat) {
	case 1: {
		int m = 100;
		// mu is a local seeded with 2 * tau on every call, so the value
		// written back by the sampler is not kept between calls.
		float mu = 2.0f * opts->mirostat_tau;
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token_mirostat(
			ctx, &cur, opts->mirostat_tau, opts->mirostat_eta, m, &mu);
	}
	case 2: {
		// Same per-call seeding of mu as in the mirostat v1 branch.
		float mu = 2.0f * opts->mirostat_tau;
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token_mirostat_v2(
			ctx, &cur, opts->mirostat_tau, opts->mirostat_eta, &mu);
	}
	default:
		break;
	}

	llama_sample_top_k(ctx, &cur, opts->top_k, 1);
	llama_sample_tail_free(ctx, &cur, opts->tfs_z, 1);
	llama_sample_typical(ctx, &cur, opts->typical_p, 1);
	llama_sample_top_p(ctx, &cur, opts->top_p, 1);
	llama_sample_temperature(ctx, &cur, opts->temperature);
	return llama_sample_token(ctx, &cur);
}
*/
import "C"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
80
import (
Michael Yang's avatar
Michael Yang committed
81
	"errors"
82
	"fmt"
Michael Yang's avatar
Michael Yang committed
83
84
	"io"
	"os"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
85
	"strings"
86
	"time"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
87
	"unsafe"
Michael Yang's avatar
Michael Yang committed
88
89

	"github.com/jmorganca/ollama/api"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
90
91
)

Michael Yang's avatar
Michael Yang committed
92
93
94
95
type llama struct {
	params *C.struct_llama_context_params
	model  *C.struct_llama_model
	ctx    *C.struct_llama_context
Jeffrey Morgan's avatar
Jeffrey Morgan committed
96

Michael Yang's avatar
Michael Yang committed
97
98
	api.Options
}
Jeffrey Morgan's avatar
Jeffrey Morgan committed
99

Michael Yang's avatar
Michael Yang committed
100
101
102
func New(model string, opts api.Options) (*llama, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
Jeffrey Morgan's avatar
Jeffrey Morgan committed
103
104
	}

Michael Yang's avatar
Michael Yang committed
105
106
	llm := llama{Options: opts}

Michael Yang's avatar
Michael Yang committed
107
	C.llama_backend_init(C.bool(llm.UseNUMA))
Michael Yang's avatar
Michael Yang committed
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

	params := C.llama_context_default_params()
	params.seed = C.uint(llm.Seed)
	params.n_ctx = C.int(llm.NumCtx)
	params.n_batch = C.int(llm.NumBatch)
	params.n_gpu_layers = C.int(llm.NumGPU)
	params.main_gpu = C.int(llm.MainGPU)
	params.low_vram = C.bool(llm.LowVRAM)
	params.f16_kv = C.bool(llm.F16KV)
	params.logits_all = C.bool(llm.LogitsAll)
	params.vocab_only = C.bool(llm.VocabOnly)
	params.use_mmap = C.bool(llm.UseMMap)
	params.use_mlock = C.bool(llm.UseMLock)
	params.embedding = C.bool(llm.EmbeddingOnly)
	llm.params = &params

	cModel := C.CString(model)
	defer C.free(unsafe.Pointer(cModel))

	llm.model = C.llama_load_model_from_file(cModel, params)
128
129
130
131
	if llm.model == nil {
		return nil, errors.New("failed to load model")
	}

Michael Yang's avatar
Michael Yang committed
132
	llm.ctx = C.llama_new_context_with_model(llm.model, params)
133
134
135
	if llm.ctx == nil {
		return nil, errors.New("failed to create context")
	}
Michael Yang's avatar
Michael Yang committed
136
137
138
139
140
141
142

	// warm up the model
	bos := []C.llama_token{C.llama_token_bos()}
	C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
	C.llama_reset_timings(llm.ctx)

	return &llm, nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
143
144
}

Michael Yang's avatar
Michael Yang committed
145
146
147
148
149
func (llm *llama) Close() {
	defer C.llama_free_model(llm.model)
	defer C.llama_free(llm.ctx)

	C.llama_print_timings(llm.ctx)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
150
151
}

Michael Yang's avatar
Michael Yang committed
152
153
154
155
156
157
158
159
func (llm *llama) Predict(ctx []int, prompt string, fn func(api.GenerateResponse)) error {
	if input := llm.tokenize(prompt); input != nil {
		embd := make([]C.llama_token, len(ctx))
		for i := range ctx {
			embd[i] = C.llama_token(ctx[i])
		}

		return llm.generate(append(embd, input...), fn)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
160
161
	}

Michael Yang's avatar
Michael Yang committed
162
163
	return errors.New("llama: tokenize")
}
Michael Yang's avatar
Michael Yang committed
164

Michael Yang's avatar
Michael Yang committed
165
166
167
func (llm *llama) tokenize(prompt string) []C.llama_token {
	cPrompt := C.CString(prompt)
	defer C.free(unsafe.Pointer(cPrompt))
Michael Yang's avatar
Michael Yang committed
168

Michael Yang's avatar
Michael Yang committed
169
170
171
	tokens := make([]C.llama_token, llm.NumCtx)
	if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
		return tokens[:n]
Jeffrey Morgan's avatar
Jeffrey Morgan committed
172
173
174
175
176
	}

	return nil
}

Michael Yang's avatar
Michael Yang committed
177
178
179
180
func (llm *llama) detokenize(tokens ...C.llama_token) string {
	var sb strings.Builder
	for _, token := range tokens {
		sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
Jeffrey Morgan's avatar
Jeffrey Morgan committed
181
182
	}

Michael Yang's avatar
Michael Yang committed
183
	return sb.String()
Jeffrey Morgan's avatar
Jeffrey Morgan committed
184
185
}

186
func (llm *llama) generate(input []C.llama_token, fn func(api.GenerateResponse)) error {
Michael Yang's avatar
Michael Yang committed
187
188
189
190
191
192
193
194
195
196
197
198
199
	var opts C.struct_llama_sample_options
	opts.repeat_penalty = C.float(llm.RepeatPenalty)
	opts.frequency_penalty = C.float(llm.FrequencyPenalty)
	opts.presence_penalty = C.float(llm.PresencePenalty)
	opts.temperature = C.float(llm.Temperature)
	opts.top_k = C.int(llm.TopK)
	opts.top_p = C.float(llm.TopP)
	opts.tfs_z = C.float(llm.TFSZ)
	opts.typical_p = C.float(llm.TypicalP)
	opts.mirostat = C.int(llm.Mirostat)
	opts.mirostat_tau = C.float(llm.MirostatTau)
	opts.mirostat_eta = C.float(llm.MirostatEta)

200
	output := deque[C.llama_token]{capacity: llm.NumCtx}
Michael Yang's avatar
Michael Yang committed
201

Michael Yang's avatar
Michael Yang committed
202
203
204
205
206
	context := deque[int]{capacity: llm.NumCtx / 2}
	for _, in := range input {
		context.PushLeft(int(in))
	}

Michael Yang's avatar
Michael Yang committed
207
	for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
208
		if retval := C.llama_eval(llm.ctx, unsafe.SliceData(input), C.int(len(input)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
Michael Yang's avatar
Michael Yang committed
209
210
211
			return errors.New("llama: eval")
		}

212
213
214
215
		token, err := llm.sample(output, &opts)
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
Michael Yang's avatar
Michael Yang committed
216
			return err
Michael Yang's avatar
Michael Yang committed
217
218
		}

219
220
221
222
223
224
		// call the callback
		fn(api.GenerateResponse{
			Response: llm.detokenize(token),
		})

		output.PushLeft(token)
Michael Yang's avatar
Michael Yang committed
225
		context.PushLeft(int(token))
226
227
228

		input = []C.llama_token{token}
	}
Michael Yang's avatar
Michael Yang committed
229

230
231
232
233
234
	dur := func(ms float64) time.Duration {
		d, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
		if err != nil {
			panic(err)
		}
Michael Yang's avatar
Michael Yang committed
235

236
		return d
Jeffrey Morgan's avatar
Jeffrey Morgan committed
237
238
	}

239
240
241
	timings := C.llama_get_timings(llm.ctx)
	fn(api.GenerateResponse{
		Done:               true,
Michael Yang's avatar
Michael Yang committed
242
		Context:            context.Data(),
243
244
245
246
247
248
		PromptEvalCount:    int(timings.n_p_eval),
		PromptEvalDuration: dur(float64(timings.t_p_eval_ms)),
		EvalCount:          int(timings.n_eval),
		EvalDuration:       dur(float64(timings.t_eval_ms)),
	})

Michael Yang's avatar
Michael Yang committed
249
	return nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
250
251
}

252
func (llm *llama) sample(output deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
Michael Yang's avatar
Michael Yang committed
253
254
255
	numVocab := int(C.llama_n_vocab(llm.ctx))
	logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)

256
257
258
	candidates := deque[C.struct_llama_token_data]{capacity: numVocab}
	for i := 0; i < candidates.Cap(); i++ {
		candidates.PushLeft(C.struct_llama_token_data{
Michael Yang's avatar
Michael Yang committed
259
260
261
262
263
			id:    C.int(i),
			logit: logits[i],
			p:     0,
		})
	}
Jeffrey Morgan's avatar
Jeffrey Morgan committed
264

Michael Yang's avatar
Michael Yang committed
265
266
	token := C.llama_sample(
		llm.ctx,
Michael Yang's avatar
size_t  
Michael Yang committed
267
268
		unsafe.SliceData(candidates.Data()), C.size_t(candidates.Len()),
		unsafe.SliceData(output.Data()), C.size_t(output.Len()),
Michael Yang's avatar
Michael Yang committed
269
270
271
		opts)
	if token != C.llama_token_eos() {
		return token, nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
272
	}
Michael Yang's avatar
Michael Yang committed
273
274

	return 0, io.EOF
Jeffrey Morgan's avatar
Jeffrey Morgan committed
275
}