llama.go 6.26 KB
Newer Older
Jeffrey Morgan's avatar
Jeffrey Morgan committed
1
2
package llama

Michael Yang's avatar
Michael Yang committed
3
/*
Michael Yang's avatar
Michael Yang committed
4
5
6
#cgo CPPFLAGS: -O3 -DNDEBUG=1
#cgo CXXFLAGS: -std=c++11
#cgo darwin CPPFLAGS: -DGGML_USE_METAL=1 -DGGML_METAL_NDEBUG=1
Michael Yang's avatar
Michael Yang committed
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#include <stdlib.h>
#include "llama.h"

// llama_sample_options carries the per-request sampling settings that
// llama_sample reads. Every field is consumed by exactly one sampler call
// (or branch) in llama_sample below.
struct llama_sample_options
{
	float repeat_penalty; // passed to llama_sample_repetition_penalty
	float frequency_penalty; // passed to llama_sample_frequency_and_presence_penalties
	float presence_penalty; // passed to llama_sample_frequency_and_presence_penalties
	float temperature; // <= 0 makes llama_sample return the greedy token
	int32_t top_k; // passed to llama_sample_top_k
	float top_p; // passed to llama_sample_top_p
	float tfs_z; // passed to llama_sample_tail_free
	float typical_p; // passed to llama_sample_typical
	int mirostat; // 1 or 2 selects a mirostat sampler; any other value uses the top-k/top-p chain
	float mirostat_tau; // passed to the mirostat samplers; also seeds mu as 2 * tau
	float mirostat_eta; // passed to the mirostat samplers
};

// llama_sample chooses the next token from `candidates`.
//
// The repetition penalty and the frequency/presence penalties are always
// applied first, using the `n_last_tokens` entries of `last_tokens`. After
// that:
//   - temperature <= 0      -> greedy pick
//   - mirostat == 1 or 2    -> temperature, then the matching mirostat sampler
//   - anything else         -> top-k, tail-free, typical, top-p, temperature,
//                              then a regular sampled pick
//
// Returns the selected token id.
llama_token llama_sample(
		struct llama_context *ctx,
		struct llama_token_data *candidates,
		size_t n_candidates,
		const llama_token *last_tokens,
		size_t n_last_tokens,
		struct llama_sample_options *opts)
{
	// Wrap the raw candidate buffer; the third member marks it as unsorted.
	llama_token_data_array cur = {
		candidates,
		n_candidates,
		false,
	};

	llama_sample_repetition_penalty(
		ctx, &cur,
		last_tokens, n_last_tokens,
		opts->repeat_penalty);

	llama_sample_frequency_and_presence_penalties(
		ctx, &cur,
		last_tokens, n_last_tokens,
		opts->frequency_penalty, opts->presence_penalty);

	if (opts->temperature <= 0) {
		return llama_sample_token_greedy(ctx, &cur);
	}

	switch (opts->mirostat) {
	case 1: {
		// NOTE(review): mu is re-seeded from tau on every call, so nothing is
		// carried over between tokens - confirm this is intended.
		int m = 100;
		float mu = 2.0f * opts->mirostat_tau;
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token_mirostat(
			ctx, &cur,
			opts->mirostat_tau, opts->mirostat_eta,
			m, &mu);
	}
	case 2: {
		float mu = 2.0f * opts->mirostat_tau;
		llama_sample_temperature(ctx, &cur, opts->temperature);
		return llama_sample_token_mirostat_v2(
			ctx, &cur,
			opts->mirostat_tau, opts->mirostat_eta,
			&mu);
	}
	default:
		break;
	}

	// Default chain: narrow the candidates, then apply temperature and sample.
	llama_sample_top_k(ctx, &cur, opts->top_k, 1);
	llama_sample_tail_free(ctx, &cur, opts->tfs_z, 1);
	llama_sample_typical(ctx, &cur, opts->typical_p, 1);
	llama_sample_top_p(ctx, &cur, opts->top_p, 1);
	llama_sample_temperature(ctx, &cur, opts->temperature);
	return llama_sample_token(ctx, &cur);
}
*/
import "C"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
80
import (
Michael Yang's avatar
Michael Yang committed
81
82
83
	"errors"
	"io"
	"os"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
84
85
	"strings"
	"unsafe"
Michael Yang's avatar
Michael Yang committed
86
87

	"github.com/jmorganca/ollama/api"
Jeffrey Morgan's avatar
Jeffrey Morgan committed
88
89
)

Michael Yang's avatar
Michael Yang committed
90
91
92
93
// llama holds the C-side handles for one loaded model together with the
// options it was created with. Values are built by New and released by Close.
type llama struct {
	params *C.struct_llama_context_params // context parameters New filled in from Options
	model  *C.struct_llama_model          // handle returned by llama_load_model_from_file
	ctx    *C.struct_llama_context        // handle returned by llama_new_context_with_model

	api.Options
}
Jeffrey Morgan's avatar
Jeffrey Morgan committed
97

Michael Yang's avatar
Michael Yang committed
98
99
100
func New(model string, opts api.Options) (*llama, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
Jeffrey Morgan's avatar
Jeffrey Morgan committed
101
102
	}

Michael Yang's avatar
Michael Yang committed
103
104
	llm := llama{Options: opts}

Michael Yang's avatar
Michael Yang committed
105
	C.llama_backend_init(C.bool(llm.UseNUMA))
Michael Yang's avatar
Michael Yang committed
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

	params := C.llama_context_default_params()
	params.seed = C.uint(llm.Seed)
	params.n_ctx = C.int(llm.NumCtx)
	params.n_batch = C.int(llm.NumBatch)
	params.n_gpu_layers = C.int(llm.NumGPU)
	params.main_gpu = C.int(llm.MainGPU)
	params.low_vram = C.bool(llm.LowVRAM)
	params.f16_kv = C.bool(llm.F16KV)
	params.logits_all = C.bool(llm.LogitsAll)
	params.vocab_only = C.bool(llm.VocabOnly)
	params.use_mmap = C.bool(llm.UseMMap)
	params.use_mlock = C.bool(llm.UseMLock)
	params.embedding = C.bool(llm.EmbeddingOnly)
	llm.params = &params

	cModel := C.CString(model)
	defer C.free(unsafe.Pointer(cModel))

	llm.model = C.llama_load_model_from_file(cModel, params)
126
127
128
129
	if llm.model == nil {
		return nil, errors.New("failed to load model")
	}

Michael Yang's avatar
Michael Yang committed
130
	llm.ctx = C.llama_new_context_with_model(llm.model, params)
131
132
133
	if llm.ctx == nil {
		return nil, errors.New("failed to create context")
	}
Michael Yang's avatar
Michael Yang committed
134
135
136
137
138
139
140

	// warm up the model
	bos := []C.llama_token{C.llama_token_bos()}
	C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
	C.llama_reset_timings(llm.ctx)

	return &llm, nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
141
142
}

Michael Yang's avatar
Michael Yang committed
143
144
145
146
147
func (llm *llama) Close() {
	defer C.llama_free_model(llm.model)
	defer C.llama_free(llm.ctx)

	C.llama_print_timings(llm.ctx)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
148
149
}

Michael Yang's avatar
Michael Yang committed
150
151
152
func (llm *llama) Predict(prompt string, fn func(string)) error {
	if tokens := llm.tokenize(prompt); tokens != nil {
		return llm.generate(tokens, fn)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
153
154
	}

Michael Yang's avatar
Michael Yang committed
155
156
	return errors.New("llama: tokenize")
}
Michael Yang's avatar
Michael Yang committed
157

Michael Yang's avatar
Michael Yang committed
158
159
160
func (llm *llama) tokenize(prompt string) []C.llama_token {
	cPrompt := C.CString(prompt)
	defer C.free(unsafe.Pointer(cPrompt))
Michael Yang's avatar
Michael Yang committed
161

Michael Yang's avatar
Michael Yang committed
162
163
164
	tokens := make([]C.llama_token, llm.NumCtx)
	if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
		return tokens[:n]
Jeffrey Morgan's avatar
Jeffrey Morgan committed
165
166
167
168
169
	}

	return nil
}

Michael Yang's avatar
Michael Yang committed
170
171
172
173
func (llm *llama) detokenize(tokens ...C.llama_token) string {
	var sb strings.Builder
	for _, token := range tokens {
		sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
Jeffrey Morgan's avatar
Jeffrey Morgan committed
174
175
	}

Michael Yang's avatar
Michael Yang committed
176
	return sb.String()
Jeffrey Morgan's avatar
Jeffrey Morgan committed
177
178
}

Michael Yang's avatar
Michael Yang committed
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
func (llm *llama) generate(tokens []C.llama_token, fn func(string)) error {
	var opts C.struct_llama_sample_options
	opts.repeat_penalty = C.float(llm.RepeatPenalty)
	opts.frequency_penalty = C.float(llm.FrequencyPenalty)
	opts.presence_penalty = C.float(llm.PresencePenalty)
	opts.temperature = C.float(llm.Temperature)
	opts.top_k = C.int(llm.TopK)
	opts.top_p = C.float(llm.TopP)
	opts.tfs_z = C.float(llm.TFSZ)
	opts.typical_p = C.float(llm.TypicalP)
	opts.mirostat = C.int(llm.Mirostat)
	opts.mirostat_tau = C.float(llm.MirostatTau)
	opts.mirostat_eta = C.float(llm.MirostatEta)

	pastTokens := deque[C.llama_token]{capacity: llm.RepeatLastN}

	for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
		if retval := C.llama_eval(llm.ctx, unsafe.SliceData(tokens), C.int(len(tokens)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
			return errors.New("llama: eval")
		}

		token, err := llm.sample(pastTokens, &opts)
		switch {
		case errors.Is(err, io.EOF):
			return nil
Michael Yang's avatar
Michael Yang committed
204
205
		case err != nil:
			return err
Michael Yang's avatar
Michael Yang committed
206
207
208
209
210
211
212
		}

		fn(llm.detokenize(token))

		tokens = []C.llama_token{token}

		pastTokens.PushLeft(token)
Jeffrey Morgan's avatar
Jeffrey Morgan committed
213
214
	}

Michael Yang's avatar
Michael Yang committed
215
	return nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
216
217
}

Michael Yang's avatar
Michael Yang committed
218
219
220
221
222
223
224
225
226
227
228
229
func (llm *llama) sample(pastTokens deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
	numVocab := int(C.llama_n_vocab(llm.ctx))
	logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)

	candidates := make([]C.struct_llama_token_data, 0, numVocab)
	for i := 0; i < numVocab; i++ {
		candidates = append(candidates, C.llama_token_data{
			id:    C.int(i),
			logit: logits[i],
			p:     0,
		})
	}
Jeffrey Morgan's avatar
Jeffrey Morgan committed
230

Michael Yang's avatar
Michael Yang committed
231
232
233
234
235
236
237
	token := C.llama_sample(
		llm.ctx,
		unsafe.SliceData(candidates), C.ulong(len(candidates)),
		unsafe.SliceData(pastTokens.Data()), C.ulong(pastTokens.Len()),
		opts)
	if token != C.llama_token_eos() {
		return token, nil
Jeffrey Morgan's avatar
Jeffrey Morgan committed
238
	}
Michael Yang's avatar
Michael Yang committed
239
240

	return 0, io.EOF
Jeffrey Morgan's avatar
Jeffrey Morgan committed
241
}