dyn_ext_server.go 11.3 KB
Newer Older
1
2
3
package llm

/*
4
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
5
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
6
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
7
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
8
9
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
Daniel Hiltgen's avatar
Daniel Hiltgen committed
10
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
11
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
Daniel Hiltgen's avatar
Daniel Hiltgen committed
12
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
13
#cgo linux CFLAGS: -D_GNU_SOURCE
14
15
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
16
17

#include <stdlib.h>
18
#include "dyn_ext_server.h"
19
20
21

*/
import "C"
22

23
24
25
26
27
import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"
	"unsafe"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/gpu"
)

40
41
// dynExtServer is an LLM implementation backed by a dynamically loaded
// llama.cpp ext_server shared library.
type dynExtServer struct {
	s       C.struct_dynamic_llama_server // C-side handle to the loaded library's entry points
	options *api.Options                  // options the server was initialized with
}

// Note: current implementation does not support concurrent instantiations
// mutex serializes the single in-process server: taken in newDynExtServer,
// released on init failure or in Close.
var mutex sync.Mutex

48
49
50
51
52
53
// newExtServerResp allocates a C-owned message buffer of the given size for
// the ext_server C code to write error text into. The caller must release it
// with freeExtServerResp.
//
// Renamed the parameter (was `len`, shadowing the builtin) and the local
// (was `bytes`, shadowing the imported bytes package).
func newExtServerResp(size C.size_t) C.ext_server_resp_t {
	var resp C.ext_server_resp_t
	resp.msg_len = size
	// C.CBytes copies the zeroed Go slice into freshly malloc'd C memory,
	// so resp.msg starts out NUL-filled.
	buf := make([]byte, size)
	resp.msg = (*C.char)(C.CBytes(buf))
	return resp
}

56
57
58
59
60
// freeExtServerResp releases the C message buffer held by resp.
// A zero-length response owns no buffer, so there is nothing to free.
func freeExtServerResp(resp C.ext_server_resp_t) {
	if resp.msg_len > 0 {
		C.free(unsafe.Pointer(resp.msg))
	}
}

63
64
// extServerResponseToErr converts an ext_server error response into a Go error.
func extServerResponseToErr(resp C.ext_server_resp_t) error {
	// errors.New instead of fmt.Errorf: the C message is not a format
	// string, and a stray '%' in it would otherwise be misinterpreted
	// as a verb (go vet printf violation).
	return errors.New(C.GoString(resp.msg))
}

Michael Yang's avatar
Michael Yang committed
67
// newDynExtServer loads the shared llama server library at `library`,
// initializes it with `model` plus the supplied adapters/projectors/options,
// starts its main loop, and returns it as an LLM. It holds the package-level
// mutex for the lifetime of the instance (released in Close, or here on
// failure) since concurrent instantiations are not supported.
func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
	if !mutex.TryLock() {
		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	// Make the library's directory discoverable for its own dependent libs.
	gpu.UpdatePath(filepath.Dir(library))
	libPath := C.CString(library)
	defer C.free(unsafe.Pointer(libPath))
	resp := newExtServerResp(512)
	defer freeExtServerResp(resp)
	var srv C.struct_dynamic_llama_server
	C.dyn_init(libPath, &srv, &resp)
	// Negative id signals failure; resp.msg carries the C-side error text.
	if resp.id < 0 {
		mutex.Unlock()
		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
	}
	llm := dynExtServer{
		s:       srv,
		options: opts,
	}
	slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))

	// Translate the Go options into the C server parameter struct.
	var sparams C.ext_server_params_t
	sparams.model = C.CString(model)
	defer C.free(unsafe.Pointer(sparams.model))

	sparams.embedding = true
	sparams.n_ctx = C.uint(opts.NumCtx)
	sparams.n_batch = C.uint(opts.NumBatch)
	sparams.n_gpu_layers = C.int(opts.NumGPU)
	sparams.main_gpu = C.int(opts.MainGPU)
	sparams.n_parallel = 1 // TODO - wire up concurrency

	// Always use the value encoded in the model
	sparams.rope_freq_base = 0.0
	sparams.rope_freq_scale = 0.0
	sparams.memory_f16 = C.bool(opts.F16KV)
	sparams.use_mlock = C.bool(opts.UseMLock)
	sparams.use_mmap = C.bool(opts.UseMMap)

	if opts.UseNUMA {
		sparams.numa = C.int(1)
	} else {
		sparams.numa = C.int(0)
	}

	// Build a C-side singly linked list of LoRA adapters. All C allocations
	// are deferred-freed; dyn_llama_server_init runs before this function
	// returns, so the memory outlives its use.
	sparams.lora_adapters = nil
	for i := 0; i < len(adapters); i++ {
		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
		defer C.free(unsafe.Pointer(la))
		la.adapter = C.CString(adapters[i])
		defer C.free(unsafe.Pointer(la.adapter))
		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
		la.next = nil
		if i == 0 {
			sparams.lora_adapters = la
		} else {
			// Walk to the tail and append, preserving adapter order.
			tmp := sparams.lora_adapters
			for ; tmp.next != nil; tmp = tmp.next {
			}
			tmp.next = la
		}
	}

	if len(projectors) > 0 {
		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
		sparams.mmproj = C.CString(projectors[0])
		defer C.free(unsafe.Pointer(sparams.mmproj))
	} else {
		sparams.mmproj = nil
	}

	sparams.n_threads = C.uint(opts.NumThread)

	// Verbose C-side logging is opt-in via the OLLAMA_DEBUG env var.
	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
		sparams.verbose_logging = C.bool(true)
	} else {
		sparams.verbose_logging = C.bool(false)
	}

	slog.Info("Initializing llama server")
	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
	initResp := newExtServerResp(512)
	defer freeExtServerResp(initResp)
	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
	if initResp.id < 0 {
		mutex.Unlock()
		err := extServerResponseToErr(initResp)
		slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
		return nil, err
	}

	// Kick off the C server's main loop; it runs until Close calls stop.
	slog.Info("Starting llama main loop")
	C.dyn_llama_server_start(llm.s)
	return &llm, nil
}

164
// Predict submits a completion request to the C server and streams results
// back through fn until the model stops, the context is canceled, or a
// repeated-token loop is detected. On a "slot unavailable" error it retries
// with exponential backoff up to maxRetries.
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)

	if len(predict.Images) > 0 {
		slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
	}

	// JSON request body in the shape the llama.cpp server completion
	// endpoint expects.
	request := map[string]any{
		"prompt":            predict.Prompt,
		"stream":            true,
		"n_predict":         predict.Options.NumPredict,
		"n_keep":            predict.Options.NumKeep,
		"temperature":       predict.Options.Temperature,
		"top_k":             predict.Options.TopK,
		"top_p":             predict.Options.TopP,
		"tfs_z":             predict.Options.TFSZ,
		"typical_p":         predict.Options.TypicalP,
		"repeat_last_n":     predict.Options.RepeatLastN,
		"repeat_penalty":    predict.Options.RepeatPenalty,
		"presence_penalty":  predict.Options.PresencePenalty,
		"frequency_penalty": predict.Options.FrequencyPenalty,
		"mirostat":          predict.Options.Mirostat,
		"mirostat_tau":      predict.Options.MirostatTau,
		"mirostat_eta":      predict.Options.MirostatEta,
		"penalize_nl":       predict.Options.PenalizeNewline,
		"seed":              predict.Options.Seed,
		"stop":              predict.Options.Stop,
		"image_data":        predict.Images,
		"cache_prompt":      true,
	}

	// JSON mode: constrain generation with a grammar and nudge the user if
	// the prompt never mentions JSON.
	if predict.Format == "json" {
		request["grammar"] = jsonGrammar
		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
		}
	}

	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}

		// Handling JSON marshaling with special characters unescaped.
		buffer := &bytes.Buffer{}
		enc := json.NewEncoder(buffer)
		enc.SetEscapeHTML(false)

		if err := enc.Encode(request); err != nil {
			return fmt.Errorf("failed to marshal data: %w", err)
		}

		// NOTE(review): defer inside the retry loop — each retry's req is
		// only freed when Predict returns, not per iteration. Bounded by
		// maxRetries, but worth confirming this is intentional.
		req := C.CString(buffer.String())
		defer C.free(unsafe.Pointer(req))

		C.dyn_llama_server_completion(llm.s, req, &resp)
		if resp.id < 0 {
			return extServerResponseToErr(resp)
		}

		retryNeeded := false
		// keep track of the last token generated, this is used to abort if the model starts looping
		var lastToken string
		var tokenRepeat int

	out:
		for {
			select {
			case <-ctx.Done():
				// Caller canceled: tell the C server to stop this task.
				return cancelCompletion(llm, resp)
			default:
				// Blocking fetch of the next streamed result for this task id.
				var result C.ext_server_task_result_t
				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
				json_resp := C.GoString(result.json_resp)
				C.dyn_llama_server_release_task_result(llm.s, &result)

				var p prediction
				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
					if resp.id < 0 {
						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
					} else {
						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
					}
				}

				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
					retryNeeded = true
					// task will already be canceled
					break out
				}

				// Count consecutive identical (whitespace-trimmed) tokens.
				switch {
				case strings.TrimSpace(p.Content) == lastToken:
					tokenRepeat++
				default:
					lastToken = strings.TrimSpace(p.Content)
					tokenRepeat = 0
				}

				// 30 picked as an arbitrary max token repeat limit, modify as needed
				if tokenRepeat > 30 {
					slog.Debug("prediction aborted, token repeat limit reached")
					return cancelCompletion(llm, resp)
				}

				if p.Content != "" {
					fn(PredictResult{
						Content: p.Content,
					})
				}

				// Final result: emit the Done record with timing stats.
				if p.Stop || bool(result.stop) {
					fn(PredictResult{
						Done:               true,
						PromptEvalCount:    p.Timings.PromptN,
						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
						EvalCount:          p.Timings.PredictedN,
						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
					})
					return nil
				}
			}
		}

		if !retryNeeded {
			return nil // success
		}
	}

	// should never reach here ideally
	return fmt.Errorf("max retries exceeded")
}

299
300
301
302
303
304
305
306
307
// cancelCompletion asks the C server to cancel the in-flight completion task
// identified by resp.id, returning an error if the cancel itself failed.
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
	if resp.id < 0 {
		return extServerResponseToErr(resp)
	}
	return nil
}

308
// Encode tokenizes prompt via the loaded llama.cpp server and returns the
// resulting token ids.
//
// Fix: return an explicit nil error on success instead of the stale `err`
// from the earlier Marshal call (known nil at that point, but misleading);
// also drop the `err2` naming in favor of a scoped `err`.
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
	data, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	// json_resp is C-owned; release it once we have copied it into Go.
	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

	var encoded TokenizeResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", err)
	}

	return encoded.Tokens, nil
}

332
// Decode converts token ids back into a string via the loaded llama.cpp
// server. An empty token list decodes to the empty string without calling
// into C.
//
// Fixes: the unmarshal error message said "encode" (copy-paste from Encode);
// return an explicit nil error on success instead of the stale `err` from
// the earlier Marshal call.
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
	if resp.id < 0 {
		return "", extServerResponseToErr(resp)
	}
	// json_resp is C-owned; release it once we have copied it into Go.
	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

	var decoded DetokenizeResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
		return "", fmt.Errorf("unmarshal decode response: %w", err)
	}

	return decoded.Content, nil
}

360
// Embedding computes an embedding vector for input via the loaded llama.cpp
// server.
//
// Fix: the unmarshal error message said "tokenize" (copy-paste from Encode).
// NOTE(review): the request reuses TokenizeRequest{Content: ...} — the
// embedding endpoint evidently accepts the same {"content": ...} shape;
// confirm before introducing a dedicated request type.
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
	data, err := json.Marshal(TokenizeRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	// json_resp is C-owned; release it once we have copied it into Go.
	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

	var embedding EmbeddingResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}

	return embedding.Embedding, nil
}

385
386
// Close stops the C server's main loop and releases the package-level mutex
// taken in newDynExtServer, allowing a new server to be instantiated.
// Order matters: the server must be fully stopped before the lock is freed.
func (llm *dynExtServer) Close() {
	C.dyn_llama_server_stop(llm.s)
	mutex.Unlock()
}