package llm

import (
	"bufio"
	"bytes"
	"context"
	"embed"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"log"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/jmorganca/ollama/api"
)

//go:embed llama.cpp/*/build/*/bin/*
var llamaCppEmbed embed.FS

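// osPath resolves the platform-specific binary directory inside an embedded
// llama.cpp build; Windows builds place binaries under a Release subdirectory.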
func osPath(llamaPath string) string {
	if runtime.GOOS == "windows" {
		return path.Join(llamaPath, "Release")
	}

	return llamaPath
}

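// cudaVersion returns the installed CUDA major version, trying nvcc first and
// falling back to nvidia-smi; it returns -1 when no version can be determined.
// nvcc typically prints a line like "Cuda compilation tools, release 11.8, V11.8.89",
// from which the regex below captures "11.8".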
func cudaVersion() (int, error) {
	// try nvcc first; it gives the most accurate version when available
	cmd := exec.Command("nvcc", "--version")
	output, err := cmd.CombinedOutput()
	if err == nil {
		// regex to match the CUDA version line in nvcc --version output
		re := regexp.MustCompile(`release (\d+\.\d+),`)
		matches := re.FindStringSubmatch(string(output))
		if len(matches) >= 2 {
			cudaVersion := matches[1]
			cudaVersionParts := strings.Split(cudaVersion, ".")
			cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
			if err == nil {
				return cudaMajorVersion, nil
			}
		}
	}

	// fallback to nvidia-smi
	cmd = exec.Command("nvidia-smi")
	output, err = cmd.CombinedOutput()
	if err != nil {
		return -1, err
	}

	re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
	matches := re.FindStringSubmatch(string(output))
	if len(matches) < 2 {
		return -1, errors.New("could not find CUDA version")
	}

	cudaVersion := matches[1]
	cudaVersionParts := strings.Split(cudaVersion, ".")
	cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
	if err != nil {
		return -1, err
	}
	return cudaMajorVersion, nil
}

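// chooseRunner extracts the embedded llama.cpp server binary best suited to
// this machine (Metal on macOS, a supported CUDA build on Linux, CPU
// otherwise) into a temporary directory and returns the path to run it.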
func chooseRunner(runnerType string) string {
	tmpDir, err := os.MkdirTemp("", "llama-*")
	if err != nil {
		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
	}

	cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
	llamaPath := cpuPath
	files := []string{"server"}

	// Set OS specific llama.cpp runner paths
	switch runtime.GOOS {
	case "darwin":
		// TODO: change to check metal version
		llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
		files = append(files, "ggml-metal.metal")
	case "linux":
		cudaVersion, err := cudaVersion()
		if err != nil {
			// fall back to the CPU runner; the switch below treats the -1 version as unsupported
			log.Printf("failed to get CUDA version: %v", err)
		}

		switch cudaVersion {
		case 11, 12:
			cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
			llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
		default:
			if cudaVersion != -1 {
				// a valid version was returned but it is not supported
				log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
			}
			llamaPath = cpuPath
		}
	case "windows":
		// TODO: select windows GPU runner here when available
		files = []string{"server.exe"}
	default:
		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
	}

	// check that the runner exists; if not, fall back to the CPU runner
	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
		// fallback to CPU runner
		llamaPath = cpuPath
		files = []string{"server"}
		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
			log.Fatalf("llama.cpp executable not found")
		}
		log.Printf("llama.cpp %s executable not found, falling back to CPU", runnerType)
	}

	// copy the files locally to run the llama.cpp server
	for _, f := range files {
		srcPath := path.Join(llamaPath, f)
		destPath := filepath.Join(tmpDir, f)

		srcFile, err := llamaCppEmbed.Open(srcPath)
		if err != nil {
			log.Fatalf("read llama.cpp %s: %v", f, err)
		}
		defer srcFile.Close()

		destFile, err := os.OpenFile(destPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
		if err != nil {
			log.Fatalf("write llama.cpp %s: %v", f, err)
		}
		defer destFile.Close()

		if _, err := io.Copy(destFile, srcFile); err != nil {
			log.Fatalf("copy llama.cpp %s: %v", f, err)
		}
	}

	runPath := filepath.Join(tmpDir, "server")
	if runtime.GOOS == "windows" {
		runPath = filepath.Join(tmpDir, "server.exe")
	}

	return runPath
}

type llamaModel struct {
	hyperparameters llamaHyperparameters
}

func (llm *llamaModel) ModelFamily() string {
	return "llama"
}

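// llamaModelType maps a LLaMA model's layer count to its parameter-size label.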
func llamaModelType(numLayer uint32) string {
	switch numLayer {
	case 26:
		return "3B"
	case 32:
		return "7B"
	case 40:
		return "13B"
	case 48:
		return "34B"
	case 60:
		return "30B"
	case 80:
		return "65B"
	default:
		return "Unknown"
	}
}

func (llm *llamaModel) ModelType() string {
	return llamaModelType(llm.hyperparameters.NumLayer)
}

func (llm *llamaModel) FileType() string {
	return fileType(llm.hyperparameters.FileType)
}

type llamaHyperparameters struct {
	// NumVocab is the size of the model's vocabulary.
	NumVocab uint32

	// NumEmbd is the size of the model's embedding layer.
	NumEmbd uint32
	NumMult uint32
	NumHead uint32

	// NumLayer is the number of layers in the model.
	NumLayer uint32
	NumRot   uint32

	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
	FileType uint32
}

type Running struct {
	Port   int
	Cmd    *exec.Cmd
	Cancel context.CancelFunc
}

type ModelRunner struct {
	Path string // path to the model runner executable
}

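// llama is an LLM backed by a running llama.cpp server subprocess.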
type llama struct {
	api.Options
	Running
}

var errNoGPU = errors.New("nvidia-smi command failed")

// CheckVRAM returns the total VRAM in MiB across all NVIDIA GPUs, as reported by nvidia-smi on Linux
func CheckVRAM() (int, error) {
	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
	var stdout bytes.Buffer
	cmd.Stdout = &stdout
	err := cmd.Run()
	if err != nil {
		return 0, errNoGPU
	}

	var total int
	scanner := bufio.NewScanner(&stdout)
	for scanner.Scan() {
		line := scanner.Text()
		vram, err := strconv.Atoi(line)
		if err != nil {
			return 0, fmt.Errorf("failed to parse VRAM from nvidia-smi: %v", err)
		}

		total += vram
	}

	return total, nil
}

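// NumGPU returns how many model layers to offload to the GPU. An explicit
// opts.NumGPU (anything but -1) is used as-is; otherwise the count is a rough
// VRAM-based heuristic on Linux, or 1 elsewhere so Metal stays enabled on macOS.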
func NumGPU(opts api.Options) int {
	if opts.NumGPU != -1 {
		return opts.NumGPU
	}
	n := 1 // default to enable metal on macOS
	if runtime.GOOS == "linux" {
		vram, err := CheckVRAM()
		if err != nil {
			if !errors.Is(err, errNoGPU) {
				log.Print(err.Error())
			}
			// nvidia driver not installed or no nvidia GPU found
			return 0
		}
		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
		switch {
		case vram < 500:
			log.Printf("WARNING: Low VRAM detected, disabling GPU")
			n = 0
		case vram < 1000:
			n = 4
		case vram < 2000:
			n = 8
		case vram < 4000:
			n = 12
		case vram < 8000:
			n = 16
		case vram < 12000:
			n = 24
		case vram < 16000:
			n = 32
		default:
			n = 48
		}
		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
	}
	return n
}

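// newLlama starts the llama.cpp server subprocess on a random ephemeral port
// with flags derived from opts, retrying up to three times in case a chosen
// port is already taken.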
func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	if _, err := os.Stat(runner.Path); err != nil {
		return nil, err
	}

	if len(adapters) > 1 {
		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
	}

	params := []string{
		"--model", model,
		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
		"--embedding",
	}

	if opts.NumGQA > 0 {
		params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
	}

	if len(adapters) > 0 {
		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
		params = append(params, "--lora", adapters[0])
	}

	if opts.NumThread > 0 {
		params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
	}

	if !opts.F16KV {
		params = append(params, "--memory-f32")
	}
	if opts.UseMLock {
		params = append(params, "--mlock")
	}
	if !opts.UseMMap {
		params = append(params, "--no-mmap")
	}
	if opts.UseNUMA {
		params = append(params, "--numa")
	}

	// start the llama.cpp server with a retry in case the port is already in use
	for try := 0; try < 3; try++ {
		port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
		ctx, cancel := context.WithCancel(context.Background())
		cmd := exec.CommandContext(
			ctx,
			runner.Path,
			append(params, "--port", strconv.Itoa(port))...,
		)

		cmd.Stdout = os.Stderr
		cmd.Stderr = os.Stderr

		llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel}}

		log.Print("starting llama.cpp server")
		if err := llm.Cmd.Start(); err != nil {
			log.Printf("error starting the external llama.cpp server: %v", err)
			continue
		}

		if err := waitForServer(llm); err != nil {
			log.Printf("error starting llama.cpp server: %v", err)
			llm.Close()
			// try again
			continue
		}

		// server started successfully
		return llm, nil
	}

	return nil, fmt.Errorf("max retries exceeded starting llama.cpp server")
}

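// waitForServer polls the server every 200ms until it responds to a ping,
// giving up after 45 seconds.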
func waitForServer(llm *llama) error {
	// wait for the server to start responding
	start := time.Now()
	expiresAt := time.Now().Add(45 * time.Second)
	ticker := time.NewTicker(200 * time.Millisecond)
	defer ticker.Stop()

	log.Print("waiting for llama.cpp server to start responding")
	for range ticker.C {
		if time.Now().After(expiresAt) {
			return fmt.Errorf("llama.cpp server did not start within allotted time, retrying")
		}

		if err := llm.Ping(context.Background()); err == nil {
			break
		}
	}

	log.Printf("llama.cpp server started in %f seconds", time.Since(start).Seconds())
	return nil
}

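// Close stops the llama.cpp server subprocess by cancelling its context and
// waits for it to exit.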
func (llm *llama) Close() {
	llm.Cancel()
	if err := llm.Cmd.Wait(); err != nil {
		log.Printf("llama.cpp server exited with error: %v", err)
	}
}

func (llm *llama) SetOptions(opts api.Options) {
	llm.Options = opts
}

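// GenerationSettings captures the sampling and model settings the llama.cpp
// server reports alongside a completion (inferred from its JSON field names).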
type GenerationSettings struct {
	FrequencyPenalty float64       `json:"frequency_penalty"`
	IgnoreEOS        bool          `json:"ignore_eos"`
	LogitBias        []interface{} `json:"logit_bias"`
	Mirostat         int           `json:"mirostat"`
	MirostatEta      float64       `json:"mirostat_eta"`
	MirostatTau      float64       `json:"mirostat_tau"`
	Model            string        `json:"model"`
	NCtx             int           `json:"n_ctx"`
	NKeep            int           `json:"n_keep"`
	NPredict         int           `json:"n_predict"`
	NProbs           int           `json:"n_probs"`
	PenalizeNl       bool          `json:"penalize_nl"`
	PresencePenalty  float64       `json:"presence_penalty"`
	RepeatLastN      int           `json:"repeat_last_n"`
	RepeatPenalty    float64       `json:"repeat_penalty"`
	Seed             uint32        `json:"seed"`
	Stop             []string      `json:"stop"`
	Stream           bool          `json:"stream"`
	Temp             float64       `json:"temp"`
	TfsZ             float64       `json:"tfs_z"`
	TopK             int           `json:"top_k"`
	TopP             float64       `json:"top_p"`
	TypicalP         float64       `json:"typical_p"`
}

type Timings struct {
	PredictedN  int     `json:"predicted_n"`
	PredictedMS float64 `json:"predicted_ms"`
	PromptN     int     `json:"prompt_n"`
	PromptMS    float64 `json:"prompt_ms"`
}

type Prediction struct {
	Content string `json:"content"`
	Model   string `json:"model"`
	Prompt  string `json:"prompt"`
	Stop    bool   `json:"stop"`

	Timings `json:"timings"`
}

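// PredictRequest is the request body for the llama.cpp server's /completion
// endpoint.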
type PredictRequest struct {
	Stream           bool            `json:"stream"`
	NPredict         int             `json:"n_predict,omitempty"`
	TopK             int             `json:"top_k,omitempty"`
	TopP             float32         `json:"top_p,omitempty"`
	TfsZ             float32         `json:"tfs_z,omitempty"`
	TypicalP         float32         `json:"typical_p,omitempty"`
	RepeatLastN      int             `json:"repeat_last_n,omitempty"`
	Temperature      float32         `json:"temperature,omitempty"`
	RepeatPenalty    float32         `json:"repeat_penalty,omitempty"`
	PresencePenalty  float32         `json:"presence_penalty,omitempty"`
	FrequencyPenalty float32         `json:"frequency_penalty,omitempty"`
	Mirostat         int             `json:"mirostat,omitempty"`
	MirostatTau      float32         `json:"mirostat_tau,omitempty"`
	MirostatEta      float32         `json:"mirostat_eta,omitempty"`
	PenalizeNl       bool            `json:"penalize_nl,omitempty"`
	NKeep            int             `json:"n_keep,omitempty"`
	Seed             int             `json:"seed,omitempty"`
	Prompt           string          `json:"prompt,omitempty"`
	NProbs           int             `json:"n_probs,omitempty"`
	LogitBias        map[int]float32 `json:"logit_bias,omitempty"`
	IgnoreEos        bool            `json:"ignore_eos,omitempty"`
	Stop             []string        `json:"stop,omitempty"`
}

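// Predict streams a completion from the llama.cpp server: the previous
// context is decoded back to text, the prompt is appended, and each streamed
// "data: ..." event is surfaced through fn; on stop, the full conversation is
// re-encoded and returned as the new context.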
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
	prevConvo, err := llm.Decode(ctx, prevContext)
	if err != nil {
		return err
	}

	var nextContext strings.Builder
	nextContext.WriteString(prevConvo)
	nextContext.WriteString(prompt)

	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
	predReq := PredictRequest{
		Prompt:           nextContext.String(),
		Stream:           true,
		NPredict:         llm.NumPredict,
		NKeep:            llm.NumKeep,
		Temperature:      llm.Temperature,
		TopK:             llm.TopK,
		TopP:             llm.TopP,
		TfsZ:             llm.TFSZ,
		TypicalP:         llm.TypicalP,
		RepeatLastN:      llm.RepeatLastN,
		RepeatPenalty:    llm.RepeatPenalty,
		PresencePenalty:  llm.PresencePenalty,
		FrequencyPenalty: llm.FrequencyPenalty,
		Mirostat:         llm.Mirostat,
		MirostatTau:      llm.MirostatTau,
		MirostatEta:      llm.MirostatEta,
		PenalizeNl:       llm.PenalizeNewline,
		Stop:             llm.Stop,
	}
	data, err := json.Marshal(predReq)
	if err != nil {
		return fmt.Errorf("error marshaling data: %v", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return fmt.Errorf("error creating POST request: %v", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return fmt.Errorf("POST predict: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		bodyBytes, err := io.ReadAll(resp.Body)
		if err != nil {
			return fmt.Errorf("failed reading llm error response: %w", err)
		}
		log.Printf("llm predict error: %s", bodyBytes)
		return fmt.Errorf("%s", bodyBytes)
	}

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		select {
		case <-ctx.Done():
			// This handles the request cancellation
			return ctx.Err()
		default:
			line := scanner.Text()
			if line == "" {
				continue
			}

			// Read data from the server-side event stream
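			// each event carries a JSON prediction payload, e.g. data: {"content":"...","stop":false}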
			if strings.HasPrefix(line, "data: ") {
				evt := line[6:]
				var p Prediction
				if err := json.Unmarshal([]byte(evt), &p); err != nil {
					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
				}

				if p.Content != "" {
					fn(api.GenerateResponse{Response: p.Content})
					nextContext.WriteString(p.Content)
				}

				if p.Stop {
					embd, err := llm.Encode(ctx, nextContext.String())
					if err != nil {
						return fmt.Errorf("encoding context: %v", err)
					}

					fn(api.GenerateResponse{
						Done:               true,
						Context:            embd,
						PromptEvalCount:    p.PromptN,
						PromptEvalDuration: parseDurationMs(p.PromptMS),
						EvalCount:          p.PredictedN,
						EvalDuration:       parseDurationMs(p.PredictedMS),
					})

					return nil
				}
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error reading llm response: %v", err)
	}

	return nil
}

type TokenizeRequest struct {
	Content string `json:"content"`
}

type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

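// Encode tokenizes prompt using the llama.cpp server's /tokenize endpoint.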
func (llm *llama) Encode(ctx context.Context, prompt string) ([]int, error) {
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/tokenize", llm.Port)
	data, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return nil, fmt.Errorf("encode request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("do encode request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read encode request: %w", err)
	}

	if resp.StatusCode >= 400 {
		log.Printf("llm encode error: %s", body)
		return nil, fmt.Errorf("%s", body)
	}

	var encoded TokenizeResponse
	if err := json.Unmarshal(body, &encoded); err != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", err)
	}

	return encoded.Tokens, nil
}

type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

type DetokenizeResponse struct {
	Content string `json:"content"`
}

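// Decode converts tokens back into text using the server's /detokenize endpoint.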
func (llm *llama) Decode(ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/detokenize", llm.Port)
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return "", fmt.Errorf("decode request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("do decode request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read decode request: %w", err)
	}

	if resp.StatusCode >= 400 {
		log.Printf("llm decode error: %s", body)
		return "", fmt.Errorf("%s", body)
	}

	var decoded DetokenizeResponse
	if err := json.Unmarshal(body, &decoded); err != nil {
		return "", fmt.Errorf("unmarshal decode response: %w", err)
	}

	// the decoded content has a leading whitespace; trim it
	decoded.Content, _ = strings.CutPrefix(decoded.Content, " ")

	return decoded.Content, nil
}

type EmbeddingRequest struct {
	Content string `json:"content"`
}

type EmbeddingResponse struct {
	Embedding []float64 `json:"embedding"`
}

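// Embedding computes an embedding for input using the server's /embedding
// endpoint (enabled by the --embedding flag at startup).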
func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error) {
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/embedding", llm.Port)
	data, err := json.Marshal(EmbeddingRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return nil, fmt.Errorf("error creating embed request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("POST embedding: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error reading embed response: %w", err)
	}

	if resp.StatusCode >= 400 {
		log.Printf("llm embedding error: %s", body)
		return nil, fmt.Errorf("%s", body)
	}

	var embedding EmbeddingResponse
	if err := json.Unmarshal(body, &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}

	return embedding.Embedding, nil
}

// Ping checks that the server subprocess is still running and responding to requests
func (llm *llama) Ping(ctx context.Context) error {
	resp, err := http.Head(fmt.Sprintf("http://127.0.0.1:%d", llm.Port))
	if err != nil {
		return fmt.Errorf("ping resp: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected ping status: %s", resp.Status)
	}
	return nil
}