package llm

import (
	"bufio"
	"bytes"
	"context"
	"embed"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"log"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

//go:embed llama.cpp/*/build/*/bin/*
var llamaCppEmbed embed.FS

type ModelRunner struct {
	Path        string // path to the model runner executable
	Accelerated bool   // whether this runner expects a GPU, e.g. the CUDA build
}

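// chooseRunners extracts the embedded runner binaries for the current
// platform into workDir and returns them in priority order.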
func chooseRunners(workDir, runnerType string) []ModelRunner {
	buildPath := path.Join("llama.cpp", runnerType, "build")
	var runners []ModelRunner

	// set the runners based on the OS
	// IMPORTANT: the order of the runners in the array is the priority order
	switch runtime.GOOS {
	case "darwin":
		runners = []ModelRunner{
			{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")},
			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
		}
	case "linux":
		runners = []ModelRunner{
			{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
		}
	case "windows":
		// TODO: select windows GPU runner here when available
		runners = []ModelRunner{
			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
		}
	default:
		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
		runners = []ModelRunner{
			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
		}
	}

	runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
	for _, r := range runners {
		// find all the files in the runner's bin directory
		files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*"))
		if err != nil {
			// this is expected, ollama may be compiled without all runners packed in
			log.Printf("%s runner not found: %v", r.Path, err)
			continue
		}

		for _, f := range files {
			runnerAvailable = true

			srcFile, err := llamaCppEmbed.Open(f)
			if err != nil {
				log.Fatalf("read llama runner %s: %v", f, err)
			}
			defer srcFile.Close()

			// create the directory in case it does not exist, filepath.Dir() converts the file path to the OS's format
			destPath := filepath.Join(workDir, filepath.Dir(f))
			if err := os.MkdirAll(destPath, 0o755); err != nil {
				log.Fatalf("create runner temp dir %s: %v", filepath.Dir(f), err)
			}

			// create the path to the destination file, filepath.Base() converts the file path to the OS's format
			destFile := filepath.Join(destPath, filepath.Base(f))

			_, err = os.Stat(destFile)
			switch {
			case errors.Is(err, os.ErrNotExist):
				destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
				if err != nil {
					log.Fatalf("write llama runner %s: %v", f, err)
				}
				defer destFile.Close()

				if _, err := io.Copy(destFile, srcFile); err != nil {
					log.Fatalf("copy llama runner %s: %v", f, err)
				}
			case err != nil:
				log.Fatalf("stat llama runner %s: %v", f, err)
			}
		}
	}
	if !runnerAvailable {
		log.Fatalf("%s runner not found", runnerType)
	}

	// return the runners to try in priority order
	localRunnersByPriority := []ModelRunner{}
	for _, r := range runners {
		// clean the ModelRunner paths so that they match the OS we are running on
		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
			Path:        filepath.Clean(path.Join(workDir, r.Path)),
			Accelerated: r.Accelerated,
		})
	}

	return localRunnersByPriority
}

type llamaModel struct {
	hyperparameters llamaHyperparameters
}

func (llm *llamaModel) ModelFamily() string {
	return "llama"
}

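// llamaModelType maps a llama model's layer count to its parameter-size label.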
func llamaModelType(numLayer uint32) string {
	switch numLayer {
	case 26:
		return "3B"
	case 32:
		return "7B"
	case 40:
		return "13B"
	case 48:
		return "34B"
	case 60:
		return "30B"
	case 80:
		return "65B"
	default:
		return "unknown"
	}
}

func (llm *llamaModel) ModelType() string {
	return llamaModelType(llm.hyperparameters.NumLayer)
}

func (llm *llamaModel) FileType() string {
	return fileType(llm.hyperparameters.FileType)
}

func (llm *llamaModel) NumLayers() int64 {
	return int64(llm.hyperparameters.NumLayer)
}

type llamaHyperparameters struct {
	// NumVocab is the size of the model's vocabulary.
	NumVocab uint32

	// NumEmbd is the size of the model's embedding layer.
	NumEmbd uint32
	// NumMult is a sizing multiplier from the llama GGML header, used when
	// deriving the feed-forward layer size.
	NumMult uint32

	// NumHead is the number of attention heads.
	NumHead uint32

	// NumLayer is the number of layers in the model.
	NumLayer uint32
	// NumRot is the number of rotary position embedding dimensions.
	NumRot uint32

	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
	FileType uint32
}

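// Running tracks the state of a launched llama runner subprocess.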
type Running struct {
	Port          int
	Cmd           *exec.Cmd
	Cancel        context.CancelFunc
	exitOnce      sync.Once
	exitCh        chan error // channel to receive the exit status of the subprocess
	*StatusWriter            // captures error messages from the llama runner process
}

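// llama is an LLM served by a running llama.cpp runner subprocess.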
type llama struct {
	api.Options
	Running
}

var errNoGPU = errors.New("nvidia-smi command failed")

// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
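	// with this query format, nvidia-smi prints one integer per GPU: its free memory in MiB, e.g. "8112"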
	var stdout bytes.Buffer
	cmd.Stdout = &stdout
	err := cmd.Run()
	if err != nil {
		return 0, errNoGPU
	}

	var freeMiB int64
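	// sum the free memory reported for each GPU, one CSV line per device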
	scanner := bufio.NewScanner(&stdout)
	for scanner.Scan() {
		line := scanner.Text()
		vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
		if err != nil {
			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
		}

		freeMiB += vram
	}

	freeBytes := freeMiB * 1024 * 1024
	if freeBytes < 2*format.GigaByte {
		log.Printf("less than 2 GB VRAM available, falling back to CPU only")
		freeBytes = 0
	}

	return freeBytes, nil
}

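// NumGPU returns how many model layers to offload to the GPU. As a rough,
// hypothetical illustration: a 26 GiB, 40-layer model costs ~665 MiB per
// layer, so with 8 GiB of free VRAM int(8192/665) = 12 layers fit, scaled
// by 92% to 11.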
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
	if opts.NumGPU != -1 {
		return opts.NumGPU
	}
	if runtime.GOOS == "linux" {
		freeBytes, err := CheckVRAM()
		if err != nil {
			if !errors.Is(err, errNoGPU) {
				log.Print(err.Error())
			}
			// nvidia driver not installed or no nvidia GPU found
			return 0
		}

		// Calculate bytes per layer
		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
		bytesPerLayer := fileSizeBytes / numLayer

		// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
		layers := int(freeBytes/bytesPerLayer) * 92 / 100
		log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)

		return layers
	}
	// default to enable metal on macOS
	return 1
}

// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
	ErrCh      chan error
	LastErrMsg string
}

func NewStatusWriter() *StatusWriter {
	return &StatusWriter{
		ErrCh: make(chan error, 1),
	}
}

func (w *StatusWriter) Write(b []byte) (int, error) {
	var errMsg string
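	// scan the output line for error markers emitted by the llama.cpp runner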
	if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
		errMsg = string(bytes.TrimSpace(after))
	} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {
		errMsg = string(bytes.TrimSpace(after))
	}

	if errMsg != "" {
		w.LastErrMsg = errMsg
		// non-blocking send so a full channel cannot stall the stderr pipe
		select {
		case w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg):
		default:
		}
	}

	return os.Stderr.Write(b)
}

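// newLlama starts the first usable runner as a llama.cpp server subprocess
// and returns a client for it once the server is responding.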
func newLlama(model string, adapters []string, runners []ModelRunner, ggml *GGML, opts api.Options) (*llama, error) {
	fileInfo, err := os.Stat(model)
	if err != nil {
		return nil, err
	}

	if len(adapters) > 1 {
		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
	}

	numGPU := NumGPU(ggml.NumLayers(), fileInfo.Size(), opts)
	params := []string{
		"--model", model,
		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
		"--embedding",
	}

	if opts.NumGQA > 0 {
		params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
	}

	if len(adapters) > 0 {
		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
		params = append(params, "--lora", adapters[0])
	}

	if opts.NumThread > 0 {
		params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
	}

	if !opts.F16KV {
		params = append(params, "--memory-f32")
	}
	if opts.UseMLock {
		params = append(params, "--mlock")
	}
	if !opts.UseMMap {
		params = append(params, "--no-mmap")
	}
	if opts.UseNUMA {
		params = append(params, "--numa")
	}

	var runnerErr error

	// start the llama.cpp server with a retry in case the port is already in use
	for _, runner := range runners {
		if runner.Accelerated && numGPU == 0 {
			log.Printf("skipping accelerated runner because num_gpu=0")
			continue
		}

		if _, err := os.Stat(runner.Path); err != nil {
			log.Printf("llama runner not found: %v", err)
			continue
		}

		port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
		ctx, cancel := context.WithCancel(context.Background())
		cmd := exec.CommandContext(
			ctx,
			runner.Path,
			append(params, "--port", strconv.Itoa(port))...,
		)
		cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path)))
		cmd.Stdout = os.Stderr
		statusWriter := NewStatusWriter()
		cmd.Stderr = statusWriter

		llm := &llama{Options: opts, Running: Running{Port: port, Cmd: cmd, Cancel: cancel, exitCh: make(chan error), StatusWriter: statusWriter}}

		log.Print("starting llama runner")
		if err := llm.Cmd.Start(); err != nil {
			log.Printf("error starting the external llama runner: %v", err)
			continue
		}

		// monitor the llama runner process and signal when it exits
		go func() {
			err := llm.Cmd.Wait()
			// default to printing the exit message of the command process, it will probably just say 'exit status 1'
			errMsg := err.Error()
			// try to set a better error message if llama runner logs captured an error
			if statusWriter.LastErrMsg != "" {
				errMsg = statusWriter.LastErrMsg
			}
			log.Println(errMsg)
			// llm.Cmd.Wait() can only be called once, use this exit channel to signal that the process has exited
			llm.exitOnce.Do(func() {
				close(llm.exitCh)
			})
		}()

		if err := waitForServer(llm); err != nil {
			log.Printf("error starting llama runner: %v", err)
			llm.Close()

			// default the runnerErr to the error returned by the most recent llama runner process
			runnerErr = err

			// capture the error directly from the runner process, if any
			select {
			case runnerErr = <-statusWriter.ErrCh:
			default:
				// the runner process probably timed out
			}

			// try again
			continue
		}

		// server started successfully
		return llm, nil
	}

	if runnerErr != nil {
		// this is the error returned from the llama runner process that failed most recently

		// falcon and starcoder model families are not compatible with older versions of llama.cpp
		families := []string{"falcon", "starcoder"}
		if strings.Contains(runnerErr.Error(), "failed to load model") && slices.Contains(families, ggml.ModelFamily()) {
			return nil, fmt.Errorf("%v: %s", runnerErr, "this model may be incompatible with your version of Ollama. Please run `ollama pull` to get the latest version of this model.")
		}

		return nil, runnerErr
	}

	return nil, fmt.Errorf("failed to start a llama runner")
}

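// waitForServer polls the runner until it responds to a ping, the subprocess
// exits, or the timeout elapses.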
func waitForServer(llm *llama) error {
	start := time.Now()
	expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load
	ticker := time.NewTicker(200 * time.Millisecond)
	defer ticker.Stop()

	log.Print("waiting for llama runner to start responding")
	for {
		select {
		case <-llm.exitCh:
			// failed to start subprocess
			return fmt.Errorf("llama runner process has terminated")
		case <-ticker.C:
			if time.Now().After(expiresAt) {
				// timeout
				return fmt.Errorf("timed out waiting for llama runner to start")
			}

			if err := llm.Ping(context.Background()); err == nil {
				// success
				log.Printf("llama runner started in %f seconds", time.Since(start).Seconds())
				return nil
			}
		}
	}
}

func (llm *llama) Close() {
	// signal the sub-process to terminate
	llm.Cancel()

	// wait for the command to exit to prevent race conditions with the next run
	<-llm.exitCh

	if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
		log.Printf("llama runner stopped with error: %v", llm.StatusWriter.LastErrMsg)
	} else {
		log.Print("llama runner stopped successfully")
	}
}

func (llm *llama) SetOptions(opts api.Options) {
	llm.Options = opts
}

type prediction struct {
	Content string `json:"content"`
	Model   string `json:"model"`
	Prompt  string `json:"prompt"`
	Stop    bool   `json:"stop"`

	Timings struct {
		PredictedN  int     `json:"predicted_n"`
		PredictedMS float64 `json:"predicted_ms"`
		PromptN     int     `json:"prompt_n"`
		PromptMS    float64 `json:"prompt_ms"`
	}
}

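// maxBufferSize caps the scanner buffer used to read streamed prediction lines.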
const maxBufferSize = 512 * format.KiloByte

func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
	prevConvo, err := llm.Decode(ctx, prevContext)
	if err != nil {
		return err
	}

	// Remove leading spaces from prevConvo if present
	prevConvo = strings.TrimPrefix(prevConvo, " ")

	var nextContext strings.Builder
	nextContext.WriteString(prevConvo)
	nextContext.WriteString(prompt)

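	// build the llama.cpp server /completion request from the model options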
	request := map[string]any{
		"prompt":            nextContext.String(),
		"stream":            true,
		"n_predict":         llm.NumPredict,
		"n_keep":            llm.NumKeep,
		"temperature":       llm.Temperature,
		"top_k":             llm.TopK,
		"top_p":             llm.TopP,
		"tfs_z":             llm.TFSZ,
		"typical_p":         llm.TypicalP,
		"repeat_last_n":     llm.RepeatLastN,
		"repeat_penalty":    llm.RepeatPenalty,
		"presence_penalty":  llm.PresencePenalty,
		"frequency_penalty": llm.FrequencyPenalty,
		"mirostat":          llm.Mirostat,
		"mirostat_tau":      llm.MirostatTau,
		"mirostat_eta":      llm.MirostatEta,
		"penalize_nl":       llm.PenalizeNewline,
		"seed":              llm.Seed,
		"stop":              llm.Stop,
	}

	// marshal the request with HTML escaping disabled so characters such as
	// '<', '>', and '&' in the prompt are sent to the runner verbatim
	buffer := &bytes.Buffer{}
	enc := json.NewEncoder(buffer)
	enc.SetEscapeHTML(false)

	if err := enc.Encode(request); err != nil {
		return fmt.Errorf("failed to marshal data: %v", err)
	}

	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", llm.Port)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
	if err != nil {
		return fmt.Errorf("error creating POST request: %v", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return fmt.Errorf("POST predict: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		bodyBytes, err := io.ReadAll(resp.Body)
		if err != nil {
			return fmt.Errorf("failed reading llm error response: %w", err)
		}
		log.Printf("llm predict error: %s", bodyBytes)
		return fmt.Errorf("%s", bodyBytes)
	}

	scanner := bufio.NewScanner(resp.Body)
	// increase the buffer size to avoid running out of space
	buf := make([]byte, 0, maxBufferSize)
	scanner.Buffer(buf, maxBufferSize)
	for scanner.Scan() {
		select {
		case <-ctx.Done():
			// This handles the request cancellation
			return ctx.Err()
		default:
			line := scanner.Bytes()
			if len(line) == 0 {
				continue
			}

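			// the server streams results as server-sent events; each payload line is prefixed with "data: "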
			if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
				var p prediction
				if err := json.Unmarshal(evt, &p); err != nil {
					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
				}

				if p.Content != "" {
					fn(api.GenerateResponse{Response: p.Content})
					nextContext.WriteString(p.Content)
				}

				if p.Stop {
					embd, err := llm.Encode(ctx, nextContext.String())
					if err != nil {
						return fmt.Errorf("encoding context: %v", err)
					}

					fn(api.GenerateResponse{
						Done:               true,
						Context:            embd,
						PromptEvalCount:    p.Timings.PromptN,
						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
						EvalCount:          p.Timings.PredictedN,
						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
					})

					return nil
				}
			}
		}
	}

	if err := scanner.Err(); err != nil {
		if strings.Contains(err.Error(), "unexpected EOF") {
			// this means the llama runner subprocess crashed
			llm.Close()
			if llm.StatusWriter != nil && llm.StatusWriter.LastErrMsg != "" {
				return fmt.Errorf("llama runner exited: %v", llm.StatusWriter.LastErrMsg)
			}
			return fmt.Errorf("llama runner exited, you may not have enough available memory to run this model")
		}
		return fmt.Errorf("error reading llm response: %v", err)
	}

	return nil
}

type TokenizeRequest struct {
	Content string `json:"content"`
}

type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

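// Encode tokenizes prompt via the runner's /tokenize endpoint.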
func (llm *llama) Encode(ctx context.Context, prompt string) ([]int, error) {
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/tokenize", llm.Port)
	data, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return nil, fmt.Errorf("encode request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("do encode request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read encode request: %w", err)
	}

	if resp.StatusCode >= 400 {
		log.Printf("llm encode error: %s", body)
		return nil, fmt.Errorf("%s", body)
	}

	var encoded TokenizeResponse
	if err := json.Unmarshal(body, &encoded); err != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", err)
	}

	return encoded.Tokens, nil
}

type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

type DetokenizeResponse struct {
	Content string `json:"content"`
}

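// Decode converts tokens back to text via the runner's /detokenize endpoint.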
func (llm *llama) Decode(ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/detokenize", llm.Port)
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return "", fmt.Errorf("decode request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("do decode request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read decode request: %w", err)
	}

	if resp.StatusCode >= 400 {
		log.Printf("llm decode error: %s", body)
		return "", fmt.Errorf("%s", body)
	}

	var decoded DetokenizeResponse
	if err := json.Unmarshal(body, &decoded); err != nil {
		return "", fmt.Errorf("unmarshal encode response: %w", err)
	}

	return decoded.Content, nil
}

type EmbeddingRequest struct {
	Content string `json:"content"`
}

type EmbeddingResponse struct {
	Embedding []float64 `json:"embedding"`
}

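// Embedding returns the embedding vector for input via the runner's /embedding endpoint.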
func (llm *llama) Embedding(ctx context.Context, input string) ([]float64, error) {
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/embedding", llm.Port)
	data, err := json.Marshal(EmbeddingRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(data))
	if err != nil {
		return nil, fmt.Errorf("error creating embed request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("POST embedding: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error reading embed response: %w", err)
	}

	if resp.StatusCode >= 400 {
		log.Printf("llm encode error: %s", body)
		return nil, fmt.Errorf("%s", body)
	}

	var embedding EmbeddingResponse
	if err := json.Unmarshal(body, &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
	}

	return embedding.Embedding, nil
}

// Ping checks that the server subprocess is still running and responding to requests
func (llm *llama) Ping(ctx context.Context) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, fmt.Sprintf("http://127.0.0.1:%d", llm.Port), nil)
	if err != nil {
		return fmt.Errorf("ping request: %w", err)
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return fmt.Errorf("ping resp: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected ping status: %s", resp.Status)
	}
	return nil
}