Add cgo implementation for llama.cpp

Run llama.cpp's server.cpp directly inside the Go runtime via cgo, while retaining the Go LLM abstractions.

Add cgo implementation for llama.cpp
Run llama.cpp's server.cpp directly inside the Go runtime via cgo, while retaining the Go LLM abstractions.
d4cd6957 · Daniel Hiltgen · 5e7fd690 · d4cd6957 · d4cd6957 · d4cd6957
Commit d4cd6957 authored Nov 13, 2023 by Daniel Hiltgen
7 changed files
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -18,7 +18,6 @@ type LLM interface {
 	Embedding(context.Context, string) ([]float64, error)
 	Encode(context.Context, string) ([]int, error)
 	Decode(context.Context, []int) (string, error)
-	SetOptions(api.Options)
 	Close()
 	Ping(context.Context) error
 }
@@ -79,5 +78,5 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	return newLlama(model, adapters, projectors, chooseRunners(workDir), ggml.NumLayers(), opts)
+	return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
 }
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -9,7 +9,7 @@ mkdir -p dist

 for TARGETARCH in arm64 amd64; do
    GOOS=darwin GOARCH=$TARGETARCH go generate ./...
-    GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
+    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH
    rm -rf llm/llama.cpp/*/build
 done


--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -7,7 +7,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version

 mkdir -p dist

-for TARGETARCH in arm64 amd64; do
+for TARGETARCH in amd64 arm64; do
    docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH

--- a/scripts/setup_integration_tests.sh
+++ b/scripts/setup_integration_tests.sh
+#!/bin/bash
+
+# This script sets up integration tests which run the full stack to verify
+# inference locally
+set -e
+set -o pipefail
+
+REPO=$(dirname $0)/../
+export OLLAMA_MODELS=${REPO}/test_data/models
+REGISTRY_SCHEME=https
+REGISTRY=registry.ollama.ai
+TEST_MODEL=library/orca-mini
+TEST_MODEL_TAG=latest
+ACCEPT_HEADER="Accept: application/vnd.docker.distribution.manifest.v2+json"
+
+mkdir -p ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/
+mkdir -p ${OLLAMA_MODELS}/blobs/
+
+echo "Pulling manifest for ${TEST_MODEL}:${TEST_MODEL_TAG}"
+curl -s --header "${ACCEPT_HEADER}" \
+    -o ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} \
+    ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/manifests/${TEST_MODEL_TAG} 
+
+CFG_HASH=$(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".config.digest")
+echo "Pulling config blob ${CFG_HASH}"
+curl -L -C - --header "${ACCEPT_HEADER}" \
+        -o ${OLLAMA_MODELS}/blobs/${CFG_HASH} \
+        ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${CFG_HASH}
+
+for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_MODEL_TAG} | jq -r ".layers[].digest" ) ; do
+    echo "Pulling blob ${LAYER}"
+    curl -L -C - --header "${ACCEPT_HEADER}" \
+        -o ${OLLAMA_MODELS}/blobs/${LAYER} \
+        ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER}
+done
\ No newline at end of file
--- a/server/llm_test.go
+++ b/server/llm_test.go
+package server
+
+import (
+	"context"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
+//        package to avoid circular dependencies
+
+// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
+//
+// TODO - Fix this ^^
+
+var (
+	// req holds the generate requests shared by the integration tests in this
+	// file; the tests expect the response to req[i] to contain resp[i].
+	req = [2]api.GenerateRequest{
+		{
+			Model:   "orca-mini",
+			Prompt:  "tell me a short story about agi?",
+			Options: map[string]interface{}{},
+		}, {
+			Model:   "orca-mini",
+			Prompt:  "what is the origin of the us thanksgiving holiday?",
+			Options: map[string]interface{}{},
+		},
+	}
+	// resp holds, per entry of req, a lowercase phrase that the tests look for
+	// in the lowercased model response.
+	resp = [2]string{
+		"once upon a time",
+		"fourth thursday",
+	}
+)
+
+func TestIntegrationSimpleOrcaMini(t *testing.T) {
+	SkipIFNoTestData(t)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
+	defer cancel()
+	opts := api.DefaultOptions()
+	opts.Seed = 42
+	opts.Temperature = 0.0
+	model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)
+	defer llmRunner.Close()
+	response := OneShotPromptResponse(t, ctx, req[0], model, llmRunner)
+	assert.Contains(t, strings.ToLower(response), resp[0])
+}
+
+// TestIntegrationConcurrentPredictOrcaMini issues every prompt in req
+// concurrently against one shared runner and checks each response for its
+// expected phrase. It is currently always skipped (see the TODO below).
+//
+// TODO
+// The server always loads a new runner and closes the old one, which forces serial execution
+// At present this test case fails with concurrency problems.  Eventually we should try to
+// get true concurrency working with n_parallel support in the backend
+func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
+	SkipIFNoTestData(t)
+	t.Skip("concurrent prediction on single runner not currently supported")
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
+	defer cancel()
+	opts := api.DefaultOptions()
+	opts.Seed = 42
+	opts.Temperature = 0.0
+	var wg sync.WaitGroup
+	wg.Add(len(req))
+	// A single runner is shared by all goroutines in this test.
+	model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)
+	defer llmRunner.Close()
+	for i := 0; i < len(req); i++ {
+		// i is passed as an argument so each goroutine works on its own index.
+		go func(i int) {
+			defer wg.Done()
+			response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)
+			// Log the prompt this goroutine actually sent (req[i], not req[0]).
+			t.Logf("Prompt: %s\nResponse: %s", req[i].Prompt, response)
+			assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)
+		}(i)
+	}
+	wg.Wait()
+}
+
+// TestIntegrationConcurrentRunnersOrcaMini runs every prompt in req
+// concurrently, each goroutine preparing and closing its own runner, and
+// checks each response for its expected phrase. It is skipped when the test
+// model directory does not exist.
+//
+// NOTE(review): PrepareModelForPrompts and OneShotPromptResponse use require,
+// which calls t.FailNow; the testing package documents that FailNow must be
+// called from the goroutine running the test, not from goroutines it spawns.
+func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
+	SkipIFNoTestData(t)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
+	defer cancel()
+	opts := api.DefaultOptions()
+	opts.Seed = 42
+	opts.Temperature = 0.0
+	var wg sync.WaitGroup
+	wg.Add(len(req))
+
+	for i := 0; i < len(req); i++ {
+		// i is passed as an argument so each goroutine works on its own index.
+		go func(i int) {
+			defer wg.Done()
+			// Load the model named by this goroutine's own request.
+			model, llmRunner := PrepareModelForPrompts(t, req[i].Model, opts)
+			defer llmRunner.Close()
+			response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)
+			// Log the prompt this goroutine actually sent (req[i], not req[0]).
+			t.Logf("Prompt: %s\nResponse: %s", req[i].Prompt, response)
+			assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)
+		}(i)
+	}
+	wg.Wait()
+}
+
+// TODO - create a parallel test with 2 different models once we support concurrency
--- a/server/llm_utils_test.go
+++ b/server/llm_utils_test.go
+package server
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/llm"
+	"github.com/stretchr/testify/require"
+)
+
+// SkipIFNoTestData skips the calling test when the directory reported by
+// getModelDir does not exist.
+func SkipIFNoTestData(t *testing.T) {
+	dir := getModelDir()
+	_, statErr := os.Stat(dir)
+	if !errors.Is(statErr, os.ErrNotExist) {
+		// Directory is present (or Stat failed for another reason): run the test.
+		return
+	}
+	t.Skipf("%s does not exist - skipping integration tests", dir)
+}
+
+// getModelDir returns the cleaned path of the test_data/models directory that
+// is a sibling of the directory containing this source file.
+func getModelDir() string {
+	// runtime.Caller(0) reports the path of this source file; the other
+	// results (pc, line, ok) are not needed here.
+	_, filename, _, _ := runtime.Caller(0)
+	// path.Join cleans its result, so the ".." element is resolved lexically.
+	return path.Join(path.Dir(filename), "..", "test_data", "models")
+}
+
+// PrepareModelForPrompts sets OLLAMA_MODELS to the test model directory, looks
+// up modelName with GetModel, applies the model's options to opts via FromMap
+// and creates a runner with llm.New. Any failure aborts through require.
+// It returns the model and the runner; the tests in this package close the
+// runner with a deferred Close.
+func PrepareModelForPrompts(t *testing.T, modelName string, opts api.Options) (*Model, llm.LLM) {
+	modelDir := getModelDir()
+	// Set before GetModel is called; presumably GetModel resolves models via
+	// OLLAMA_MODELS — confirm against its implementation.
+	err := os.Setenv("OLLAMA_MODELS", modelDir)
+	require.NoError(t, err, "setting OLLAMA_MODELS")
+	model, err := GetModel(modelName)
+	require.NoError(t, err, "GetModel ")
+	err = opts.FromMap(model.Options)
+	require.NoError(t, err, "opts from model ")
+	// The first argument is llm.New's workDir parameter; a placeholder is passed.
+	runner, err := llm.New("unused", model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+	require.NoError(t, err, "llm.New failed")
+	return model, runner
+}
+
+func OneShotPromptResponse(t *testing.T, ctx context.Context, req api.GenerateRequest, model *Model, runner llm.LLM) string {
+	checkpointStart := time.Now()
+	prompt, err := model.Prompt(PromptVars{
+		System: req.System,
+		Prompt: req.Prompt,
+		First:  len(req.Context) == 0,
+	})
+	require.NoError(t, err, "prompt generation failed")
+	success := make(chan bool, 1)
+	response := ""
+	cb := func(r llm.PredictResult) {
+
+		if !r.Done {
+			response += r.Content
+		} else {
+			success <- true
+		}
+	}
+	checkpointLoaded := time.Now()
+	predictReq := llm.PredictOpts{
+		Prompt:           prompt,
+		Format:           req.Format,
+		CheckpointStart:  checkpointStart,
+		CheckpointLoaded: checkpointLoaded,
+	}
+	err = runner.Predict(ctx, predictReq, cb)
+	require.NoError(t, err, "predict call failed")
+
+	select {
+	case <-ctx.Done():
+		t.Errorf("failed to complete before timeout: \n%s", response)
+		return ""
+	case <-success:
+		return response
+	}
+}
--- a/server/routes.go
+++ b/server/routes.go
@@ -126,10 +126,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
 		loaded.Options = &opts
 	}

-	// update options for the loaded llm
-	// TODO(mxyng): this isn't thread safe, but it should be fine for now
-	loaded.runner.SetOptions(opts)
-
 	loaded.expireAt = time.Now().Add(sessionDuration)

 	if loaded.expireTimer == nil {