Unverified commit 4c4c730a authored by mraiser, committed by GitHub

Merge branch 'ollama:main' into main

parents 6eb3cddc e02ecfb6
...@@ -23,29 +23,72 @@ jobs:
with:
go-version: '1.21'
cache: true
- if: ${{ startsWith(matrix.os, 'windows-') }}
shell: pwsh
run: |
$path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
if ($path) {
$path = join-path $path 'Common7\Tools\vsdevcmd.bat'
if (test-path $path) {
cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
}
}
}
echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
- run: go get ./...
- run: go generate -x ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-cuda:
strategy:
matrix:
cuda-version:
- '11.8.0'
runs-on: ubuntu-latest
container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
generate-rocm:
strategy:
matrix:
rocm-version:
- '5.7.1'
- '6.0'
runs-on: ubuntu-latest
container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
steps:
- run: |
apt-get update && apt-get install -y git build-essential curl rocm-libs
curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
| tar -zx -C /usr --strip-components 1
env:
DEBIAN_FRONTEND: noninteractive
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: '1.21'
cache: true
- run: go get ./...
- run: |
git config --global --add safe.directory /__w/ollama/ollama
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
- uses: actions/upload-artifact@v4
with:
name: rocm-${{ matrix.rocm-version }}-libraries
path: llm/llama.cpp/build/**/lib/*
lint:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
...@@ -69,10 +112,19 @@ jobs:
with:
go-version: '1.21'
cache: false
- run: |
mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: |
mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
touch llm/llama.cpp/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }}
- run: |
mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: golangci/golangci-lint-action@v3
test:
needs: generate
...@@ -104,3 +156,7 @@ jobs:
path: llm/llama.cpp/build
- run: go build
- run: go test -v ./...
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}-binaries
path: ollama
...@@ -109,17 +109,28 @@ ARG CGO_CFLAGS
RUN go build .
# Runtime stages
FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
RUN apt-get update && apt-get install -y ca-certificates
COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
RUN update-pciids
COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENTRYPOINT ["/bin/ollama"]
CMD ["serve"]
FROM runtime-$TARGETARCH
EXPOSE 11434
ENV OLLAMA_HOST 0.0.0.0
ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENTRYPOINT ["/bin/ollama"]
......
...@@ -34,24 +34,26 @@ func (e StatusError) Error() string {
type ImageData []byte
type GenerateRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
System string `json:"system"`
Template string `json:"template"`
Context []int `json:"context,omitempty"`
Stream *bool `json:"stream,omitempty"`
Raw bool `json:"raw,omitempty"`
Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Images []ImageData `json:"images,omitempty"`
Options map[string]interface{} `json:"options"`
}
type ChatRequest struct {
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream *bool `json:"stream,omitempty"`
Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"`
}
...@@ -126,8 +128,9 @@ type Runner struct {
}
type EmbeddingRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
Options map[string]interface{} `json:"options"`
}
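The `keep_alive` field added above is optional on generate, chat, and embedding requests. A minimal client-side sketch of how it can be set; the model name, duration, and prompt here are illustrative, not taken from this diff:

```go
package main

import (
	"time"

	"github.com/jmorganca/ollama/api"
)

func main() {
	// Keep the model loaded for 10 minutes after this call returns.
	keepAlive := api.Duration{Duration: 10 * time.Minute}

	req := &api.ChatRequest{
		Model:     "llama2", // illustrative model name
		Messages:  []api.Message{{Role: "user", Content: "hello"}},
		KeepAlive: &keepAlive, // omit for the server default; a negative value keeps the model loaded indefinitely
	}
	_ = req // pass to client.Chat(ctx, req, fn) as usual
}
```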
...@@ -171,6 +174,7 @@ type ShowResponse struct {
Template string `json:"template,omitempty"`
System string `json:"system,omitempty"`
Details ModelDetails `json:"details,omitempty"`
Messages []Message `json:"messages,omitempty"`
}
type CopyRequest struct {
...@@ -236,6 +240,7 @@ type GenerateResponse struct {
}
type ModelDetails struct {
ParentModel string `json:"parent_model"`
Format string `json:"format"`
Family string `json:"family"`
Families []string `json:"families"`
...@@ -411,14 +416,19 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
case float64:
if t < 0 {
t = math.MaxFloat64
d.Duration = time.Duration(t)
} else {
d.Duration = time.Duration(t * float64(time.Second))
}
case string:
d.Duration, err = time.ParseDuration(t)
if err != nil {
return err
}
if d.Duration < 0 {
mf := math.MaxFloat64
d.Duration = time.Duration(mf)
}
}
return nil
......
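The updated `UnmarshalJSON` above treats bare numbers as seconds and maps any negative value to an effectively infinite duration. A standalone sketch of that behaviour, using a local type that mirrors the diff; names and sample inputs are illustrative only:

```go
package main

import (
	"encoding/json"
	"fmt"
	"math"
	"time"
)

// keepAlive mirrors api.Duration's updated UnmarshalJSON for this sketch only.
type keepAlive struct{ time.Duration }

func (d *keepAlive) UnmarshalJSON(b []byte) error {
	var v any
	if err := json.Unmarshal(b, &v); err != nil {
		return err
	}
	switch t := v.(type) {
	case float64:
		if t < 0 {
			// negative numbers mean "keep the model loaded indefinitely"
			d.Duration = time.Duration(math.MaxFloat64)
		} else {
			// bare numbers are interpreted as seconds
			d.Duration = time.Duration(t * float64(time.Second))
		}
	case string:
		var err error
		if d.Duration, err = time.ParseDuration(t); err != nil {
			return err
		}
		if d.Duration < 0 {
			d.Duration = time.Duration(math.MaxFloat64)
		}
	}
	return nil
}

func main() {
	for _, raw := range []string{`300`, `"10m"`, `-1`} {
		var d keepAlive
		if err := json.Unmarshal([]byte(raw), &d); err != nil {
			panic(err)
		}
		fmt.Printf("%s -> %v\n", raw, d.Duration)
	}
}
```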
...@@ -458,15 +458,17 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
type generateContextKey string
type runOptions struct {
Model string
ParentModel string
Prompt string
Messages []api.Message
WordWrap bool
Format string
System string
Template string
Images []api.ImageData
Options map[string]interface{}
MultiModal bool
}
type displayResponseState struct {
......
...@@ -7,12 +7,14 @@ import (
"net/http"
"os"
"regexp"
"sort"
"strings"
"github.com/spf13/cobra"
"golang.org/x/exp/slices"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/progress"
"github.com/jmorganca/ollama/readline"
)
...@@ -25,43 +27,75 @@ const (
MultilineTemplate
)
func loadModel(cmd *cobra.Command, opts *runOptions) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
showReq := api.ShowRequest{Name: opts.Model}
showResp, err := client.Show(cmd.Context(), &showReq)
if err != nil {
return err
}
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
opts.ParentModel = showResp.Details.ParentModel
if len(showResp.Messages) > 0 {
opts.Messages = append(opts.Messages, showResp.Messages...)
}
chatReq := &api.ChatRequest{
Model: opts.Model,
Messages: []api.Message{},
}
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
if len(opts.Messages) > 0 {
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
}
return nil
})
if err != nil {
return err
}
return nil
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Messages = make([]api.Message, 0)
err := loadModel(cmd, &opts)
if err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
fmt.Fprintln(os.Stderr, "")
...@@ -140,7 +174,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
var sb strings.Builder
var multiline MultilineState
for {
line, err := scanner.Readline()
...@@ -203,6 +236,44 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if err := ListHandler(cmd, args[1:]); err != nil {
return err
}
case strings.HasPrefix(line, "/load"):
args := strings.Fields(line)
if len(args) != 2 {
fmt.Println("Usage:\n /load <modelname>")
continue
}
opts.Model = args[1]
opts.Messages = []api.Message{}
fmt.Printf("Loading model '%s'\n", opts.Model)
if err := loadModel(cmd, &opts); err != nil {
return err
}
continue
case strings.HasPrefix(line, "/save"):
args := strings.Fields(line)
if len(args) != 2 {
fmt.Println("Usage:\n /save <modelname>")
continue
}
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.CreateRequest{
Name: args[1],
Modelfile: buildModelfile(opts),
}
fn := func(resp api.ProgressResponse) error { return nil }
err = client.Create(cmd.Context(), req, fn)
if err != nil {
fmt.Println("error: couldn't save model")
return err
}
fmt.Printf("Created new model '%s'\n", args[1])
continue
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {
...@@ -389,7 +460,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
args := strings.Fields(line)
isFile := false
if opts.MultiModal {
for _, f := range extractFileNames(line) {
if strings.HasPrefix(f, args[0]) {
isFile = true
...@@ -411,7 +482,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
if sb.Len() > 0 && multiline == MultilineNone {
newMessage := api.Message{Role: "user", Content: sb.String()}
if opts.MultiModal {
msg, images, err := extractFileData(sb.String())
if err != nil {
return err
...@@ -454,6 +525,38 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
}
}
func buildModelfile(opts runOptions) string {
var mf strings.Builder
model := opts.ParentModel
if model == "" {
model = opts.Model
}
fmt.Fprintf(&mf, "FROM %s\n", model)
if opts.System != "" {
fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
}
if opts.Template != "" {
fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template)
}
keys := make([]string, 0)
for k := range opts.Options {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
}
fmt.Fprintln(&mf)
for _, msg := range opts.Messages {
fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
}
return mf.String()
}
func normalizeFilePath(fp string) string {
// Define a map of escaped characters and their replacements
replacements := map[string]string{
......
package cmd
import (
"bytes"
"testing"
"text/template"
"github.com/stretchr/testify/assert"
"github.com/jmorganca/ollama/api"
)
func TestExtractFilenames(t *testing.T) {
...@@ -49,3 +53,64 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
assert.Contains(t, res[9], "ten.svg")
assert.Contains(t, res[9], "E:")
}
func TestModelfileBuilder(t *testing.T) {
opts := runOptions{
Model: "hork",
System: "You are part horse and part shark, but all hork. Do horklike things",
Template: "This is a template.",
Messages: []api.Message{
{Role: "user", Content: "Hey there hork!"},
{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
},
Options: map[string]interface{}{},
}
opts.Options["temperature"] = 0.9
opts.Options["seed"] = 42
opts.Options["penalize_newline"] = false
opts.Options["stop"] = []string{"hi", "there"}
mf := buildModelfile(opts)
expectedModelfile := `FROM {{.Model}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
PARAMETER temperature 0.9
MESSAGE user """Hey there hork!"""
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err := template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
var buf bytes.Buffer
err = tmpl.Execute(&buf, opts)
assert.Nil(t, err)
assert.Equal(t, buf.String(), mf)
opts.ParentModel = "horseshark"
mf = buildModelfile(opts)
expectedModelfile = `FROM {{.ParentModel}}
SYSTEM """{{.System}}"""
TEMPLATE """{{.Template}}"""
PARAMETER penalize_newline false
PARAMETER seed 42
PARAMETER stop [hi there]
PARAMETER temperature 0.9
MESSAGE user """Hey there hork!"""
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
`
tmpl, err = template.New("").Parse(expectedModelfile)
assert.Nil(t, err)
var parentBuf bytes.Buffer
err = tmpl.Execute(&parentBuf, opts)
assert.Nil(t, err)
assert.Equal(t, parentBuf.String(), mf)
}
...@@ -50,7 +50,8 @@ development and runtime packages.
Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
......
...@@ -19,6 +19,7 @@ A model file is the blueprint to create and share models with Ollama.
- [SYSTEM](#system)
- [ADAPTER](#adapter)
- [LICENSE](#license)
- [MESSAGE](#message)
- [Notes](#notes)
## Format
...@@ -38,6 +39,7 @@ INSTRUCTION arguments
| [`SYSTEM`](#system) | Specifies the system message that will be set in the template. |
| [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
| [`LICENSE`](#license) | Specifies the legal license. |
| [`MESSAGE`](#message) | Specify message history. |
## Examples
...@@ -205,6 +207,19 @@ LICENSE """
"""
```
### MESSAGE
The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
```modelfile
MESSAGE user Is Toronto in Canada?
MESSAGE assistant yes
MESSAGE user Is Sacramento in Canada?
MESSAGE assistant no
MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
......
...@@ -16,6 +16,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"unsafe"
...@@ -29,8 +30,8 @@ type handles struct {
var gpuMutex sync.Mutex
var gpuHandles *handles = nil
// With our current CUDA compile flags, older than 5.0 will not work properly
var CudaComputeMin = [2]C.int{5, 0}
// Possible locations for the nvidia-ml library
var CudaLinuxGlobs = []string{
...@@ -121,9 +122,15 @@ func GetGPUInfo() GpuInfo {
initGPUHandles()
}
// All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX
cpuVariant := GetCPUVariant()
if cpuVariant == "" {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
var memInfo C.mem_info_t
resp := GpuInfo{}
if gpuHandles.cuda != nil && cpuVariant != "" {
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
...@@ -135,19 +142,40 @@ func GetGPUInfo() GpuInfo {
if cc.err != nil {
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
C.free(unsafe.Pointer(cc.err))
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
resp.Library = "cuda"
} else {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
}
}
} else if gpuHandles.rocm != nil && cpuVariant != "" {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
}
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
}
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
...@@ -163,7 +191,7 @@ func GetGPUInfo() GpuInfo {
if resp.Library == "" {
C.cpu_check_ram(&memInfo)
resp.Library = "cpu"
resp.Variant = cpuVariant
}
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))
...@@ -199,7 +227,9 @@ func CheckVRAM() (int64, error) {
if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024
}
avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
}
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
......
...@@ -42,6 +42,7 @@ typedef struct mem_info {
uint64_t total;
uint64_t free;
unsigned int count;
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
char *err; // If non-null, caller responsible for freeing
} mem_info_t;
......
...@@ -70,6 +70,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// Report driver version if we're in verbose mode, ignore errors
......
...@@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
resp->igpu_index = -1;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
...@@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
}
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
if (totalMem < 1024 * 1024 * 1024) {
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
resp->igpu_index = i;
} else {
resp->total += totalMem;
resp->free += totalMem - usedMem;
}
}
}
...@@ -171,7 +178,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("rocm handle not initialized");
resp->status = 1;
return;
}
...@@ -188,4 +195,4 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
resp->str = strdup(buf);
}
#endif // __APPLE__
\ No newline at end of file
...@@ -190,6 +190,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": imageData,
"cache_prompt": true,
}
if predict.Format == "json" {
......
...@@ -39,6 +39,9 @@ init_vars() {
*)
;;
esac
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi
}
git_module_setup() {
...@@ -61,6 +64,17 @@ apply_patches() {
if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
fi
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
# Avoid duplicate main symbols when we link into the cgo binary
sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
......
...@@ -140,7 +140,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build
......
...@@ -25,6 +25,11 @@ function init_vars {
}
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
} else {
$script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
}
}
function git_module_setup {
...@@ -40,6 +45,29 @@ function apply_patches {
if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
}
# Apply temporary patches until fix is upstream
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
foreach ($file in $filePaths) {
Set-Location -Path ${script:llamacppDir}
git checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir}
git apply $patch.FullName
}
# Avoid duplicate main symbols when we link into the cgo binary
$content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
$content = $content -replace 'int main\(', 'int __main('
...@@ -128,7 +156,7 @@ if ($null -ne $script:CUDA_LIB_DIR) {
}
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
build
install
cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
......
...@@ -69,12 +69,65 @@ type tensor struct {
name string
kind uint32
offset uint64
// shape is the number of elements in each dimension
shape [4]uint64
}
func (t tensor) blockSize() uint64 {
switch {
case t.kind < 2:
return 1
case t.kind < 10:
return 32
default:
return 256
}
}
func (t tensor) typeSize() uint64 {
blockSize := t.blockSize()
switch t.kind {
case 0: // FP32
return 4
case 1: // FP16
return 2
case 2: // Q4_0
return 2 + blockSize/2
case 3: // Q4_1
return 2 + 2 + blockSize/2
case 6: // Q5_0
return 2 + 4 + blockSize/2
case 7: // Q5_1
return 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
return 2 + blockSize
case 9: // Q8_1
return 4 + 4 + blockSize
case 10: // Q2_K
return blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
return blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
return 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
return 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
return blockSize/2 + blockSize/4 + blockSize/16 + 2
default:
return 0
}
}
func (t tensor) parameters() uint64 {
return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
}
func (t tensor) size() uint64 {
return t.parameters() * t.typeSize() / t.blockSize()
}
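As a rough worked example of the helpers above (the tensor shape is made up for illustration): a Q4_0 tensor (kind 2) uses 32-element blocks, each stored as a 2-byte scale followed by 32 packed 4-bit weights, i.e. 18 bytes per block.

```go
package main

import "fmt"

func main() {
	// Hypothetical 4096 x 4096 Q4_0 tensor, for illustration only.
	shape := [4]uint64{4096, 4096, 1, 1}

	params := shape[0] * shape[1] * shape[2] * shape[3] // 16,777,216 elements
	blockSize := uint64(32)                             // kinds 2..9 use 32-element blocks
	typeSize := uint64(2 + blockSize/2)                 // Q4_0: 2-byte scale + 16 bytes of packed nibbles = 18

	size := params * typeSize / blockSize // bytes the tensor occupies in the file
	fmt.Println(size)                     // 9437184, about 9 MiB
}
```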
type ggufModel struct {
*containerGGUF
...@@ -201,61 +254,15 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
shape[i] = llm.readU64(rso)
}
tensor := tensor{
name: name,
kind: llm.readU32(rso),
offset: llm.readU64(rso),
shape: shape,
}
llm.tensors = append(llm.tensors, tensor)
llm.parameters += tensor.parameters()
}
alignment, ok := llm.kv["general.alignment"].(uint32)
...@@ -265,7 +272,7 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
for _, tensor := range llm.tensors {
padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
rso.Seek(padded, io.SeekCurrent)
}
......
Subproject commit cd4fddb29f81d6a1f6d51a0c016bc6b486d68def
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0462fbd2..4fa7b57f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1857,12 +1857,6 @@ struct llama_server_context
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
}
- LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
- llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
- slot.cache_tokens = prompt_tokens;
-
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
@@ -1870,6 +1864,12 @@ struct llama_server_context
slot.n_past--;
}
+ LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+ llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
+ slot.cache_tokens = prompt_tokens;
+
LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past},
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
...@@ -7,6 +7,7 @@ import (
"fmt"
"io"
"log/slog"
"slices"
)
type Command struct {
...@@ -56,6 +57,16 @@ func Parse(reader io.Reader) ([]Command, error) {
command.Args = string(bytes.TrimSpace(fields[1]))
case "EMBED":
return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
case "MESSAGE":
command.Name = string(bytes.ToLower(fields[0]))
fields = bytes.SplitN(fields[1], []byte(" "), 2)
if len(fields) < 2 {
return nil, fmt.Errorf("should be in the format <role> <message>")
}
if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
}
command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
default:
if !bytes.HasPrefix(fields[0], []byte("#")) {
// log a warning for unknown commands
......
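A standalone sketch of how the new `MESSAGE` case splits its argument into a role and a message; the sample line and the `main` wrapper are illustrative, not part of the parser package:

```go
package main

import (
	"bytes"
	"fmt"
	"slices"
)

func main() {
	// What follows the MESSAGE keyword on a Modelfile line.
	rest := []byte("user Is Toronto in Canada?")

	// Split into role and message, mirroring the parser's bytes.SplitN call.
	fields := bytes.SplitN(rest, []byte(" "), 2)
	if len(fields) < 2 {
		fmt.Println("should be in the format <role> <message>")
		return
	}

	role := string(bytes.ToLower(fields[0]))
	if !slices.Contains([]string{"system", "user", "assistant"}, role) {
		fmt.Println(`role must be one of "system", "user", or "assistant"`)
		return
	}

	// The parser stores the command args as "<role>: <message>".
	fmt.Printf("message %s: %s\n", role, fields[1])
}
```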