Commit 920a4b07 authored by Daniel Hiltgen

Merge remote-tracking branch 'upstream/main' into pr3702

parents c496967e ee49844d
@@ -7,12 +7,24 @@
## Steps

-1. Create the Ollama namespace, daemon set, and service
+1. Create the Ollama namespace, deployment, and service

```bash
kubectl apply -f cpu.yaml
```

+## (Optional) Hardware Acceleration
+
+Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin), which is deployed in Kubernetes as a DaemonSet. Follow the link for more details.
+
+Once configured, create a GPU-enabled Ollama deployment.
+
+```bash
+kubectl apply -f gpu.yaml
+```
+
## Test

1. Port forward the Ollama service to connect and use it locally

```bash
@@ -24,13 +36,3 @@
```bash
ollama run orca-mini:3b
```
\ No newline at end of file
-
-## (Optional) Hardware Acceleration
-
-Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.
-
-Once configured, create a GPU enabled Ollama deployment.
-
-```bash
-kubectl apply -f gpu.yaml
-```
@@ -51,7 +51,7 @@ while True:
        template=template,
    )
-    llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
...
-from langchain.llms import Ollama
-from langchain.document_loaders import WebBaseLoader
+from langchain_community.llms import Ollama
+from langchain_community.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain

loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()

-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
chain = load_summarize_chain(llm, chain_type="stuff")

-result = chain.run(docs)
+result = chain.invoke(docs)
print(result)
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

```bash
-ollama pull llama2
+ollama pull llama3
```

2. Install the Python Requirements.
@@ -21,4 +21,3 @@ This example is a basic "hello world" of using LangChain with Ollama.
```bash
python main.py
```
\ No newline at end of file
from langchain.llms import Ollama

input = input("What is your question?")
-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
res = llm.predict(input)
print (res)
-FROM llama2
+FROM llama3
PARAMETER temperature 1
SYSTEM """
You are Mario from super mario bros, acting as an assistant.
...
@@ -2,12 +2,12 @@
# Example character: Mario

-This example shows how to create a basic character using Llama2 as the base model.
+This example shows how to create a basic character using Llama3 as the base model.

To run this example:

1. Download the Modelfile
-2. `ollama pull llama2` to get the base model used in the model file.
+2. `ollama pull llama3` to get the base model used in the model file.
3. `ollama create NAME -f ./Modelfile`
4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
What the model file looks like:

```
-FROM llama2
+FROM llama3
PARAMETER temperature 1
SYSTEM """
You are Mario from Super Mario Bros, acting as an assistant.
...
@@ -2,7 +2,7 @@ import requests
import json
import random

-model = "llama2"
+model = "llama3"
template = {
    "firstName": "",
    "lastName": "",
...
@@ -12,7 +12,7 @@ countries = [
    "France",
]
country = random.choice(countries)
-model = "llama2"
+model = "llama3"

prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
...
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

```bash
-ollama pull llama2
+ollama pull llama3
```

2. Install the Python Requirements.
...
@@ -2,7 +2,7 @@ import json
import requests

# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama2" # TODO: update this for whatever model you wish to use
+model = "llama3" # TODO: update this for whatever model you wish to use

def chat(messages):
...
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

```bash
-ollama pull llama2
+ollama pull llama3
```

2. Install the Python Requirements.
...
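The example above exercises Ollama's **chat** endpoint. For orientation, a minimal non-streaming request against that endpoint looks roughly like the sketch below (not part of this commit; written in Go rather than the example's Python, and assuming a local server on the default port with `llama3` pulled):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

type message struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

func main() {
	// One chat turn against a locally running Ollama server.
	body, _ := json.Marshal(map[string]any{
		"model":    "llama3",
		"messages": []message{{Role: "user", Content: "Why is the sky blue?"}},
		"stream":   false, // single JSON response instead of a stream
	})

	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The non-streaming response carries the reply under "message".
	var out struct {
		Message message `json:"message"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Message.Content)
}
```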
@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
## Usage

-1. Add llama2 to have the mentors ask your questions:
+1. Add llama3 to have the mentors ask your questions:

```bash
-ollama pull llama2
+ollama pull llama3
```

2. Install prerequisites:
...
@@ -15,7 +15,7 @@ async function characterGenerator() {
  ollama.setModel("stablebeluga2:70b-q4_K_M");
  const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);

-  const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
+  const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;

  fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
    if (err) throw err;
...
import * as readline from "readline";

-const model = "llama2";
+const model = "llama3";
type Message = {
  role: "assistant" | "user" | "system";
  content: string;
...
@@ -15,6 +15,7 @@ const (
    KibiByte = Byte * 1024
    MebiByte = KibiByte * 1024
+   GibiByte = MebiByte * 1024
)

func HumanBytes(b int64) string {
@@ -52,6 +53,8 @@ func HumanBytes(b int64) string {
func HumanBytes2(b uint64) string {
    switch {
+   case b >= GibiByte:
+       return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
    case b >= MebiByte:
        return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
    case b >= KibiByte:
...
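The new `GibiByte` constant and the `case b >= GibiByte` branch mean sizes of a gibibyte or more are now reported in GiB instead of as a large MiB figure. A minimal sketch of the resulting behaviour (not part of this commit; the `format` import path is an assumption about the repository layout):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/format" // assumed import path for the format package shown above
)

func main() {
	// 3 GiB now takes the new GibiByte branch and prints "3.0 GiB";
	// before this change it fell through to the MiB case as "3072.0 MiB".
	fmt.Println(format.HumanBytes2(3 * 1024 * 1024 * 1024))

	// Values below one gibibyte keep the previous behaviour.
	fmt.Println(format.HumanBytes2(512 * 1024 * 1024)) // "512.0 MiB"
}
```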
@@ -13,12 +13,20 @@ const (
func HumanNumber(b uint64) string {
    switch {
-   case b > Billion:
-       return fmt.Sprintf("%.0fB", math.Round(float64(b)/Billion))
-   case b > Million:
-       return fmt.Sprintf("%.0fM", math.Round(float64(b)/Million))
-   case b > Thousand:
-       return fmt.Sprintf("%.0fK", math.Round(float64(b)/Thousand))
+   case b >= Billion:
+       number := float64(b) / Billion
+       if number == math.Floor(number) {
+           return fmt.Sprintf("%.0fB", number) // no decimals if whole number
+       }
+       return fmt.Sprintf("%.1fB", number) // one decimal if not a whole number
+   case b >= Million:
+       number := float64(b) / Million
+       if number == math.Floor(number) {
+           return fmt.Sprintf("%.0fM", number) // no decimals if whole number
+       }
+       return fmt.Sprintf("%.2fM", number) // two decimals if not a whole number
+   case b >= Thousand:
+       return fmt.Sprintf("%.0fK", float64(b)/Thousand)
    default:
        return fmt.Sprintf("%d", b)
    }
...
package format
import (
"testing"
)
func TestHumanNumber(t *testing.T) {
type testCase struct {
input uint64
expected string
}
testCases := []testCase{
{0, "0"},
{1000000, "1M"},
{125000000, "125M"},
{500500000, "500.50M"},
{500550000, "500.55M"},
{1000000000, "1B"},
{2800000000, "2.8B"},
{2850000000, "2.9B"},
{1000000000000, "1000B"},
}
for _, tc := range testCases {
t.Run(tc.expected, func(t *testing.T) {
result := HumanNumber(tc.input)
if result != tc.expected {
t.Errorf("Expected %s, got %s", tc.expected, result)
}
})
}
}
@@ -7,7 +7,7 @@ import (
    "log/slog"
    "os"
    "path/filepath"
-   "strconv"
+   "runtime"
    "strings"
)
@@ -35,22 +35,66 @@ func GetSupportedGFX(libDir string) ([]string, error) {
    return ret, nil
}
-func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
-   // Set the visible devices if not already set
-   // TODO - does sort order matter?
-   devices := []string{}
-   for i := range ids {
-       if _, skipped := skip[i]; skipped {
-           continue
-       }
-       devices = append(devices, strconv.Itoa(i))
-   }
-   val := strings.Join(devices, ",")
-   err := os.Setenv("HIP_VISIBLE_DEVICES", val)
-   if err != nil {
-       slog.Warn(fmt.Sprintf("failed to set env: %s", err))
-   } else {
-       slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
-   }
-}
+func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
+   ids := []string{}
+   for _, info := range gpuInfo {
+       if info.Library != "rocm" {
+           // TODO shouldn't happen if things are wired correctly...
+           slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
+           continue
+       }
+       ids = append(ids, info.ID)
+   }
+   return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
+}
+
+func commonAMDValidateLibDir() (string, error) {
+   // We try to favor system paths first, so that we can wire up the subprocess to use
+   // the system version. Only use our bundled version if the system version doesn't work
+   // This gives users a more recovery options if versions have subtle problems at runtime
+
+   // Prefer explicit HIP env var
+   hipPath := os.Getenv("HIP_PATH")
+   if hipPath != "" {
+       hipLibDir := filepath.Join(hipPath, "bin")
+       if rocmLibUsable(hipLibDir) {
+           slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
+           return hipLibDir, nil
+       }
+   }
+
+   // Scan the LD_LIBRARY_PATH or PATH
+   pathEnv := "LD_LIBRARY_PATH"
+   if runtime.GOOS == "windows" {
+       pathEnv = "PATH"
+   }
+   paths := os.Getenv(pathEnv)
+   for _, path := range filepath.SplitList(paths) {
+       d, err := filepath.Abs(path)
+       if err != nil {
+           continue
+       }
+       if rocmLibUsable(d) {
+           return d, nil
+       }
+   }
+
+   // Well known location(s)
+   for _, path := range RocmStandardLocations {
+       if rocmLibUsable(path) {
+           return path, nil
+       }
+   }
+
+   // Installer payload location if we're running the installed binary
+   exe, err := os.Executable()
+   if err == nil {
+       rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+       if rocmLibUsable(rocmTargetDir) {
+           slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
+           return rocmTargetDir, nil
+       }
+   }
+   return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
+}
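Unlike the removed `amdSetVisibleDevices`, which mutated the server's own environment, the new `rocmGetVisibleDevicesEnv` only returns the variable name and value and leaves it to the caller to decide where to apply them. A rough sketch of how such a pair might be attached to a subprocess environment (not part of this commit; the runner command and device IDs are illustrative):

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// In the real code the pair would come from rocmGetVisibleDevicesEnv(gpuInfo);
	// the hard-coded value here stands in for two discovered ROCm devices.
	key, val := "HIP_VISIBLE_DEVICES", "0,1"

	cmd := exec.Command("./runner") // hypothetical subprocess launched by the server
	cmd.Env = append(os.Environ(), fmt.Sprintf("%s=%s", key, val))
	if err := cmd.Run(); err != nil {
		fmt.Fprintln(os.Stderr, "runner failed:", err)
	}
}
```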
@@ -69,7 +69,7 @@ func NewHipLib() (*HipLib, error) {
func (hl *HipLib) Release() {
    err := windows.FreeLibrary(hl.dll)
    if err != nil {
-       slog.Warn(fmt.Sprintf("failed to unload amdhip64.dll: %s", err))
+       slog.Warn("failed to unload amdhip64.dll", "error", err)
    }
    hl.dll = 0
}
@@ -98,7 +98,7 @@ func (hl *HipLib) HipGetDeviceCount() int {
        return 0
    }
    if status != hipSuccess {
-       slog.Warn(fmt.Sprintf("failed call to hipGetDeviceCount: %d %s", status, err))
+       slog.Warn("failed call to hipGetDeviceCount", "status", status, "error", err)
    }
    return count
}
...