update

0cb78a2f · xuxzh1 · 217903ab · 0cb78a2f · 0cb78a2f · 0cb78a2f
Commit 0cb78a2f authored Dec 06, 2024 by xuxzh1 🎱
20 changed files
--- a/docs/api.md
+++ b/docs/api.md
@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "prompt": "Why is the sky blue?"
 }'
 ```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "response": "The",
  "done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "",
  "done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "stream": false
 }'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "prompt": "What color is the sky at different times of the day? Respond using JSON",
  "format": "json",
  "stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-11-09T21:07:55.186497Z",
  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
  "done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "stream": false,
  "options": {
@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
-    "f16_kv": true,
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
@@ -368,7 +367,7 @@ curl http://localhost:11434/api/generate -d '{
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -390,7 +389,7 @@ If an empty prompt is provided, the model will be loaded into memory.
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3"
+  "model": "llama3.2"
 }'
 ```
@@ -400,13 +399,40 @@ A single JSON object is returned:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-12-18T19:52:07.071755Z",
  "response": "",
  "done": true
 }
 ```
+#### Unload a model
+If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+##### Request
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2",
+  "keep_alive": 0
+}'
+```
+##### Response
+A single JSON object is returned:
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2024-09-12T03:54:03.516566Z",
+  "response": "",
+  "done": true,
+  "done_reason": "unload"
+}
+```
 ## Generate a chat completion
 ```shell
@@ -445,7 +471,7 @@ Send a chat message with a streaming response.
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
@@ -461,7 +487,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -476,7 +502,7 @@ Final response:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 4883583458,
@@ -494,7 +520,7 @@ Final response:
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
@@ -509,7 +535,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "registry.ollama.ai/library/llama3:latest",
+  "model": "llama3.2",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -533,7 +559,7 @@ Send a chat message with a conversation history. You can use this same approach
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
@@ -557,7 +583,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -571,7 +597,7 @@ Final response:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.2",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 8113331500,
@@ -629,7 +655,7 @@ curl http://localhost:11434/api/chat -d '{
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
@@ -647,7 +673,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "registry.ollama.ai/library/llama3:latest",
+  "model": "llama3.2",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -669,7 +695,7 @@ curl http://localhost:11434/api/chat -d '{
 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "llama3.2",
  "messages": [
    {
      "role": "user",
@@ -708,7 +734,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "llama3.1",
+  "model": "llama3.2",
  "created_at": "2024-07-22T20:33:28.123648Z",
  "message": {
    "role": "assistant",
@@ -736,6 +762,64 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
+#### Load a model
+If the messages array is empty, the model will be loaded into memory.
+##### Request
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": []
+}'
+```
+##### Response
+```json
+{
+  "model": "llama3.2",
+  "created_at":"2024-09-12T21:17:29.110811Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "load",
+  "done": true
+}
+```
+#### Unload a model
+If the messages array is empty and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+##### Request
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [],
+  "keep_alive": 0
+}'
+```
+##### Response
+A single JSON object is returned:
+```json
+{
+  "model": "llama3.2",
+  "created_at":"2024-09-12T21:33:17.547535Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "unload",
+  "done": true
+}
+```
 ## Create a Model
 ```shell
@@ -746,10 +830,30 @@ Create a model from a [`Modelfile`](./modelfile.md). It is recommended to set `m
 ### Parameters
- `name`: name of the model to create
+- `model`: name of the model to create
 - `modelfile` (optional): contents of the Modelfile
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 - `path` (optional): path to the Modelfile
+- `quantize` (optional): quantize a non-quantized (e.g. float16) model
+#### Quantization types
+| Type | Recommended |
+| --- | :-: |
+| q2_K | |
+| q3_K_L | |
+| q3_K_M | |
+| q3_K_S | |
+| q4_0 | |
+| q4_1 | |
+| q4_K_M | * |
+| q4_K_S | |
+| q5_0 | |
+| q5_1 | |
+| q5_K_M | |
+| q5_K_S | |
+| q6_K | |
+| q8_0 | * |
 ### Examples
@@ -761,14 +865,14 @@ Create a new model from a `Modelfile`.
 ```shell
 curl http://localhost:11434/api/create -d '{
-  "name": "mario",
+  "model": "mario",
  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```
 ##### Response
-A stream of JSON objects. Notice that the final JSON object shows a `"status": "success"`.
+A stream of JSON objects is returned:
 ```json
 {"status":"reading model metadata"}
@@ -784,13 +888,43 @@ A stream of JSON objects. Notice that the final JSON object shows a `"status": "
 {"status":"success"}
 ```
+#### Quantize a model
+Quantize a non-quantized model.
+##### Request
+```shell
+curl http://localhost:11434/api/create -d '{
+  "model": "llama3.1:quantized",
+  "modelfile": "FROM llama3.1:8b-instruct-fp16",
+  "quantize": "q4_K_M"
+}'
+```
+##### Response
+A stream of JSON objects is returned:
+```
+{"status":"quantizing F16 model to Q4_K_M"}
+{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
+{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
+{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
+{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
 ### Check if a Blob Exists
 ```shell
 HEAD /api/blobs/:digest
 ```
-Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not Ollama.ai.
+Ensures that the file blob used for a FROM or ADAPTER field exists on the server. This is checking your Ollama server and not ollama.com.
 #### Query Parameters
@@ -895,7 +1029,7 @@ Show information about a model including details, modelfile, template, parameter
 ### Parameters
- `name`: name of the model to show
+- `model`: name of the model to show
 - `verbose`: (optional) if set to `true`, returns full data for verbose response fields
 ### Examples
@@ -904,7 +1038,7 @@ Show information about a model including details, modelfile, template, parameter
 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama3"
+  "model": "llama3.2"
 }'
 ```
@@ -965,7 +1099,7 @@ Copy a model. Creates a model with another name from an existing model.
 ```shell
 curl http://localhost:11434/api/copy -d '{
-  "source": "llama3",
+  "source": "llama3.2",
  "destination": "llama3-backup"
 }'
 ```
@@ -984,7 +1118,7 @@ Delete a model and its data.
 ### Parameters
- `name`: model name to delete
+- `model`: model name to delete
 ### Examples
@@ -992,7 +1126,7 @@ Delete a model and its data.
 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-  "name": "llama3:13b"
+  "model": "llama3:13b"
 }'
 ```
@@ -1010,7 +1144,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ### Parameters
- `name`: name of the model to pull
+- `model`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -1020,7 +1154,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama3"
+  "model": "llama3.2"
 }'
 ```
@@ -1082,7 +1216,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 ### Parameters
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `model`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
@@ -1092,7 +1226,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 ```shell
 curl http://localhost:11434/api/push -d '{
-  "name": "mattw/pygmalion:latest"
+  "model": "mattw/pygmalion:latest"
 }'
 ```

--- a/docs/development.md
+++ b/docs/development.md
@@ -2,15 +2,13 @@
 Install required tools:
- cmake version 3.24 or higher
 - go version 1.22 or higher
 - gcc version 11.4.0 or higher
 ### MacOS
-```bash
+[Download Go](https://go.dev/dl/)
-brew install go cmake gcc
-```
 Optionally enable debugging and more verbose logging:
@@ -22,10 +20,10 @@ export CGO_CFLAGS="-g"
 export OLLAMA_DEBUG=1
 ```
-Get the required libraries and build the native LLM code:
+Get the required libraries and build the native LLM code (adjust the job count based on your number of processors for a faster build):
 ```bash
-go generate ./...
+make -j 5
 ```
 Then build ollama:
@@ -40,13 +38,17 @@ Now you can run `ollama`:
 ./ollama
 ```
+#### Xcode 15 warnings
+If you are using Xcode newer than version 14, you may see a warning during `go build` about `ld: warning: ignoring duplicate libraries: '-lobjc'`. This is due to Golang issue https://github.com/golang/go/issues/67799 and can be safely ignored.  You can suppress the warning with `export CGO_LDFLAGS="-Wl,-no_warn_duplicate_libraries"`.
 ### Linux
 #### Linux CUDA (NVIDIA)
 _Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
+Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 development and runtime packages.
 Typically the build scripts will auto-detect CUDA, however, if your Linux distro
@@ -55,10 +57,10 @@ specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
 libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
 a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-Then generate dependencies:
+Then generate dependencies (adjust the job count based on your number of processors for a faster build):
 ```
-go generate ./...
+make -j 5
 ```
 Then build the binary:
@@ -71,7 +73,7 @@ go build .
 _Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `cmake` and `golang`.
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
@@ -80,8 +82,10 @@ install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
 CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
 the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
+Then generate dependencies (adjust the job count based on your number of processors for a faster build):
 ```
-go generate ./...
+make -j 5
 ```
 Then build the binary:
@@ -94,19 +98,13 @@ ROCm requires elevated privileges to access the GPU at runtime. On most distros
 #### Advanced CPU Settings
-By default, running `go generate ./...` will compile a few different variations
+By default, running `make` will compile a few different variations
 of the LLM library based on common CPU families and vector math capabilities,
 including a lowest-common-denominator which should run on almost any 64 bit CPU
 somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. If you would like to build a CPU-based build customized for your
+load. 
-processor, you can set `OLLAMA_CUSTOM_CPU_DEFS` to the llama.cpp flags you would
-like to use. For example, to compile an optimized binary for an Intel i9-9880H,
-you might use:
-```
+Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition.
-OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_F16C=on -DGGML_FMA=on" go generate ./...
-go build .
-```
 #### Containerized Linux Build
@@ -114,37 +112,64 @@ If you have Docker available, you can build linux binaries with `./scripts/build
 ### Windows
-Note: The Windows build for Ollama is still under development.
+The following tools are required as a minimal development environment to build CPU inference support.
-First, install required tools:
- MSVC toolchain - C/C++ and cmake as minimal requirements
 - Go version 1.22 or higher
- MinGW (pick one variant) with GCC.
+  - https://go.dev/dl/
-  - [MinGW-w64](https://www.mingw-w64.org/)
+- Git
+  - https://git-scm.com/download/win
+- clang with gcc compat and Make.  There are multiple options on how to go about installing these tools on Windows.  We have verified the following, but others may work as well:  
  - [MSYS2](https://www.msys2.org/)
- The `ThreadJob` Powershell module: `Install-Module -Name ThreadJob -Scope CurrentUser`
+    - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools
+  - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `C:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.)
+> [!NOTE]  
+> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on Windows.
 Then, build the `ollama` binary:
 ```powershell
 $env:CGO_ENABLED="1"
-go generate ./...
+make -j 8
 go build .
 ```
+#### GPU Support
+The GPU tools require the Microsoft native build tools.  To build either CUDA or ROCm, you must first install MSVC via Visual Studio:
+- Make sure to select `Desktop development with C++` as a Workload during the Visual Studio install
+- You must complete the Visual Studio install and run it once **BEFORE** installing CUDA or ROCm for the tools to properly register
+- Add the location of the **64 bit (x64)** compiler (`cl.exe`) to your `PATH`
+- Note: the default Developer Shell may configure the 32 bit (x86) compiler which will lead to build failures.  Ollama requires a 64 bit toolchain.
 #### Windows CUDA (NVIDIA)
-In addition to the common Windows development tools described above, install CUDA after installing MSVC.
+In addition to the common Windows development tools and MSVC described above:
 - [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html)
 #### Windows ROCm (AMD Radeon)
-In addition to the common Windows development tools described above, install AMDs HIP package after installing MSVC.
+In addition to the common Windows development tools and MSVC described above:
 - [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
-Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
+#### Windows arm64
+The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want.  To ensure you get an arm64 development environment, start a plain PowerShell terminal and run:
+```powershell
+import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll'
+Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation
+```
+You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH`
+Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment.  Ollama requires gcc and mingw32-make to compile, which are not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following:
+```
+pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make
+```
+You will need to ensure your PATH includes go, cmake, gcc, clang, and mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -50,6 +50,9 @@ sudo systemctl restart docker
 docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
 ```
+> [!NOTE]  
+> If you're running on an NVIDIA JetPack system, Ollama can't automatically discover the correct JetPack version. Pass the environment variable `JETSON_JETPACK=5` or `JETSON_JETPACK=6` to the container to select version 5 or 6.
 ### AMD GPU
 To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
@@ -63,7 +66,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
 Now you can run a model:
 ```
-docker exec -it ollama ollama run llama3.1
+docker exec -it ollama ollama run llama3.2
 ```
 ### Try different models

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.2",
  "prompt": "Why is the sky blue?",
  "options": {
    "num_ctx": 4096
@@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.
 ## How do I use Ollama behind a proxy?
-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+> [!NOTE]
+> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
 ### How do I use Ollama behind a proxy in Docker?
@@ -191,6 +194,8 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
+> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
 Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
 ## How can I use Ollama in Visual Studio Code?
@@ -227,14 +232,18 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
 To preload a model using the CLI, use the command:
 ```shell
-ollama run llama3.1 ""
+ollama run llama3.2 ""
 ```
 ## How do I keep a model loaded in memory or make it unload immediately?
-By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you are making numerous requests to the LLM. You may, however, want to free up the memory before the 5 minutes have elapsed or keep the model loaded indefinitely. Use the `keep_alive` parameter with either the `/api/generate` and `/api/chat` API endpoints to control how long the model is left in memory.
+By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you're making numerous requests to the LLM. If you want to immediately unload a model from memory, use the `ollama stop` command:
+```shell
+ollama stop llama3.2
+```
-The `keep_alive` parameter can be set to:
+If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
 * a duration string (such as "10m" or "24h")
 * a number in seconds (such as 3600)
 * any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")
@@ -242,17 +251,17 @@ The `keep_alive` parameter can be set to:
 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": -1}'
 ```
 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": 0}'
 ```
-Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
+Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable accepts the same value types as the `keep_alive` parameter mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
-If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
+The `keep_alive` API parameter with the `/api/generate` and `/api/chat` API endpoints will override the `OLLAMA_KEEP_ALIVE` setting.
 ## How do I manage the maximum number of requests the Ollama server can queue?
@@ -276,4 +285,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
 ## How does Ollama load models on multiple GPUs?
 Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
\ No newline at end of file
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -10,7 +10,7 @@ Check your compute compatibility to see if your card is supported:
 | 9.0                | NVIDIA              | `H100`                                                                                                      |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
-| 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060`         |
+| 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050`   |
 |                    | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2`                          |
 | 8.0                | NVIDIA              | `A100` `A30`                                                                                                |
 | 7.5                | GeForce GTX/RTX     | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060`                                    |
@@ -74,6 +74,10 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
 server.  If you have an unsupported AMD GPU you can experiment using the list of
 supported types below.
+If you have multiple GPUs with different GFX versions, append the numeric device
+number to the environment variable to set them individually.  For example,
+`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0`.
 At this time, the known supported GPU types on linux are the following LLVM Targets.
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
@@ -99,9 +103,10 @@ Reach out on [Discord](https://discord.gg/ollama) or file an
 ### GPU Selection
 If you have multiple AMD GPUs in your system and want to limit Ollama to use a
-subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
+subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs.
 You can see the list of devices with `rocminfo`.  If you want to ignore the GPUs
-and force CPU usage, use an invalid GPU ID (e.g., "-1")
+and force CPU usage, use an invalid GPU ID (e.g., "-1").  When available, use the
+`Uuid` to uniquely identify the device instead of the numeric value.
 ### Container Permission

--- a/docs/images/ollama-keys.png
+++ b/docs/images/ollama-keys.png
--- a/docs/images/signup.png
+++ b/docs/images/signup.png
--- a/docs/import.md
+++ b/docs/import.md
-# Import
+# Importing a model
-GGUF models and select Safetensors models can be imported directly into Ollama.
+## Table of Contents
-## Import GGUF
+  * [Importing a Safetensors adapter](#Importing-a-fine-tuned-adapter-from-Safetensors-weights)
+  * [Importing a Safetensors model](#Importing-a-model-from-Safetensors-weights)
+  * [Importing a GGUF file](#Importing-a-GGUF-based-model-or-adapter)
+  * [Sharing models on ollama.com](#Sharing-your-model-on-ollamacom)
-A binary GGUF file can be imported directly into Ollama through a Modelfile.
+## Importing a fine tuned adapter from Safetensors weights
+First, create a `Modelfile` with a `FROM` command pointing at the base model you used for fine tuning, and an `ADAPTER` command which points to the directory with your Safetensors adapter:
 ```dockerfile
-FROM /path/to/file.gguf
+FROM <base model name>
+ADAPTER /path/to/safetensors/adapter/directory
+```
+Make sure that you use the same base model in the `FROM` command as you used to create the adapter, otherwise you will get erratic results. Most frameworks use different quantization methods, so it's best to use non-quantized (i.e. non-QLoRA) adapters. If your adapter is in the same directory as your `Modelfile`, use `ADAPTER .` to specify the adapter path.
+Now run `ollama create` from the directory where the `Modelfile` was created:
+```bash
+ollama create my-model
 ```
-## Import Safetensors
+Lastly, test the model:
+```bash
+ollama run my-model
+```
-If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
+Ollama supports importing adapters based on several different model architectures including:
- - LlamaForCausalLM
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
- - MistralForCausalLM
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral); and
- - GemmaForCausalLM
+  * Gemma (including Gemma 1 and Gemma 2)
+You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
+  * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
+  * [Unsloth](https://github.com/unslothai/unsloth)
+  * [MLX](https://github.com/ml-explore/mlx)
+## Importing a model from Safetensors weights
+First, create a `Modelfile` with a `FROM` command which points to the directory containing your Safetensors weights:
 ```dockerfile
 FROM /path/to/safetensors/directory
 ```
-For architectures not directly convertable by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
+If you create the Modelfile in the same directory as the weights, you can use the command `FROM .`.
-## Automatic Quantization
+Now run the `ollama create` command from the directory where you created the `Modelfile`:
-> [!NOTE]
+```shell
-> Automatic quantization requires v0.1.35 or higher.
+ollama create my-model
+```
+Lastly, test the model:
+```shell
+ollama run my-model
+```
+Ollama supports importing models for several different architectures including:
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2);
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral);
+  * Gemma (including Gemma 1 and Gemma 2); and
+  * Phi3
+This includes importing foundation models as well as any fine tuned models which have been _fused_ with a foundation model.
+## Importing a GGUF based model or adapter
+If you have a GGUF-based model or adapter, it is possible to import it into Ollama. You can obtain a GGUF model or adapter by:
+  * converting a Safetensors model with the `convert_hf_to_gguf.py` from Llama.cpp; 
+  * converting a Safetensors adapter with the `convert_lora_to_gguf.py` from Llama.cpp; or
+  * downloading a model or adapter from a place such as HuggingFace
+To import a GGUF model, create a `Modelfile` containing:
+```dockerfile
+FROM /path/to/file.gguf
+```
-Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
+For a GGUF adapter, create the `Modelfile` with:
+```dockerfile
+FROM <model name>
+ADAPTER /path/to/file.gguf
+```
+When importing a GGUF adapter, it's important to use the same base model as the base model that the adapter was created with. You can use:
+ * a model from Ollama
+ * a GGUF file
+ * a Safetensors based model 
+Once you have created your `Modelfile`, use the `ollama create` command to build the model.
+```shell
+ollama create my-model
+```
+## Quantizing a Model
+Quantizing a model allows you to run models faster and with less memory consumption but at reduced accuracy. This allows you to run a model on more modest hardware.
+Ollama can quantize FP16 and FP32 based models into different quantization levels using the `-q/--quantize` flag with the `ollama create` command.
+First, create a Modelfile with the FP16 or FP32 based model you wish to quantize.
 ```dockerfile
 FROM /path/to/my/gemma/f16/model
 ```
+Then use `ollama create` to create the quantized model.
 ```shell
-$ ollama create -q Q4_K_M mymodel
+$ ollama create --quantize q4_K_M mymodel
 transferring model data
 quantizing F16 model to Q4_K_M
 creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
@@ -47,42 +132,53 @@ success
 ### Supported Quantizations
- `Q4_0`
+- `q4_0`
- `Q4_1`
+- `q4_1`
- `Q5_0`
+- `q5_0`
- `Q5_1`
+- `q5_1`
- `Q8_0`
+- `q8_0`
 #### K-means Quantizations
- `Q3_K_S`
+- `q3_K_S`
- `Q3_K_M`
+- `q3_K_M`
- `Q3_K_L`
+- `q3_K_L`
- `Q4_K_S`
+- `q4_K_S`
- `Q4_K_M`
+- `q4_K_M`
- `Q5_K_S`
+- `q5_K_S`
- `Q5_K_M`
+- `q5_K_M`
- `Q6_K`
+- `q6_K`
-## Template Detection
-> [!NOTE]
+## Sharing your model on ollama.com
-> Template detection requires v0.1.42 or higher.
-Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
+You can share any model you have created by pushing it to [ollama.com](https://ollama.com) so that other users can try it out.
-```dockerfile
+First, use your browser to go to the [Ollama Sign-Up](https://ollama.com/signup) page. If you already have an account, you can skip this step.
-FROM /path/to/my/gemma/model
+<img src="images/signup.png" alt="Sign-Up" width="40%">
+The `Username` field will be used as part of your model's name (e.g. `jmorganca/mymodel`), so make sure you are comfortable with the username that you have selected.
+Now that you have created an account and are signed-in, go to the [Ollama Keys Settings](https://ollama.com/settings/keys) page.
+Follow the directions on the page to determine where your Ollama Public Key is located.
+<img src="images/ollama-keys.png" alt="Ollama Keys" width="80%">
+Click on the `Add Ollama Public Key` button, and copy and paste the contents of your Ollama Public Key into the text field.
+To push a model to [ollama.com](https://ollama.com), first make sure that it is named correctly with your username. You may have to use the `ollama cp` command to copy
+your model to give it the correct name. Once you're happy with your model's name, use the `ollama push` command to push it to [ollama.com](https://ollama.com).
+```shell
+ollama cp mymodel myuser/mymodel
+ollama push myuser/mymodel
 ```
+Once your model has been pushed, other users can pull and run it by using the command:
 ```shell
-$ ollama create mymodel
+ollama run myuser/mymodel
-transferring model data
-using autodetected template gemma-instruct
-creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
-creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
-writing manifest
-success
 ```
-Defining a template in the Modelfile will disable this feature which may be useful if you want to use a different template than the autodetected one.
--- a/docs/linux.md
+++ b/docs/linux.md
-# Ollama on Linux
+# Linux
 ## Install
-Install Ollama running this one-liner:
+To install Ollama, run the following command:
->
+```shell
-```bash
 curl -fsSL https://ollama.com/install.sh | sh
 ```
-## AMD Radeon GPU support
+## Manual install
-While AMD has contributed the `amdgpu` driver upstream to the official linux
+Download and extract the package:
-kernel source, the version is older and may not support all ROCm features. We
-recommend you install the latest driver from
-https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-GPU.
-## Manual install
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64.tgz
+```
+Start Ollama:
+```shell
+ollama serve
+```
+In another terminal, verify that Ollama is running:
+```shell
+ollama -v
+```
+### AMD GPU install
+If you have an AMD GPU, also download and extract the additional ROCm package:
+```shell
+curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
+sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz
+```
-### Download the `ollama` binary
+### ARM64 install
-Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
+Download and extract the ARM64-specific package:
-```bash
+```shell
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
+curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz
-sudo chmod +x /usr/bin/ollama
+sudo tar -C /usr -xzf ollama-linux-arm64.tgz
 ```
 ### Adding Ollama as a startup service (recommended)
-Create a user for Ollama:
+Create a user and group for Ollama:
-```bash
+```shell
-sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
+sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
+sudo usermod -a -G ollama $(whoami)
 ```
 Create a service file in `/etc/systemd/system/ollama.service`:
@@ -50,6 +69,7 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
+Environment="PATH=$PATH"
 [Install]
 WantedBy=default.target
@@ -57,64 +77,86 @@ WantedBy=default.target
 Then start the service:
-```bash
+```shell
 sudo systemctl daemon-reload
 sudo systemctl enable ollama
 ```
-### Install CUDA drivers (optional – for Nvidia GPUs)
+### Install CUDA drivers (optional)
 [Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
 Verify that the drivers are installed by running the following command, which should print details about your GPU:
-```bash
+```shell
 nvidia-smi
 ```
-### Install ROCm (optional - for Radeon GPUs)
+### Install AMD ROCm drivers (optional)
-[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
-Make sure to install ROCm v6
+[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.
 ### Start Ollama
-Start Ollama using `systemd`:
+Start Ollama and verify it is running:
-```bash
+```shell
 sudo systemctl start ollama
+sudo systemctl status ollama
 ```
-## Update
+> [!NOTE]
+> While AMD has contributed the `amdgpu` driver upstream to the official Linux
+> kernel source, the version is older and may not support all ROCm features. We
+> recommend you install the latest driver from
+> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+> GPU.
+## Customizing
-Update ollama by running the install script again:
+To customize the installation of Ollama, you can edit the systemd service file or the environment variables by running:
-```bash
+```shell
+sudo systemctl edit ollama
+```
+Alternatively, create an override file manually in `/etc/systemd/system/ollama.service.d/override.conf`:
+```ini
+[Service]
+Environment="OLLAMA_DEBUG=1"
+```
+## Updating
+Update Ollama by running the install script again:
+```shell
 curl -fsSL https://ollama.com/install.sh | sh
 ```
-Or by downloading the ollama binary:
+Or by re-downloading Ollama:
-```bash
+```shell
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
+curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
-sudo chmod +x /usr/bin/ollama
+sudo tar -C /usr -xzf ollama-linux-amd64.tgz
 ```
 ## Installing specific versions
-Use `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases). 
+Use the `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find the version numbers in the [releases page](https://github.com/ollama/ollama/releases).
 For example:
-```
+```shell
-curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
 ```
 ## Viewing logs
 To view logs of Ollama running as a startup service, run:
-```bash
+```shell
 journalctl -e -u ollama
 ```
@@ -122,7 +164,7 @@ journalctl -e -u ollama
 Remove the ollama service:
-```bash
+```shell
 sudo systemctl stop ollama
 sudo systemctl disable ollama
 sudo rm /etc/systemd/system/ollama.service
@@ -130,13 +172,13 @@ sudo rm /etc/systemd/system/ollama.service
 Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):
-```bash
+```shell
 sudo rm $(which ollama)
 ```
 Remove the downloaded models and Ollama service user and group:
-```bash
+```shell
 sudo rm -r /usr/share/ollama
 sudo userdel ollama
 sudo groupdel ollama

--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -11,8 +11,9 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
  - [FROM (Required)](#from-required)
-    - [Build from llama3](#build-from-llama3)
+    - [Build from existing model](#build-from-existing-model)
-    - [Build from a bin file](#build-from-a-bin-file)
+    - [Build from a Safetensors model](#build-from-a-safetensors-model)
+    - [Build from a GGUF file](#build-from-a-gguf-file)
  - [PARAMETER](#parameter)
    - [Valid Parameters and Values](#valid-parameters-and-values)
  - [TEMPLATE](#template)
@@ -49,7 +50,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:
 ```modelfile
-FROM llama3
+FROM llama3.2
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -71,10 +72,10 @@ More examples are available in the [examples directory](../examples).
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.
  ```bash
-  > ollama show --modelfile llama3
+  > ollama show --modelfile llama3.2
  # Modelfile generated by "ollama show"
  # To build a new Modelfile based on this one, replace the FROM line with:
-  # FROM llama3:latest
+  # FROM llama3.2:latest
  FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
  TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
@@ -99,22 +100,39 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```
-#### Build from llama3
+#### Build from existing model
 ```modelfile
-FROM llama3
+FROM llama3.2
 ```
 A list of available base models:
 <https://github.com/ollama/ollama#model-library>
+Additional models can be found at:
+<https://ollama.com/library>
-#### Build from a `bin` file
+#### Build from a Safetensors model
 ```modelfile
-FROM ./ollama-model.bin
+FROM <model directory>
 ```
-This bin file location should be specified as an absolute path or relative to the `Modelfile` location.
+The model directory should contain the Safetensors weights for a supported architecture.
+Currently supported model architectures:
+  * Llama (including Llama 2, Llama 3, Llama 3.1, and Llama 3.2)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+  * Phi3
+#### Build from a GGUF file
+```modelfile
+FROM ./ollama-model.gguf
+```
+The GGUF file location should be specified as an absolute path or relative to the `Modelfile` location.
 ### PARAMETER
@@ -174,10 +192,23 @@ SYSTEM """<system message>"""
 ### ADAPTER
-The `ADAPTER` instruction is an optional instruction that specifies any LoRA adapter that should apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
+The `ADAPTER` instruction specifies a fine tuned LoRA adapter that should apply to the base model. The value of the adapter should be an absolute path or a path relative to the Modelfile. The base model should be specified with a `FROM` instruction. If the base model is not the same as the base model that the adapter was tuned from, the behaviour will be erratic.
+#### Safetensor adapter
+```modelfile
+ADAPTER <path to safetensor adapter>
+```
+Currently supported Safetensor adapters:
+  * Llama (including Llama 2, Llama 3, and Llama 3.1)
+  * Mistral (including Mistral 1, Mistral 2, and Mixtral)
+  * Gemma (including Gemma 1 and Gemma 2)
+#### GGUF adapter
 ```modelfile
-ADAPTER ./ollama-lora.bin
+ADAPTER ./ollama-lora.gguf
 ```
 ### LICENSE

--- a/docs/openai.md
+++ b/docs/openai.md
@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
            'content': 'Say this is a test',
        }
    ],
-    model='llama3',
+    model='llama3.2',
 )
 response = client.chat.completions.create(
@@ -37,7 +37,7 @@ response = client.chat.completions.create(
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
-                    "image_url": "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNt
WQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165m
zdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
+                    "image_url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6of
QylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv
915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
                },
            ],
        }
@@ -46,13 +46,13 @@ response = client.chat.completions.create(
 )
 completion = client.completions.create(
-    model="llama3",
+    model="llama3.2",
    prompt="Say this is a test",
 )
 list_completion = client.models.list()
-model = client.models.retrieve("llama3")
+model = client.models.retrieve("llama3.2")
 embeddings = client.embeddings.create(
    model="all-minilm",
@@ -74,7 +74,7 @@ const openai = new OpenAI({
 const chatCompletion = await openai.chat.completions.create({
    messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'llama3',
+    model: 'llama3.2',
 })
 const response = await openai.chat.completions.create({
@@ -86,7 +86,7 @@ const response = await openai.chat.completions.create({
            { type: "text", text: "What's in this image?" },
            {
            type: "image_url",
-            image_url: "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9w
NtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5L
ONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
+            image_url: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8D
BAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJx
CZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
            },
        ],
        },
@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
 })
 const completion = await openai.completions.create({
-    model: "llama3",
+    model: "llama3.2",
    prompt: "Say this is a test.",
 })
 const listCompletion = await openai.models.list()
-const model = await openai.models.retrieve("llama3")
+const model = await openai.models.retrieve("llama3.2")
 const embedding = await openai.embeddings.create({
  model: "all-minilm",
@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
 curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "llama3",
+        "model": "llama3.2",
        "messages": [
            {
                "role": "system",
@@ -142,7 +142,7 @@ curl http://localhost:11434/v1/chat/completions \
          {
            "type": "image_url",
            "image_url": {
-               "url": "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wN
tLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LO
NA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"
+               "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DB
Ar+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxC
ZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"
            }
          }
        ]
@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
 curl http://localhost:11434/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "llama3",
+        "model": "llama3.2",
        "prompt": "Say this is a test"
    }'
 curl http://localhost:11434/v1/models
-curl http://localhost:11434/v1/models/llama3
+curl http://localhost:11434/v1/models/llama3.2
 curl http://localhost:11434/v1/embeddings \
    -H "Content-Type: application/json" \
@@ -182,7 +182,6 @@ curl http://localhost:11434/v1/embeddings \
 - [x] Reproducible outputs
 - [x] Vision
 - [x] Tools (streaming support coming soon)
- [ ] Vision
 - [ ] Logprobs
 #### Supported request fields
@@ -275,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
 Before using a model, pull it locally `ollama pull`:
 ```shell
-ollama pull llama3
+ollama pull llama3.2
 ```
 ### Default model names
@@ -283,7 +282,7 @@ ollama pull llama3
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
 ```
-ollama cp llama3 gpt-3.5-turbo
+ollama cp llama3.2 gpt-3.5-turbo
 ```
 Afterwards, this new model name can be specified in the `model` field:
@@ -301,3 +300,28 @@ curl http://localhost:11434/v1/chat/completions \
        ]
    }'
 ```
+### Setting the context size
+The OpenAI API does not have a way of setting the context size for a model. If you need to change the context size, create a `Modelfile` which looks like:
+```modelfile
+FROM <some model>
+PARAMETER num_ctx <context size>
+```
+Use the `ollama create mymodel` command to create a new model with the updated context size. Call the API with the updated model name:
+```shell
+curl http://localhost:11434/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "mymodel",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello!"
+            }
+        ]
+    }'
+```
--- a/docs/template.md
+++ b/docs/template.md
@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
 To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
 ```dockerfile
-FROM llama3
+FROM llama3.2
 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>
@@ -112,15 +112,9 @@ Keep the following tips and best practices in mind when working with Go template
 ChatML is a popular template format. It can be used for models such as Databricks' DBRX, Intel's Neural Chat, and Microsoft's Orca 2.
 ```gotmpl
-{{- if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
-{{ end }}
 {{- range .Messages }}<|im_start|>{{ .Role }}
 {{ .Content }}<|im_end|>
 {{ end }}<|im_start|>assistant
-{{ else }}
-{{ if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
 ```
 ### Example Tools

--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -91,6 +91,25 @@ If none of those resolve the problem, gather additional information and file an
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
+## AMD GPU Discovery
+On Linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device.  If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU.  Use `ls -lnd /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the **numeric** group IDs on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.  For example, in the output line `crw-rw---- 1 0  44 226,   0 Sep 16 16:55 /dev/dri/card0`, the group ID column is `44`.
+If Ollama initially works on the GPU in a docker container, but then switches to running on CPU after some period of time with errors in the server log reporting GPU discovery failures, this can be resolved by disabling systemd cgroup management in Docker.  Edit `/etc/docker/daemon.json` on the host and add `"exec-opts": ["native.cgroupdriver=cgroupfs"]` to the docker configuration.
+If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
+- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries.  This can help show more detailed error codes that can help troubleshoot problems
+- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
+- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
+## Multiple AMD GPUs
+If you experience gibberish responses when models load across multiple AMD GPUs on Linux, see the following guide.
+- https://rocm.docs.amd.com/projects/radeon/en/latest/docs/install/native_linux/mgpu.html#mgpu-known-issues-and-limitations
 ## Windows Terminal Errors
 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly.  This can result in long sequences such as `←[?25h←[?25l` being displayed, sometimes with the error `The parameter is incorrect`.  To resolve this problem, please update to Win 10 22H1 or newer.
--- a/docs/windows.md
+++ b/docs/windows.md
-# Ollama Windows Preview
+# Ollama Windows
-Welcome to the Ollama Windows preview.
+Welcome to Ollama for Windows.
 No more WSL required!
 Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama Windows Preview, Ollama will run in the background and
+After installing Ollama for Windows, Ollama will run in the background and
 the `ollama` command line is available in `cmd`, `powershell` or your favorite
 terminal application. As usual the Ollama [api](./api.md) will be served on
 `http://localhost:11434`.
-As this is a preview release, you should expect a few bugs here and there.  If
-you run into a problem you can reach out on
-[Discord](https://discord.gg/ollama), or file an
-[issue](https://github.com/ollama/ollama/issues).
-Logs will often be helpful in diagnosing the problem (see
-[Troubleshooting](#troubleshooting) below)
 ## System Requirements
 * Windows 10 22H2 or newer, Home or Pro
@@ -25,19 +18,41 @@ Logs will often be helpful in diagnosing the problem (see
 Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
+## Filesystem Requirements
+The Ollama install does not require Administrator privileges, and installs in your home directory by default.  You'll need at least 4GB of space for the binary install.  Once you've installed Ollama, you'll need additional space for storing the large language models, which can be tens to hundreds of GB in size.  If your home directory doesn't have enough space, you can change where the binaries are installed, and where the models are stored.
+### Changing Install Location
+To install the Ollama application in a location different than your home directory, start the installer with the following flag:
+```powershell
+OllamaSetup.exe /DIR="d:\some\location"
+```
+### Changing Model Location
+To change where Ollama stores the downloaded models instead of using your home directory, set the environment variable `OLLAMA_MODELS` in your user account.
+1. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.
+2. Click on _Edit environment variables for your account_.
+3. Edit or create a variable named `OLLAMA_MODELS` for your user account, and set it to the directory where you want the models stored.
+4. Click OK/Apply to save.
+If Ollama is already running, quit the tray application and relaunch it from the Start menu, or from a new terminal started after you saved the environment variables.
 ## API Access
 Here's a quick example showing API access from `powershell`
 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```
 ## Troubleshooting
-While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
-a "view logs" menu item to the app, and increases logging for the GUI app and
-server.
 Ollama on Windows stores files in a few different locations.  You can view them in
 the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
@@ -48,6 +63,13 @@ the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %HOMEPATH%\.ollama` contains models and configuration
 - `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+## Uninstall
+The Ollama Windows installer registers an Uninstaller application.  Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.
+> [!NOTE]
+> If you have [changed the OLLAMA_MODELS location](#changing-model-location), the installer will not remove your downloaded models.
 ## Standalone CLI

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -30,9 +30,7 @@ func Host() *url.URL {
 		defaultPort = "443"
 	}
-	// trim trailing slashes
+	hostport, path, _ := strings.Cut(hostport, "/")
-	hostport = strings.TrimRight(hostport, "/")
 	host, port, err := net.SplitHostPort(hostport)
 	if err != nil {
 		host, port = "127.0.0.1", defaultPort
@@ -45,15 +43,13 @@ func Host() *url.URL {
 	if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
 		slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
-		return &url.URL{
+		port = defaultPort
-			Scheme: scheme,
-			Host:   net.JoinHostPort(host, defaultPort),
-		}
 	}
 	return &url.URL{
 		Scheme: scheme,
 		Host:   net.JoinHostPort(host, port),
+		Path:   path,
 	}
 }
@@ -76,6 +72,7 @@ func Origins() (origins []string) {
 		"app://*",
 		"file://*",
 		"tauri://*",
+		"vscode-webview://*",
 	)
 	return origins
@@ -116,6 +113,26 @@ func KeepAlive() (keepAlive time.Duration) {
 	return keepAlive
 }
+// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
+// Zero or negative values are treated as infinite.
+// Default is 5 minutes.
+func LoadTimeout() (loadTimeout time.Duration) {
+	loadTimeout = 5 * time.Minute
+	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
+		if d, err := time.ParseDuration(s); err == nil {
+			loadTimeout = d
+		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
+			loadTimeout = time.Duration(n) * time.Second
+		}
+	}
+	if loadTimeout <= 0 {
+		return time.Duration(math.MaxInt64)
+	}
+	return loadTimeout
+}
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {
@@ -144,6 +161,8 @@ var (
 	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
 	// IntelGPU enables experimental Intel GPU detection.
 	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+	// MultiUserCache optimizes prompt caching for multi-user scenarios
+	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
 )
 func String(s string) func() string {
@@ -163,53 +182,6 @@ var (
 	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 )
-func RunnersDir() (p string) {
-	if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
-		return p
-	}
-	if runtime.GOOS != "windows" {
-		return
-	}
-	defer func() {
-		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-		}
-	}()
-	// On Windows we do not carry the payloads inside the main executable
-	exe, err := os.Executable()
-	if err != nil {
-		return
-	}
-	cwd, err := os.Getwd()
-	if err != nil {
-		return
-	}
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-		)
-	}
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "ollama_runners")
-		if _, err := os.Stat(candidate); err == nil {
-			p = candidate
-			break
-		}
-	}
-	return p
-}
 func Uint(key string, defaultValue uint) func() uint {
 	return func() uint {
 		if s := Var(key); s != "" {
@@ -235,6 +207,23 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
+// Uint64 returns a getter for the environment variable key parsed as a base-10 unsigned 64-bit integer.
+// The getter returns defaultValue when the variable is unset, empty, or cannot be parsed; a parse failure is also logged as a warning.
+func Uint64(key string, defaultValue uint64) func() uint64 {
+	return func() uint64 { // the environment is read on every call of the getter, not when Uint64 is called
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+		return defaultValue // unset, empty, or invalid value
+	}
+}
+// GpuOverhead returns the amount of VRAM, in bytes, to set aside per GPU. It is configured via OLLAMA_GPU_OVERHEAD and defaults to 0.
+var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 type EnvVar struct {
 	Name        string
 	Value       any
@@ -245,9 +234,11 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
+		"OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
@@ -255,18 +246,32 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
-		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
+		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
+		// Informational
+		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
+		"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
+		"NO_PROXY":    {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
 	}
+	if runtime.GOOS != "windows" {
+		// Windows environment variables are case-insensitive so there's no need to duplicate them
+		ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
+		ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
+		ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
+	}
 	if runtime.GOOS != "darwin" {
 		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
-		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
+		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
-		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
+		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
-		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
+		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
 		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -282,3 +287,12 @@ func Values() map[string]string {
 // Var returns the value of the environment variable key with surrounding whitespace removed and then any leading or trailing single or double quote characters stripped.
 func Var(key string) string {
 	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
 }
+// LibRelativeToExe returns the path from the executable's directory to the top directory: "." on
+// Windows, where the binary is kept at the top directory, and ".." elsewhere, where it sits in a "bin" directory.
+func LibRelativeToExe() string {
+	if runtime.GOOS == "windows" {
+		return "."
+	}
+	return ".."
+}
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -13,34 +13,35 @@ func TestHost(t *testing.T) {
 		value  string
 		expect string
 	}{
-		"empty":               {"", "127.0.0.1:11434"},
+		"empty":               {"", "http://127.0.0.1:11434"},
-		"only address":        {"1.2.3.4", "1.2.3.4:11434"},
+		"only address":        {"1.2.3.4", "http://1.2.3.4:11434"},
-		"only port":           {":1234", ":1234"},
+		"only port":           {":1234", "http://:1234"},
-		"address and port":    {"1.2.3.4:1234", "1.2.3.4:1234"},
+		"address and port":    {"1.2.3.4:1234", "http://1.2.3.4:1234"},
-		"hostname":            {"example.com", "example.com:11434"},
+		"hostname":            {"example.com", "http://example.com:11434"},
-		"hostname and port":   {"example.com:1234", "example.com:1234"},
+		"hostname and port":   {"example.com:1234", "http://example.com:1234"},
-		"zero port":           {":0", ":0"},
+		"zero port":           {":0", "http://:0"},
-		"too large port":      {":66000", ":11434"},
+		"too large port":      {":66000", "http://:11434"},
-		"too small port":      {":-1", ":11434"},
+		"too small port":      {":-1", "http://:11434"},
-		"ipv6 localhost":      {"[::1]", "[::1]:11434"},
+		"ipv6 localhost":      {"[::1]", "http://[::1]:11434"},
-		"ipv6 world open":     {"[::]", "[::]:11434"},
+		"ipv6 world open":     {"[::]", "http://[::]:11434"},
-		"ipv6 no brackets":    {"::1", "[::1]:11434"},
+		"ipv6 no brackets":    {"::1", "http://[::1]:11434"},
-		"ipv6 + port":         {"[::1]:1337", "[::1]:1337"},
+		"ipv6 + port":         {"[::1]:1337", "http://[::1]:1337"},
-		"extra space":         {" 1.2.3.4 ", "1.2.3.4:11434"},
+		"extra space":         {" 1.2.3.4 ", "http://1.2.3.4:11434"},
-		"extra quotes":        {"\"1.2.3.4\"", "1.2.3.4:11434"},
+		"extra quotes":        {"\"1.2.3.4\"", "http://1.2.3.4:11434"},
-		"extra space+quotes":  {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
+		"extra space+quotes":  {" \" 1.2.3.4 \" ", "http://1.2.3.4:11434"},
-		"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
+		"extra single quotes": {"'1.2.3.4'", "http://1.2.3.4:11434"},
-		"http":                {"http://1.2.3.4", "1.2.3.4:80"},
+		"http":                {"http://1.2.3.4", "http://1.2.3.4:80"},
-		"http port":           {"http://1.2.3.4:4321", "1.2.3.4:4321"},
+		"http port":           {"http://1.2.3.4:4321", "http://1.2.3.4:4321"},
-		"https":               {"https://1.2.3.4", "1.2.3.4:443"},
+		"https":               {"https://1.2.3.4", "https://1.2.3.4:443"},
-		"https port":          {"https://1.2.3.4:4321", "1.2.3.4:4321"},
+		"https port":          {"https://1.2.3.4:4321", "https://1.2.3.4:4321"},
+		"proxy path":          {"https://example.com/ollama", "https://example.com:443/ollama"},
 	}
 	for name, tt := range cases {
 		t.Run(name, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", tt.value)
-			if host := Host(); host.Host != tt.expect {
+			if host := Host(); host.String() != tt.expect {
-				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
+				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.String())
 			}
 		})
 	}
@@ -67,6 +68,7 @@ func TestOrigins(t *testing.T) {
 			"app://*",
 			"file://*",
 			"tauri://*",
+			"vscode-webview://*",
 		}},
 		{"http://10.0.0.1", []string{
 			"http://10.0.0.1",
@@ -85,6 +87,7 @@ func TestOrigins(t *testing.T) {
 			"app://*",
 			"file://*",
 			"tauri://*",
+			"vscode-webview://*",
 		}},
 		{"http://172.16.0.1,https://192.168.0.1", []string{
 			"http://172.16.0.1",
@@ -104,6 +107,7 @@ func TestOrigins(t *testing.T) {
 			"app://*",
 			"file://*",
 			"tauri://*",
+			"vscode-webview://*",
 		}},
 		{"http://totally.safe,http://definitely.legit", []string{
 			"http://totally.safe",
@@ -123,6 +127,7 @@ func TestOrigins(t *testing.T) {
 			"app://*",
 			"file://*",
 			"tauri://*",
+			"vscode-webview://*",
 		}},
 	}
 	for _, tt := range cases {
@@ -214,6 +219,40 @@ func TestKeepAlive(t *testing.T) {
 	}
 }
+func TestLoadTimeout(t *testing.T) {
+	defaultTimeout := 5 * time.Minute
+	cases := map[string]time.Duration{ // OLLAMA_LOAD_TIMEOUT value -> expected LoadTimeout() result
+		"":       defaultTimeout,
+		"1s":     time.Second,
+		"1m":     time.Minute,
+		"1h":     time.Hour,
+		"5m0s":   defaultTimeout,
+		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
+		"0":      time.Duration(math.MaxInt64), // zero and negative values mean "no timeout"
+		"60":     60 * time.Second,             // bare integers are interpreted as seconds
+		"120":    2 * time.Minute,
+		"3600":   time.Hour,
+		"-0":     time.Duration(math.MaxInt64),
+		"-1":     time.Duration(math.MaxInt64),
+		"-1m":    time.Duration(math.MaxInt64),
+		// invalid values fall back to the default timeout
+		" ":   defaultTimeout,
+		"???": defaultTimeout,
+		"1d":  defaultTimeout,
+		"1y":  defaultTimeout,
+		"1w":  defaultTimeout,
+	}
+	for tt, expect := range cases {
+		t.Run(tt, func(t *testing.T) {
+			t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
+			if actual := LoadTimeout(); actual != expect {
+				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
+			}
+		})
+	}
+}
 func TestVar(t *testing.T) {
 	cases := map[string]string{
 		"value":       "value",

--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -35,7 +35,7 @@ func main() {
 	ctx := context.Background()
 	req := &api.ChatRequest{
-		Model:    "llama3.1",
+		Model:    "llama3.2",
 		Messages: messages,
 	}

--- a/examples/langchain-python-rag-document/README.md
+++ b/examples/langchain-python-rag-document/README.md
@@ -4,10 +4,10 @@ This example provides an interface for asking questions to a PDF document.
 ## Setup
-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3.2` model installed:
 ```
-ollama pull llama3.1
+ollama pull llama3.2
 ```
 2. Install the Python Requirements.

--- a/examples/langchain-python-rag-document/main.py
+++ b/examples/langchain-python-rag-document/main.py
@@ -51,7 +51,7 @@ while True:
        template=template,
    )
-    llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3.2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),

--- a/examples/langchain-python-rag-privategpt/requirements.txt
+++ b/examples/langchain-python-rag-privategpt/requirements.txt
 langchain==0.0.274
 gpt4all==1.0.8
-chromadb==0.4.7
+chromadb==0.5.0
 llama-cpp-python==0.1.81
 urllib3==2.0.4
 PyMuPDF==1.23.5
@@ -12,4 +12,4 @@ pandoc==2.3
 pypandoc==1.11
 tqdm==4.66.1
 sentence_transformers==2.2.2
 numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability
\ No newline at end of file