Commit 7004fab5 authored by xuxzh1's avatar xuxzh1 🎱
Browse files

update

parent 42dd5af5
...@@ -21,7 +21,7 @@ docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev ...@@ -21,7 +21,7 @@ docker run -i -t -d --device=/dev/kfd --privileged --network=host --device=/dev
1、下载源码 1、下载源码
```bash ```bash
git clone -b 0.4.7 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git --depth=1 git clone -b 0.5.1 http://developer.sourcefind.cn/codes/OpenDAS/ollama.git --depth=1
cd ollama cd ollama
``` ```
...@@ -41,7 +41,7 @@ go env -w GOPROXY=https://goproxy.cn,direct ...@@ -41,7 +41,7 @@ go env -w GOPROXY=https://goproxy.cn,direct
##### 运行编译 ##### 运行编译
```bash ```bash
export LIBRARY_PATH=/opt/dtk/lib:$LIBRARY_PATH
make -j 16 make -j 16
go build . go build .
``` ```
......
...@@ -67,7 +67,7 @@ type GenerateRequest struct { ...@@ -67,7 +67,7 @@ type GenerateRequest struct {
Raw bool `json:"raw,omitempty"` Raw bool `json:"raw,omitempty"`
// Format specifies the format to return a response in. // Format specifies the format to return a response in.
Format string `json:"format"` Format json.RawMessage `json:"format,omitempty"`
// KeepAlive controls how long the model will stay loaded in memory following // KeepAlive controls how long the model will stay loaded in memory following
// this request. // this request.
...@@ -94,7 +94,7 @@ type ChatRequest struct { ...@@ -94,7 +94,7 @@ type ChatRequest struct {
Stream *bool `json:"stream,omitempty"` Stream *bool `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json"). // Format is the format to return the response in (e.g. "json").
Format string `json:"format"` Format json.RawMessage `json:"format,omitempty"`
// KeepAlive controls how long the model will stay loaded into memory // KeepAlive controls how long the model will stay loaded into memory
// following the request. // following the request.
......
...@@ -8,6 +8,7 @@ import ( ...@@ -8,6 +8,7 @@ import (
"crypto/ed25519" "crypto/ed25519"
"crypto/rand" "crypto/rand"
"crypto/sha256" "crypto/sha256"
"encoding/json"
"encoding/pem" "encoding/pem"
"errors" "errors"
"fmt" "fmt"
...@@ -1035,10 +1036,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) { ...@@ -1035,10 +1036,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
return nil return nil
} }
if opts.Format == "json" {
opts.Format = `"` + opts.Format + `"`
}
req := &api.ChatRequest{ req := &api.ChatRequest{
Model: opts.Model, Model: opts.Model,
Messages: opts.Messages, Messages: opts.Messages,
Format: opts.Format, Format: json.RawMessage(opts.Format),
Options: opts.Options, Options: opts.Options,
} }
...@@ -1120,12 +1125,16 @@ func generate(cmd *cobra.Command, opts runOptions) error { ...@@ -1120,12 +1125,16 @@ func generate(cmd *cobra.Command, opts runOptions) error {
} }
} }
if opts.Format == "json" {
opts.Format = `"` + opts.Format + `"`
}
request := api.GenerateRequest{ request := api.GenerateRequest{
Model: opts.Model, Model: opts.Model,
Prompt: opts.Prompt, Prompt: opts.Prompt,
Context: generateContext, Context: generateContext,
Images: opts.Images, Images: opts.Images,
Format: opts.Format, Format: json.RawMessage(opts.Format),
System: opts.System, System: opts.System,
Options: opts.Options, Options: opts.Options,
KeepAlive: opts.KeepAlive, KeepAlive: opts.KeepAlive,
...@@ -1445,6 +1454,7 @@ func NewCLI() *cobra.Command { ...@@ -1445,6 +1454,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_SCHED_SPREAD"], envVars["OLLAMA_SCHED_SPREAD"],
envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_TMPDIR"],
envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_KV_CACHE_TYPE"],
envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"], envVars["OLLAMA_LOAD_TIMEOUT"],
......
...@@ -8,7 +8,6 @@ import ( ...@@ -8,7 +8,6 @@ import (
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
"path/filepath"
"strings" "strings"
"testing" "testing"
...@@ -180,18 +179,14 @@ Weigh anchor! ...@@ -180,18 +179,14 @@ Weigh anchor!
t.Run("license", func(t *testing.T) { t.Run("license", func(t *testing.T) {
var b bytes.Buffer var b bytes.Buffer
license, err := os.ReadFile(filepath.Join("..", "LICENSE")) license := "MIT License\nCopyright (c) Ollama\n"
if err != nil {
t.Fatal(err)
}
if err := showInfo(&api.ShowResponse{ if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{ Details: api.ModelDetails{
Family: "test", Family: "test",
ParameterSize: "7B", ParameterSize: "7B",
QuantizationLevel: "FP16", QuantizationLevel: "FP16",
}, },
License: string(license), License: license,
}, &b); err != nil { }, &b); err != nil {
t.Fatal(err) t.Fatal(err)
} }
......
...@@ -10,6 +10,7 @@ import ( ...@@ -10,6 +10,7 @@ import (
"log/slog" "log/slog"
"os" "os"
"slices" "slices"
"strings"
"golang.org/x/exp/maps" "golang.org/x/exp/maps"
) )
...@@ -60,7 +61,25 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) ...@@ -60,7 +61,25 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
addedTokens[t.Content] = t addedTokens[t.Content] = t
} }
t.Merges = tt.Model.Merges if len(tt.Model.Merges) == 0 {
// noop; merges is empty
} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
// noop; merges is []string
} else if merges, err := func() ([][]string, error) {
var merges [][]string
if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
return nil, err
}
return merges, nil
}(); err == nil {
t.Merges = make([]string, len(merges))
for i := range merges {
t.Merges[i] = strings.Join(merges[i], " ")
}
} else {
return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
}
sha256sum := sha256.New() sha256sum := sha256.New()
for _, pt := range tt.PreTokenizer.PreTokenizers { for _, pt := range tt.PreTokenizer.PreTokenizers {
...@@ -158,7 +177,7 @@ type tokenizer struct { ...@@ -158,7 +177,7 @@ type tokenizer struct {
Model struct { Model struct {
Type string `json:"type"` Type string `json:"type"`
Vocab map[string]int `json:"vocab"` Vocab map[string]int `json:"vocab"`
Merges []string `json:"merges"` Merges json.RawMessage `json:"merges"`
} `json:"model"` } `json:"model"`
PreTokenizer struct { PreTokenizer struct {
......
...@@ -191,6 +191,62 @@ func TestParseTokenizer(t *testing.T) { ...@@ -191,6 +191,62 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default", Pre: "default",
}, },
}, },
{
name: "list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"model": {
"merges": [
"a b",
"c d",
"e f"
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
},
Merges: []string{
"a b",
"c d",
"e f",
},
Pre: "default",
},
},
{
name: "list list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"model": {
"merges": [
[
"a", "b"
],
[
"c", "d"
],
[
"e", "f"
]
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
},
Merges: []string{
"a b",
"c d",
"e f",
},
Pre: "default",
},
},
} }
for _, tt := range cases { for _, tt := range cases {
......
...@@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int { ...@@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int {
return coreCount return coreCount
} }
// FlashAttentionSupported reports whether every GPU in the list supports
// flash attention. Supported libraries are metal, rocm, and cuda with
// driver major version 7 or newer; the first unsupported GPU short-circuits
// the check. An empty list trivially returns true.
func (l GpuInfoList) FlashAttentionSupported() bool {
	for _, gpu := range l {
		supportsFA := gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"

		if !supportsFA {
			return false
		}
	}
	return true
}
\ No newline at end of file
...@@ -45,14 +45,18 @@ Generate a response for a given prompt with a provided model. This is a streamin ...@@ -45,14 +45,18 @@ Generate a response for a given prompt with a provided model. This is a streamin
Advanced parameters (optional): Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json` - `format`: the format to return a response in. Format can be `json` or a JSON schema
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `system`: system message to (overrides what is defined in the `Modelfile`) - `system`: system message to (overrides what is defined in the `Modelfile`)
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`) - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API - `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
- `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
#### Structured outputs
Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
#### JSON mode #### JSON mode
...@@ -185,6 +189,52 @@ curl http://localhost:11434/api/generate -d '{ ...@@ -185,6 +189,52 @@ curl http://localhost:11434/api/generate -d '{
} }
``` ```
#### Request (Structured outputs)
##### Request
```shell
curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
"model": "llama3.1:8b",
"prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
"stream": false,
"format": {
"type": "object",
"properties": {
"age": {
"type": "integer"
},
"available": {
"type": "boolean"
}
},
"required": [
"age",
"available"
]
}
}'
```
##### Response
```json
{
"model": "llama3.1:8b",
"created_at": "2024-12-06T00:48:09.983619Z",
"response": "{\n \"age\": 22,\n \"available\": true\n}",
"done": true,
"done_reason": "stop",
"context": [1, 2, 3],
"total_duration": 1075509083,
"load_duration": 567678166,
"prompt_eval_count": 28,
"prompt_eval_duration": 236000000,
"eval_count": 16,
"eval_duration": 269000000
}
```
#### Request (JSON mode) #### Request (JSON mode)
> [!IMPORTANT] > [!IMPORTANT]
...@@ -456,11 +506,15 @@ The `message` object has the following fields: ...@@ -456,11 +506,15 @@ The `message` object has the following fields:
Advanced parameters (optional): Advanced parameters (optional):
- `format`: the format to return a response in. Currently the only accepted value is `json` - `format`: the format to return a response in. Format can be `json` or a JSON schema.
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
### Structured outputs
Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
### Examples ### Examples
#### Chat Request (Streaming) #### Chat Request (Streaming)
...@@ -551,6 +605,54 @@ curl http://localhost:11434/api/chat -d '{ ...@@ -551,6 +605,54 @@ curl http://localhost:11434/api/chat -d '{
} }
``` ```
#### Chat request (Structured outputs)
##### Request
```shell
curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
"model": "llama3.1",
"messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
"stream": false,
"format": {
"type": "object",
"properties": {
"age": {
"type": "integer"
},
"available": {
"type": "boolean"
}
},
"required": [
"age",
"available"
]
},
"options": {
"temperature": 0
}
}'
```
##### Response
```json
{
"model": "llama3.1",
"created_at": "2024-12-06T00:46:58.265747Z",
"message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
"done_reason": "stop",
"done": true,
"total_duration": 2254970291,
"load_duration": 574751416,
"prompt_eval_count": 34,
"prompt_eval_duration": 1502000000,
"eval_count": 12,
"eval_duration": 175000000
}
```
#### Chat request (With History) #### Chat request (With History)
Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting. Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
......
...@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e ...@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx: Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
``` ```nginx
server { server {
listen 80; listen 80;
server_name example.com; # Replace with your domain or IP server_name example.com; # Replace with your domain or IP
...@@ -285,4 +285,28 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit ...@@ -285,4 +285,28 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
## How does Ollama load models on multiple GPUs? ## How does Ollama load models on multiple GPUs?
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
## How can I enable Flash Attention?
Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
## How can I set the quantization type for the K/V cache?
The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
To use quantized K/V cache with Ollama you can set the following environment variable:
- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
The currently available K/V cache quantization types are:
- `f16` - high precision and memory usage (default).
- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16).
- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes.
How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
You may need to experiment with different quantization types to find the best balance between memory usage and quality.
...@@ -63,7 +63,7 @@ SYSTEM You are Mario from super mario bros, acting as an assistant. ...@@ -63,7 +63,7 @@ SYSTEM You are Mario from super mario bros, acting as an assistant.
To use this: To use this:
1. Save it as a file (e.g. `Modelfile`) 1. Save it as a file (e.g. `Modelfile`)
2. `ollama create choose-a-model-name -f <location of the file e.g. ./Modelfile>'` 2. `ollama create choose-a-model-name -f <location of the file e.g. ./Modelfile>`
3. `ollama run choose-a-model-name` 3. `ollama run choose-a-model-name`
4. Start using the model! 4. Start using the model!
...@@ -156,7 +156,7 @@ PARAMETER <parameter> <parametervalue> ...@@ -156,7 +156,7 @@ PARAMETER <parameter> <parametervalue>
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 | | seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" | | stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 | | tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 | | num_predict | Maximum number of tokens to predict when generating text. (Default: -1, infinite generation) | int | num_predict 42 |
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 | | top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 | | top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
| min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 | | min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 |
......
...@@ -59,6 +59,40 @@ embeddings = client.embeddings.create( ...@@ -59,6 +59,40 @@ embeddings = client.embeddings.create(
input=["why is the sky blue?", "why is the grass green?"], input=["why is the sky blue?", "why is the grass green?"],
) )
``` ```
#### Structured outputs
```py
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
# Define the schema for the response
class FriendInfo(BaseModel):
name: str
age: int
is_available: bool
class FriendList(BaseModel):
friends: list[FriendInfo]
try:
completion = client.beta.chat.completions.parse(
temperature=0,
model="llama3.1:8b",
messages=[
{"role": "user", "content": "I have two friends. The first is Ollama 22 years old busy saving the world, and the second is Alonso 23 years old and wants to hang out. Return a list of friends in JSON format"}
],
response_format=FriendList,
)
friends_response = completion.choices[0].message
if friends_response.parsed:
print(friends_response.parsed)
elif friends_response.refusal:
print(friends_response.refusal)
except Exception as e:
print(f"Error: {e}")
```
### OpenAI JavaScript library ### OpenAI JavaScript library
......
...@@ -153,6 +153,8 @@ var ( ...@@ -153,6 +153,8 @@ var (
Debug = Bool("OLLAMA_DEBUG") Debug = Bool("OLLAMA_DEBUG")
// FlashAttention enables the experimental flash attention feature. // FlashAttention enables the experimental flash attention feature.
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION") FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
// KvCacheType is the quantization type for the K/V cache.
KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
// NoHistory disables readline history. // NoHistory disables readline history.
NoHistory = Bool("OLLAMA_NOHISTORY") NoHistory = Bool("OLLAMA_NOHISTORY")
// NoPrune disables pruning of model blobs on startup. // NoPrune disables pruning of model blobs on startup.
...@@ -234,6 +236,7 @@ func AsMap() map[string]EnvVar { ...@@ -234,6 +236,7 @@ func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{ ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"}, "OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
......
# Examples # Examples
This directory contains different examples of using Ollama. This directory contains different examples of using Ollama.
## Python examples
Ollama Python examples at [ollama-python/examples](https://github.com/ollama/ollama-python/tree/main/examples)
## JavaScript examples
Ollama JavaScript examples at [ollama-js/examples](https://github.com/ollama/ollama-js/tree/main/examples)
## OpenAI compatibility examples
Ollama OpenAI compatibility examples at [ollama/examples/openai](../docs/openai.md)
...@@ -85,9 +85,12 @@ COMPILER inline get_compiler() { ...@@ -85,9 +85,12 @@ COMPILER inline get_compiler() {
import "C" import "C"
import ( import (
"bytes"
_ "embed" _ "embed"
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"log/slog"
"runtime" "runtime"
"runtime/cgo" "runtime/cgo"
"slices" "slices"
...@@ -140,7 +143,7 @@ type ContextParams struct { ...@@ -140,7 +143,7 @@ type ContextParams struct {
c C.struct_llama_context_params c C.struct_llama_context_params
} }
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool) ContextParams { func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
params := C.llama_context_default_params() params := C.llama_context_default_params()
params.n_ctx = C.uint(numCtx) params.n_ctx = C.uint(numCtx)
params.n_batch = C.uint(batchSize) params.n_batch = C.uint(batchSize)
...@@ -149,9 +152,28 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla ...@@ -149,9 +152,28 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
params.n_threads_batch = params.n_threads params.n_threads_batch = params.n_threads
params.embeddings = C.bool(true) params.embeddings = C.bool(true)
params.flash_attn = C.bool(flashAttention) params.flash_attn = C.bool(flashAttention)
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
return ContextParams{c: params} return ContextParams{c: params}
} }
// kvCacheTypeFromStr converts a string cache type to the corresponding
// GGML type value. The empty string and any unrecognized value fall
// through to f16, the default K/V cache type, so the redundant empty
// check has been folded into the switch.
func kvCacheTypeFromStr(s string) C.enum_ggml_type {
	switch s {
	case "q8_0":
		return C.GGML_TYPE_Q8_0
	case "q4_0":
		return C.GGML_TYPE_Q4_0
	default:
		// includes "" (unset) and unknown values
		return C.GGML_TYPE_F16
	}
}
type Context struct { type Context struct {
c *C.struct_llama_context c *C.struct_llama_context
numThreads int numThreads int
...@@ -680,3 +702,33 @@ func (s *SamplingContext) Sample(llamaContext *Context, idx int) int { ...@@ -680,3 +702,33 @@ func (s *SamplingContext) Sample(llamaContext *Context, idx int) int {
func (s *SamplingContext) Accept(id int, applyGrammar bool) { func (s *SamplingContext) Accept(id int, applyGrammar bool) {
C.gpt_sampler_caccept(s.c, C.llama_token(id), C.bool(applyGrammar)) C.gpt_sampler_caccept(s.c, C.llama_token(id), C.bool(applyGrammar))
} }
// JsonSchema is the subset of JSON Schema accepted in the API `format`
// field. It is re-encoded to JSON and handed to the C side to be turned
// into a grammar (see AsGrammar). All fields are optional and omitted
// from the encoded output when empty.
type JsonSchema struct {
	Defs       map[string]any `json:"$defs,omitempty"`
	Properties map[string]any `json:"properties,omitempty"`
	Required   []string       `json:"required,omitempty"`
	Title      string         `json:"title,omitempty"`
	Type       string         `json:"type,omitempty"`
}
// AsGrammar encodes the schema as JSON and converts it to a grammar
// string via the C helper schema_to_grammar. It returns the empty
// string when the schema cannot be encoded or converted (the C side
// returns length 0 on failure).
func (js JsonSchema) AsGrammar() string {
	var b bytes.Buffer
	if err := json.NewEncoder(&b).Encode(js); err != nil {
		return ""
	}

	cStr := C.CString(b.String())
	defer C.free(unsafe.Pointer(cStr))

	// Allocate buffer for grammar output with reasonable size
	// NOTE(review): grammars longer than maxLen are truncated by the C
	// side — confirm 32KB is sufficient for the largest expected schema.
	const maxLen = 32768 // 32KB
	buf := make([]byte, maxLen)

	// Call C function to convert schema to grammar
	length := C.schema_to_grammar(cStr, (*C.char)(unsafe.Pointer(&buf[0])), C.size_t(maxLen))
	if length == 0 {
		slog.Warn("unable to convert schema to grammar")
	}

	return string(buf[:length])
}
package llama package llama
import (
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
// TestJsonSchema verifies that JsonSchema.AsGrammar produces the expected
// grammar for a minimal object schema and returns an empty string for
// schemas that cannot be converted.
func TestJsonSchema(t *testing.T) {
	testCases := []struct {
		name     string
		schema   JsonSchema
		expected string
	}{
		{
			name: "empty schema",
			schema: JsonSchema{
				Type: "object",
			},
			expected: `array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null`,
		},
		{
			name: "invalid schema with circular reference",
			schema: JsonSchema{
				Type: "object",
				Properties: map[string]any{
					"self": map[string]any{
						"$ref": "#", // Self reference
					},
				},
			},
			expected: "", // Should return empty string for invalid schema
		},
		{
			name: "schema with invalid type",
			schema: JsonSchema{
				Type: "invalid_type", // Invalid type
				Properties: map[string]any{
					"foo": map[string]any{
						"type": "string",
					},
				},
			},
			expected: "", // Should return empty string for invalid schema
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			result := tc.schema.AsGrammar()
			// The previous nested cmp.Diff guard was redundant: when the
			// case-insensitive, whitespace-trimmed comparison fails, the raw
			// strings necessarily differ, so the diff is always non-empty.
			// Fail directly and use the diff only for the failure message.
			if !strings.EqualFold(strings.TrimSpace(result), strings.TrimSpace(tc.expected)) {
				t.Fatalf("grammar mismatch (-want +got):\n%s", cmp.Diff(tc.expected, result))
			}
		})
	}
}
\ No newline at end of file
...@@ -850,6 +850,7 @@ func (s *Server) loadModel( ...@@ -850,6 +850,7 @@ func (s *Server) loadModel(
lpath multiLPath, lpath multiLPath,
ppath string, ppath string,
kvSize int, kvSize int,
kvCacheType string,
flashAttention bool, flashAttention bool,
threads int, threads int,
multiUserCache bool, multiUserCache bool,
...@@ -862,7 +863,7 @@ func (s *Server) loadModel( ...@@ -862,7 +863,7 @@ func (s *Server) loadModel(
panic(err) panic(err)
} }
ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention) ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
s.lc, err = llama.NewContextWithModel(s.model, ctxParams) s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
if err != nil { if err != nil {
panic(err) panic(err)
...@@ -903,6 +904,7 @@ func main() { ...@@ -903,6 +904,7 @@ func main() {
mainGpu := flag.Int("main-gpu", 0, "Main GPU") mainGpu := flag.Int("main-gpu", 0, "Main GPU")
flashAttention := flag.Bool("flash-attn", false, "Enable flash attention") flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size") kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := flag.Int("port", 8080, "Port to expose the server on") port := flag.Int("port", 8080, "Port to expose the server on")
threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
verbose := flag.Bool("verbose", false, "verbose output (default: disabled)") verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
...@@ -970,7 +972,7 @@ func main() { ...@@ -970,7 +972,7 @@ func main() {
} }
server.ready.Add(1) server.ready.Add(1)
go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache) go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
server.cond = sync.NewCond(&server.mu) server.cond = sync.NewCond(&server.mu)
......
// TODO: this is a temporary wrapper to allow calling C++ code from CGo // TODO: this is a temporary wrapper to allow calling C++ code from CGo
#include "sampling.h" #include "sampling.h"
#include "sampling_ext.h" #include "sampling_ext.h"
#include "json-schema-to-grammar.h"
struct gpt_sampler *gpt_sampler_cinit( struct gpt_sampler *gpt_sampler_cinit(
const struct llama_model *model, struct gpt_sampler_cparams *params) const struct llama_model *model, struct gpt_sampler_cparams *params)
{ {
try { try
{
gpt_sampler_params sparams; gpt_sampler_params sparams;
sparams.top_k = params->top_k; sparams.top_k = params->top_k;
sparams.top_p = params->top_p; sparams.top_p = params->top_p;
...@@ -24,7 +26,9 @@ struct gpt_sampler *gpt_sampler_cinit( ...@@ -24,7 +26,9 @@ struct gpt_sampler *gpt_sampler_cinit(
sparams.seed = params->seed; sparams.seed = params->seed;
sparams.grammar = params->grammar; sparams.grammar = params->grammar;
return gpt_sampler_init(model, sparams); return gpt_sampler_init(model, sparams);
} catch (const std::exception & err) { }
catch (const std::exception & err)
{
return nullptr; return nullptr;
} }
} }
...@@ -54,3 +58,24 @@ void gpt_sampler_caccept( ...@@ -54,3 +58,24 @@ void gpt_sampler_caccept(
{ {
gpt_sampler_accept(sampler, id, apply_grammar); gpt_sampler_accept(sampler, id, apply_grammar);
} }
// schema_to_grammar converts a JSON schema (as a JSON text string) into a
// GBNF grammar string, writing the result into the caller-supplied buffer.
//
// json_schema: null-terminated JSON schema text to convert.
// grammar:     output buffer for the null-terminated grammar string.
// max_len:     total capacity of `grammar` in bytes, including the terminator.
//
// Returns the number of grammar bytes written (excluding the terminator),
// truncating to max_len - 1 if necessary. On parse/conversion failure the
// buffer is set to the empty string and 0 is returned. The output is always
// null-terminated when max_len > 0.
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len)
{
    // No room to write anything, not even a terminator; also guards the
    // max_len - 1 computations below against unsigned underflow.
    if (grammar == nullptr || max_len == 0)
    {
        return 0;
    }
    try
    {
        nlohmann::json schema = nlohmann::json::parse(json_schema);
        std::string grammar_str = json_schema_to_grammar(schema);
        size_t len = grammar_str.length();
        if (len >= max_len)
        {
            len = max_len - 1; // truncate, leaving room for the terminator
        }
        // memcpy + explicit terminator: strncpy would not null-terminate
        // when the source is truncated.
        memcpy(grammar, grammar_str.c_str(), len);
        grammar[len] = '\0';
        return (int)len;
    }
    catch (const std::exception &e)
    {
        // Report failure as an empty grammar rather than leaving the
        // buffer uninitialized.
        grammar[0] = '\0';
        return 0;
    }
}
\ No newline at end of file
...@@ -47,6 +47,8 @@ extern "C" ...@@ -47,6 +47,8 @@ extern "C"
llama_token id, llama_token id,
bool apply_grammar); bool apply_grammar);
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { ...@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
}, offset, nil }, offset, nil
} }
func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) { func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
embedding := llm.KV().EmbeddingLength() embedding := llm.KV().EmbeddingLength()
heads := llm.KV().HeadCount() heads := llm.KV().HeadCount()
headsKV := llm.KV().HeadCountKV() headsKV := llm.KV().HeadCountKV()
...@@ -372,7 +372,8 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa ...@@ -372,7 +372,8 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa
layers := llm.Tensors().Layers() layers := llm.Tensors().Layers()
kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV bytesPerElement := kvCacheBytesPerElement(kvCacheType)
kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
switch llm.KV().Architecture() { switch llm.KV().Architecture() {
case "llama": case "llama":
...@@ -527,3 +528,34 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa ...@@ -527,3 +528,34 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa
return return
} }
// SupportsKVCacheType checks if the requested cache type is supported
func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
return slices.Contains(validKVCacheTypes, cacheType)
}
// SupportsFlashAttention checks if the model supports flash attention
func (ggml GGML) SupportsFlashAttention() bool {
_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
if isEmbedding {
return false
}
// Check head counts match and are non-zero
headCountK := ggml.KV().EmbeddingHeadCountK()
headCountV := ggml.KV().EmbeddingHeadCountV()
return headCountK != 0 && headCountV != 0 && headCountK == headCountV
}
// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
func kvCacheBytesPerElement(cacheType string) float64 {
switch cacheType {
case "q8_0":
return 1 // 1/2 of fp16
case "q4_0":
return 0.5 // 1/4 of fp16
default:
return 2 // f16 (default)
}
}
\ No newline at end of file
...@@ -123,7 +123,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, ...@@ -123,7 +123,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
slog.Warn("model missing blk.0 layer size") slog.Warn("model missing blk.0 layer size")
} }
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch))) fa := envconfig.FlashAttention() &&
discover.GetGPUInfo().FlashAttentionSupported() &&
ggml.SupportsFlashAttention()
var kvct string
if fa {
requested := strings.ToLower(envconfig.KvCacheType())
if requested != "" && ggml.SupportsKVCacheType(requested) {
kvct = requested
}
}
kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
if graphPartialOffload == 0 { if graphPartialOffload == 0 {
graphPartialOffload = ggml.KV().GQA() * kv / 6 graphPartialOffload = ggml.KV().GQA() * kv / 6
} }
...@@ -131,9 +147,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, ...@@ -131,9 +147,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
graphFullOffload = graphPartialOffload graphFullOffload = graphPartialOffload
} }
// KV is proportional to the number of layers
layerSize += kv / ggml.KV().BlockCount()
// on metal there's no partial offload overhead // on metal there's no partial offload overhead
if gpus[0].Library == "metal" { if gpus[0].Library == "metal" {
graphPartialOffload = graphFullOffload graphPartialOffload = graphFullOffload
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment