Merge pull request #1146 from dhiltgen/ext_server_cgo

Add cgo implementation for llama.cpp

Merge pull request #1146 from dhiltgen/ext_server_cgo
Add cgo implementation for llama.cpp
96fb441a · Daniel Hiltgen · GitHub · fabf2f34 · 495c06e4 · 96fb441a
Unverified Commit 96fb441a authored Dec 22, 2023 by Daniel Hiltgen Committed by GitHub Dec 22, 2023
20 changed files
--- a/llm/dynamic_shim.h
+++ b/llm/dynamic_shim.h
+#include <stdlib.h>
+
+#include "server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct dynamic_llama_server {  // table of function pointers mirroring the llama_server_* entry points declared in server.h
+  void *handle;  // presumably the handle of the library loaded from libPath by dynamic_shim_init — TODO confirm
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);  // counterpart for the json_resp out-parameters above
+};
+
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection
+void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                    ext_server_params_t *sparams,
+                                    ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          ext_server_resp_t *resp);
+
+void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                                 const int task_id,
+                                                 ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                        const char *json_req, char **json_resp,
+                                        ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          char **json_resp,
+                                          ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                         const char *json_req, char **json_resp,
+                                         ext_server_resp_t *err);
+void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                                 char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
+package llm
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
+#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
+#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
+#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
+#cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
+#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+#cgo darwin LDFLAGS: -lc++ -framework Accelerate
+#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
+#cgo linux CFLAGS: -D_GNU_SOURCE
+#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
+#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
+#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincpu/dist/lib
+#cgo windows LDFLAGS: -lcpu_server -lpthread
+
+#include <stdlib.h>
+#include "server.h"
+
+*/
+import "C"
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
+)
+
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+// freeExtServerResp releases the C message buffer referenced by resp. A
+// response whose msg_len is zero is treated as holding no buffer and is left
+// untouched.
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len != 0 {
+		C.free(unsafe.Pointer(resp.msg))
+	}
+}
+
+// extServerResponseToErr converts the message carried by resp into a Go error.
+// The message is passed as an argument instead of being used as the format
+// string, so any '%' characters in the server's text are reported verbatim
+// rather than being interpreted as formatting verbs.
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	return fmt.Errorf("%s", C.GoString(resp.msg))
+}
+
+type extServer interface { // an LLM plus the raw llama_server_* entry points used by the shared helpers (predict, encode, decode, embedding, close)
+	LLM
+	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
+	llama_server_start()
+	llama_server_stop()
+	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
+	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
+	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
+	llama_server_release_task_result(result *C.ext_server_task_result_t)
+	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_release_json_resp(json_resp **C.char) // releases a json_resp produced by tokenize/detokenize/embedding
+}
+
+type llamaExtServer struct { // extServer implementation whose llama_server_* methods call the C functions directly
+	api.Options // read back by Predict as llm.Options
+}
+
+// Note: current implementation does not support concurrent instantiations
+var mutex sync.Mutex
+
+func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { // pass-through to C.llama_server_init
+	C.llama_server_init(sparams, err)
+}
+func (llm *llamaExtServer) llama_server_start() { // pass-through to C.llama_server_start
+	C.llama_server_start()
+}
+func (llm *llamaExtServer) llama_server_stop() { // pass-through to C.llama_server_stop
+	C.llama_server_stop()
+}
+
+func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { // pass-through to C.llama_server_completion
+	C.llama_server_completion(json_req, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { // pass-through to C.llama_server_completion_next_result
+	C.llama_server_completion_next_result(task_id, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { // pass-through to C.llama_server_completion_cancel
+	C.llama_server_completion_cancel(task_id, err)
+}
+func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { // pass-through to C.llama_server_release_task_result
+	C.llama_server_release_task_result(result)
+}
+
+func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { // pass-through to C.llama_server_tokenize
+	C.llama_server_tokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { // pass-through to C.llama_server_detokenize
+	C.llama_server_detokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { // pass-through to C.llama_server_embedding
+	C.llama_server_embedding(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { // pass-through to C.llama_server_release_json_resp
+	C.llama_server_release_json_resp(json_resp)
+}
+
+// newDefaultExtServer wraps opts in a llamaExtServer and passes it to
+// newExtServer together with the remaining arguments, returning whatever
+// newExtServer returns.
+func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	return newExtServer(&llamaExtServer{Options: opts}, model, adapters, projectors, numLayers, opts)
+}
+
+// newExtServer initializes and starts server for the model file at path model.
+//
+// Only one server may be active at a time: the package-level mutex is taken
+// here and, on success, stays held until close() stops the server. If startup
+// fails no server is returned, so the mutex is released before returning the
+// error; otherwise every later call would block forever.
+//
+// adapters are passed to the server as a linked list of LoRA adapters (scale
+// fixed at 1.0); only the first entry of projectors is used. All C memory
+// allocated for the parameters is freed when this function returns.
+//
+// It returns server on success, or a nil extServer and an error if the model
+// file cannot be stat'ed or llama_server_init reports a negative id.
+func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	if !mutex.TryLock() {
+		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
+		mutex.Lock()
+	}
+	// Release the lock on every failure path; on success it is released by close().
+	started := false
+	defer func() {
+		if !started {
+			mutex.Unlock()
+		}
+	}()
+
+	fileInfo, err := os.Stat(model)
+	if err != nil {
+		return nil, err
+	}
+	var sparams C.ext_server_params_t
+	sparams.model = C.CString(model)
+	defer C.free(unsafe.Pointer(sparams.model))
+
+	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
+
+	sparams.embedding = true
+	sparams.n_ctx = C.uint(opts.NumCtx)
+	sparams.n_batch = C.uint(opts.NumBatch)
+	sparams.n_gpu_layers = C.int(numGPU)
+	sparams.main_gpu = C.int(opts.MainGPU)
+	sparams.n_parallel = 1 // TODO - wire up concurrency
+
+	// Always use the value encoded in the model
+	sparams.rope_freq_base = 0.0
+	sparams.rope_freq_scale = 0.0
+	sparams.memory_f16 = C.bool(opts.F16KV)
+	sparams.use_mlock = C.bool(opts.UseMLock)
+	sparams.use_mmap = C.bool(opts.UseMMap)
+	sparams.numa = C.bool(opts.UseNUMA)
+
+	// Build the adapter list in order, tracking the tail so each append is O(1).
+	// The deferred frees must outlive llama_server_init, hence defer in the loop.
+	sparams.lora_adapters = nil
+	var tail *C.ext_server_lora_adapter_t
+	for _, adapter := range adapters {
+		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
+		defer C.free(unsafe.Pointer(la))
+		la.adapter = C.CString(adapter)
+		defer C.free(unsafe.Pointer(la.adapter))
+		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
+		la.next = nil
+		if tail == nil {
+			sparams.lora_adapters = la
+		} else {
+			tail.next = la
+		}
+		tail = la
+	}
+
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		sparams.mmproj = C.CString(projectors[0])
+		defer C.free(unsafe.Pointer(sparams.mmproj))
+	} else {
+		sparams.mmproj = nil
+	}
+
+	sparams.n_threads = C.uint(opts.NumThread)
+
+	log.Printf("Initializing internal llama server")
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	server.llama_server_init(&sparams, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+
+	log.Printf("Starting internal llama main loop")
+	server.llama_server_start()
+	started = true
+	return server, nil
+}
+
+func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { // delegates to predict with the options stored on the receiver
+	return predict(llm, llm.Options, ctx, pred, fn)
+}
+
+// predict submits a streaming completion request built from predict and opts
+// to llm and feeds each decoded chunk to fn.
+//
+// fn is called once per result with non-empty content and a final time with
+// Done set, carrying the timing counters, when the server reports stop. If a
+// result is flagged as an error and mentions "slot unavailable", the request
+// is resubmitted with exponential backoff, up to maxRetries attempts.
+//
+// It returns nil on normal completion or when ctx is cancelled and the task
+// cancel succeeds; otherwise it returns the server's error, a JSON
+// marshal/unmarshal error, or "max retries exceeded".
+func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var imageData []ImageData
+	for cnt, i := range predict.Images {
+		imageData = append(imageData, ImageData{Data: i, ID: cnt})
+	}
+	log.Printf("loaded %d images", len(imageData))
+
+	request := map[string]any{
+		"prompt":            predict.Prompt,
+		"stream":            true,
+		"n_predict":         opts.NumPredict,
+		"n_keep":            opts.NumKeep,
+		"temperature":       opts.Temperature,
+		"top_k":             opts.TopK,
+		"top_p":             opts.TopP,
+		"tfs_z":             opts.TFSZ,
+		"typical_p":         opts.TypicalP,
+		"repeat_last_n":     opts.RepeatLastN,
+		"repeat_penalty":    opts.RepeatPenalty,
+		"presence_penalty":  opts.PresencePenalty,
+		"frequency_penalty": opts.FrequencyPenalty,
+		"mirostat":          opts.Mirostat,
+		"mirostat_tau":      opts.MirostatTau,
+		"mirostat_eta":      opts.MirostatEta,
+		"penalize_nl":       opts.PenalizeNewline,
+		"seed":              opts.Seed,
+		"stop":              opts.Stop,
+		"image_data":        imageData,
+	}
+
+	if predict.Format == "json" {
+		request["grammar"] = jsonGrammar
+	}
+
+	// The request does not change between retries, so encode it and copy it
+	// into C memory once. Doing this inside the loop with a deferred free would
+	// keep one C string per attempt alive until the function returns.
+	// Handling JSON marshaling with special characters unescaped.
+	buffer := &bytes.Buffer{}
+	enc := json.NewEncoder(buffer)
+	enc.SetEscapeHTML(false)
+
+	if err := enc.Encode(request); err != nil {
+		return fmt.Errorf("failed to marshal data: %w", err)
+	}
+
+	req := C.CString(buffer.String())
+	defer C.free(unsafe.Pointer(req))
+
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		llm.llama_server_completion(req, &resp)
+		if resp.id < 0 {
+			return extServerResponseToErr(resp)
+		}
+
+		// The result loop is left only by returning or by "break out", and the
+		// latter always means the request must be retried.
+	out:
+		for {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				llm.llama_server_completion_cancel(resp.id, &resp)
+				if resp.id < 0 {
+					return extServerResponseToErr(resp)
+				}
+				return nil
+			default:
+				var result C.ext_server_task_result_t
+				llm.llama_server_completion_next_result(resp.id, &result)
+				// Copy the payload into Go memory before handing the result back.
+				json_resp := C.GoString(result.json_resp)
+				llm.llama_server_release_task_result(&result)
+
+				var p prediction
+				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
+					llm.llama_server_completion_cancel(resp.id, &resp)
+					if resp.id < 0 {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
+					}
+					return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
+				}
+
+				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
+					// task will already be canceled
+					break out
+				}
+
+				if p.Content != "" {
+					fn(PredictResult{
+						Content: p.Content,
+					})
+				}
+
+				if p.Stop {
+					fn(PredictResult{
+						Done:               true,
+						PromptEvalCount:    p.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+						EvalCount:          p.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+	}
+
+	// should never reach here ideally
+	return fmt.Errorf("max retries exceeded")
+}
+func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { // delegates to the shared encode helper
+	return encode(llm, ctx, prompt)
+}
+
+// encode sends prompt to llm's tokenize entry point as a JSON
+// TokenizeRequest and returns the Tokens field of the decoded
+// TokenizeResponse. Errors from marshaling, from the server (negative
+// response id) or from decoding the reply are returned; ctx is not consulted.
+func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
+	payload, err := json.Marshal(TokenizeRequest{Content: prompt})
+	if err != nil {
+		return nil, fmt.Errorf("marshaling encode data: %w", err)
+	}
+
+	cReq := C.CString(string(payload))
+	defer C.free(unsafe.Pointer(cReq))
+
+	status := newExtServerResp(128)
+	defer freeExtServerResp(status)
+
+	var cJSON *C.char
+	llm.llama_server_tokenize(cReq, &cJSON, &status)
+	if status.id < 0 {
+		return nil, extServerResponseToErr(status)
+	}
+	defer llm.llama_server_release_json_resp(&cJSON)
+
+	var result TokenizeResponse
+	if decodeErr := json.Unmarshal([]byte(C.GoString(cJSON)), &result); decodeErr != nil {
+		return nil, fmt.Errorf("unmarshal encode response: %w", decodeErr)
+	}
+
+	return result.Tokens, nil
+}
+
+func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { // delegates to the shared decode helper
+	return decode(llm, ctx, tokens)
+}
+
+// decode sends tokens to llm's detokenize entry point as a JSON
+// DetokenizeRequest and returns the Content field of the decoded
+// DetokenizeResponse. An empty token slice yields "" without calling the
+// server. Errors from marshaling, from the server (negative response id) or
+// from decoding the reply are returned; ctx is not consulted.
+func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
+	if len(tokens) == 0 {
+		return "", nil
+	}
+	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
+	if err != nil {
+		return "", fmt.Errorf("marshaling decode data: %w", err)
+	}
+
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_detokenize(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return "", extServerResponseToErr(resp)
+	}
+	defer llm.llama_server_release_json_resp(&json_resp)
+
+	var decoded DetokenizeResponse
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
+		// Name the operation that actually failed (previously said "encode").
+		return "", fmt.Errorf("unmarshal decode response: %w", err)
+	}
+
+	return decoded.Content, nil
+}
+
+func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { // delegates to the shared embedding helper
+	return embedding(llm, ctx, input)
+}
+// embedding sends input to llm's embedding entry point, wrapped in a JSON
+// TokenizeRequest, and returns the Embedding field of the decoded
+// EmbeddingResponse. Errors from marshaling, from the server (negative
+// response id) or from decoding the reply are returned; ctx is not consulted.
+func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
+	data, err := json.Marshal(TokenizeRequest{Content: input})
+	if err != nil {
+		return nil, fmt.Errorf("error marshaling embed data: %w", err)
+	}
+
+	req := C.CString(string(data))
+	defer C.free(unsafe.Pointer(req))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_embedding(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
+	}
+	defer llm.llama_server_release_json_resp(&json_resp)
+
+	// Named "decoded" so the local does not shadow this function's own name.
+	var decoded EmbeddingResponse
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
+		// Name the operation that actually failed (previously said "tokenize").
+		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
+	}
+
+	return decoded.Embedding, nil
+}
+
+func (llm *llamaExtServer) Close() { // delegates to the package-level close helper (not the builtin)
+	close(llm)
+}
+
+func close(llm extServer) { // NOTE: this package-level name shadows Go's builtin close within the package
+	llm.llama_server_stop()
+	mutex.Unlock() // releases the lock taken in newExtServer so another server can be created
+}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -86,74 +86,6 @@ type container interface {
 	Decode(*readSeekOffset) (model, error)
 }

-type containerGGML struct{}
-
-func (c *containerGGML) Name() string {
-	return "ggml"
-}
-
-func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
-	// file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-	return nil, nil
-}
-
-type containerGGMF struct {
-	version uint32
-}
-
-func (c *containerGGMF) Name() string {
-	return "ggmf"
-}
-
-func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
-
-	switch version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
-	return nil, nil
-}
-
-type containerGGJT struct {
-	version uint32
-}
-
-func (c *containerGGJT) Name() string {
-	return "ggjt"
-}
-
-func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
-
-	switch version {
-	case 1, 2, 3:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// different model types may have different layouts for hyperparameters
-	var llama llamaModel
-	binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
-	return &llama, nil
-}
-
 type containerLORA struct {
 	version uint32
 }
@@ -194,6 +126,8 @@ const (
 	FILE_MAGIC_GGUF_BE = 0x47475546
 )

+var ErrUnsupportedFormat = errors.New("unsupported model format")
+
 func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	ro := readSeekOffset{ReadSeeker: r}

@@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {

 	var c container
 	switch magic {
-	case FILE_MAGIC_GGML:
-		c = &containerGGML{}
-	case FILE_MAGIC_GGMF:
-		c = &containerGGMF{}
-	case FILE_MAGIC_GGJT:
-		c = &containerGGJT{}
+	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
+		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerLORA{}
 	case FILE_MAGIC_GGUF_LE:

--- a/llm/llama.cpp/gen_common.sh
+++ b/llm/llama.cpp/gen_common.sh
+# common logic across linux and darwin
+
+# Reset the variables shared by every build: the llama.cpp checkout directory,
+# the patch list, the CMake definitions and the CMake targets. A debug-info
+# flag in CGO_CFLAGS selects a RelWithDebInfo build; otherwise Release is used.
+init_vars() {
+    LLAMACPP_DIR=gguf
+    PATCHES="0001-Expose-callable-API-for-server.patch"
+    CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_SERVER_VERBOSE=off"
+    # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
+    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
+    # Only treat "-g" as a debug request when it starts a flag (-g, -g3, -ggdb, ...).
+    # A plain substring match would also fire on flags such as -fno-gnu-keywords.
+    if echo "${CGO_CFLAGS}" | grep -E -- '(^|[[:space:]])-g' >/dev/null; then
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
+    else
+        # TODO - add additional optimization flags...
+        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release ${CMAKE_DEFS}"
+    fi
+}
+
+# Initialize the git submodules and force-update the gguf checkout, unless
+# OLLAMA_SKIP_PATCHING is set to a non-empty value.
+git_module_setup() {
+    if [ -z "${OLLAMA_SKIP_PATCHING}" ]; then
+        git submodule init
+        git submodule update --force gguf
+    else
+        echo "Skipping submodule initialization"
+    fi
+}
+
+# Apply every patch listed in PATCHES to the gguf checkout, unless
+# OLLAMA_SKIP_PATCHING is set to a non-empty value.
+apply_patches() {
+    if [ -z "${OLLAMA_SKIP_PATCHING}" ]; then
+        # Workaround git apply not handling creation well for iteration
+        rm -f gguf/examples/server/server.h
+        for patch in ${PATCHES}; do
+            git -C gguf apply ../patches/${patch}
+        done
+    else
+        echo "Skipping submodule patching"
+    fi
+}
+
+build() {  # configure LLAMACPP_DIR into BUILD_DIR, then build CMAKE_TARGETS; the caller sets BUILD_DIR first
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}  # CMAKE_DEFS is unquoted, so the shell splits it into separate -D arguments
+    cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8  # fixed at 8 parallel jobs
+}
--- a/llm/llama.cpp/gen_darwin.sh
+++ b/llm/llama.cpp/gen_darwin.sh
+#!/bin/bash
+# This script is intended to run inside the go generate
+# working directory must be ../llm/llama.cpp
+
+# TODO - add hardening to detect missing tools (cmake, etc.)
+
+set -ex  # -e: stop at the first failing command; -x: trace commands as they run
+set -o pipefail
+echo "Starting darwin generate script"
+source $(dirname $0)/gen_common.sh  # defines init_vars, git_module_setup, apply_patches and build
+init_vars
+CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on ${CMAKE_DEFS}"  # prepend the darwin-specific settings to the common ones
+BUILD_DIR="gguf/build/metal"
+case "${GOARCH}" in  # pick the target architecture from GOARCH
+    "amd64")
+        CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}"
+        ;;
+     "arm64")
+        CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
+        ;;
+    *)  # GOARCH unset or any other value: refuse to build
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
+esac
+
+git_module_setup
+apply_patches
+build
\ No newline at end of file
--- a/llm/llama.cpp/gen_linux.sh
+++ b/llm/llama.cpp/gen_linux.sh
+#!/bin/bash
+# This script is intended to run inside the go generate
+# working directory must be llm/llama.cpp
+
+# First we build our default built-in library which will be linked into the CGO
+# binary as a normal dependency. This default build is CPU based.
+#
+# Then we build a CUDA dynamic library (although statically linked with the CUDA
+# library dependencies for maximum portability)
+#
+# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  ROCm is particularly
+# important to be a dynamic lib even if it's the only GPU library detected because
+# we can't redistribute the objectfiles but must rely on dynamic libraries at
+# runtime, which could lead the server not to start if not present.
+
+set -ex  # -e: stop at the first failing command; -x: trace commands as they run
+set -o pipefail
+
+echo "Starting linux generate script"
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then  # default CUDACXX to the stock nvcc location only when the caller did not set it
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+OLLAMA_DYN_LIB_DIR="gguf/build/lib"  # output directory for the dynamically loaded GPU libraries
+source $(dirname $0)/gen_common.sh  # defines init_vars, git_module_setup, apply_patches and build
+init_vars
+git_module_setup
+apply_patches
+
+mkdir -p ${OLLAMA_DYN_LIB_DIR}
+touch ${OLLAMA_DYN_LIB_DIR}/.generated  # marker file so the directory is never empty
+
+#
+# CPU first for the default library
+#
+CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cpu"  # matches the static library paths in the linux cgo LDFLAGS of llm/ext_server.go
+build
+
+if [ -d /usr/local/cuda/lib64/ ]; then  # CUDA is detected only at this fixed install location
+    echo "CUDA libraries detected - building dynamic CUDA library"
+    init_vars  # reset CMAKE_DEFS so the CPU build's settings do not accumulate
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    BUILD_DIR="gguf/build/cuda"
+    CUDA_LIB_DIR=/usr/local/cuda/lib64
+    build
+    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/libcuda_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive \
+        ${CUDA_LIB_DIR}/libcudart_static.a \
+        ${CUDA_LIB_DIR}/libcublas_static.a \
+        ${CUDA_LIB_DIR}/libcublasLt_static.a \
+        ${CUDA_LIB_DIR}/libcudadevrt.a \
+        ${CUDA_LIB_DIR}/libculibos.a \
+        -lrt -lpthread -ldl -lstdc++ -lm
+fi
+
+# Locate optional GPU toolchains, honouring ROCM_PATH / CLBlast_DIR from the
+# environment and falling back to their default install locations.
+if [ -z "${ROCM_PATH}" ]; then
+    # Try the default location in case it exists
+    ROCM_PATH=/opt/rocm
+fi
+
+if [ -z "${CLBlast_DIR}" ]; then
+    # Try the default location in case it exists
+    if [ -d /usr/lib/cmake/CLBlast ]; then
+        export CLBlast_DIR=/usr/lib/cmake/CLBlast
+    fi
+fi
+
+# Build the ROCm server library when a ROCm install is present. The link step
+# uses ${ROCM_PATH}/lib for both the library search path and the rpath, so a
+# ROCm install outside /opt/rocm is used consistently for compile and link.
+if [ -d "${ROCM_PATH}" ]; then
+    echo "ROCm libraries detected - building dynamic ROCm library"
+    init_vars
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
+    BUILD_DIR="gguf/build/rocm"
+    build
+    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/librocm_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive \
+        -lrt -lpthread -ldl -lstdc++ -lm \
+        -L"${ROCM_PATH}/lib" -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -Wl,-rpath,"${ROCM_PATH}/lib",-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
+fi
--- a/llm/llama.cpp/gen_windows.ps1
+++ b/llm/llama.cpp/gen_windows.ps1
+#!powershell
+
+$ErrorActionPreference = "Stop"
+
+function init_vars {
+    # Reset the patch list, the CMake definitions and the build configuration
+    # shared by every build in this script.
+    $script:patches = @("0001-Expose-callable-API-for-server.patch")
+    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-A","x64")
+
+    # CGO_CFLAGS is a single string, so -contains (collection membership) would
+    # only be true when the whole value is exactly "-g". Use a regex match to
+    # detect a flag starting with -g anywhere in the string instead.
+    if ($env:CGO_CFLAGS -match '(^|\s)-g') {
+        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on")
+        $script:config = "RelWithDebInfo"
+    } else {
+        $script:config = "Release"
+    }
+}
+
+function git_module_setup {  # initialize submodules and force-update the gguf checkout; exits with git's exit code on failure
+    # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
+    & git submodule init
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    & git submodule update --force gguf
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+}
+
+function apply_patches {  # apply every patch in $script:patches to the gguf checkout; exits with git's exit code on failure
+    rm -erroraction ignore -path "gguf/examples/server/server.h"  # remove any existing server.h before applying; a missing file is not an error
+    foreach ($patch in $script:patches) {
+        write-host "Applying patch $patch"
+        & git -C gguf apply ../patches/$patch
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    }
+}
+
+function build {  # configure gguf into $script:buildDir with $script:cmakeDefs, then build $script:config; exits on any cmake failure
+    write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
+    & cmake --version
+    & cmake -S gguf -B $script:buildDir $script:cmakeDefs
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    write-host "building with: cmake --build $script:buildDir --config $script:config"
+    & cmake --build $script:buildDir --config $script:config
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+}
+
+function install {  # wipe $script:installDir and run cmake --install from $script:buildDir into it
+    rm -erroraction ignore -recurse -force -path $script:installDir
+    & cmake --install $script:buildDir --prefix $script:installDir --config $script:config
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+
+}
+
+init_vars
+git_module_setup
+apply_patches
+
+# first build CPU based
+$script:buildDir="gguf/build/wincpu"
+$script:installDir="gguf/build/wincpu/dist"
+
+build
+# install
+
+md gguf/build/lib -ea 0  # -ea 0: do not stop if the directory already exists
+md gguf/build/wincpu/dist/lib -ea 0
+mv gguf/build/wincpu/bin/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.dll  # lands in the -L directory named by the windows cgo LDFLAGS (-lcpu_server)
+
+
+# Nope, this barfs on lots of symbol problems
+#mv gguf/build/wincpu/examples/server/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.lib
+# Nope: this needs lots of include paths to pull in things like msvcprt.lib and other deps
+# & cl.exe `
+#     gguf/build/wincpu/examples/server/$script:config/ext_server.lib `
+#     gguf/build/wincpu/common/$script:config/common.lib `
+#     gguf/build/wincpu/$script:config/llama.lib `
+#     gguf/build/wincpu/$script:config/ggml_static.lib `
+#     /link /DLL /DEF:cpu_server.def /NOENTRY /MACHINE:X64  /OUT:gguf/build/wincpu/dist/lib/cpu_server.dll
+# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+
+# Then build cuda as a dynamically loaded library
+init_vars  # reset cmakeDefs/config so the CPU build's settings do not accumulate
+$script:buildDir="gguf/build/wincuda"
+$script:installDir="gguf/build/wincuda/dist"
+$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DBUILD_SHARED_LIBS=on")  # NOTE(review): -DBUILD_SHARED_LIBS=on is already set by init_vars, so it is passed twice
+build
+install
+cp gguf/build/wincuda/dist/bin/ext_server_shared.dll gguf/build/lib/cuda_server.dll
+
+# TODO - more to do here to create a usable dll
+
+
+# TODO - implement ROCm support on windows
+md gguf/build/winrocm/lib -ea 0
+echo $null >> gguf/build/winrocm/lib/.generated  # creates the placeholder file if it does not exist
--- a/llm/llama.cpp/generate_darwin.go
+++ b/llm/llama.cpp/generate_darwin.go
+package llm
+
+//go:generate sh ./gen_darwin.sh
--- a/llm/llama.cpp/generate_darwin_amd64.go
+++ b/llm/llama.cpp/generate_darwin_amd64.go
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
--- a/llm/llama.cpp/generate_darwin_arm64.go
+++ b/llm/llama.cpp/generate_darwin_arm64.go
-package llm
-
-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/metal --target server --config Release
-//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build gguf/build/metal --target server --config Release
-//go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
 package llm

-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
-
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
-//go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
+//go:generate bash ./gen_linux.sh
--- a/llm/llama.cpp/generate_windows.go
+++ b/llm/llama.cpp/generate_windows.go
 package llm

-//go:generate git submodule init
-
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
-
-//go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cpu --target server --config Release
-//go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
-
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
-
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
-//go:generate cmake --build gguf/build/cuda --target server --config Release
-//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe
+//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
--- a/ggml @ 9e232f02
+++ b/ggml @ 9e232f02
-Subproject commit 9e232f0234073358e7031c1b8d7aa45020469a3b
--- a/gguf @ 328b83de
+++ b/gguf @ 328b83de
-Subproject commit a7aee47b98e45539d491071b25778b833b77e387
+Subproject commit 328b83de23b33240e28f4e74900d1d06726f5eb1
--- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
+++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
+From 4c72576c5f6c2217b1ecf7fd8523616acc5526ae Mon Sep 17 00:00:00 2001
+From: Daniel Hiltgen <daniel@ollama.com>
+Date: Mon, 13 Nov 2023 12:25:58 -0800
+Subject: [PATCH] Expose callable API for server
+
+This adds an extern "C" interface within the example server
+---
+ examples/server/CMakeLists.txt |  24 +++
+ examples/server/server.cpp     | 279 +++++++++++++++++++++++++++++++++
+ examples/server/server.h       |  89 +++++++++++
+ ggml-cuda.cu                   |   1 +
+ 4 files changed, 393 insertions(+)
+ create mode 100644 examples/server/server.h
+
+diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
+index 859cd12..4ea47a7 100644
+--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
+@@ -11,3 +11,27 @@ if (WIN32)
+     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+ endif()
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET ext_server) # library flavour of the example server
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+add_library(${TARGET} STATIC server.cpp)
+target_include_directories(${TARGET} PRIVATE ../../common)
+target_include_directories(${TARGET} PRIVATE ../..)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1) # compiles the extern "C" API in server.cpp instead of main()
+target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>) # reuse the static library's object files
+    target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS ext_server_shared LIBRARY)
+endif()
+
+if (CUDAToolkit_FOUND)
+    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    if (WIN32 AND TARGET ext_server_shared) # ext_server_shared only exists when BUILD_SHARED_LIBS is on; linking a missing target is a configure error
+        target_link_libraries(ext_server_shared PRIVATE nvml)
+    endif()
+endif()
+\ No newline at end of file
+diff --git a/examples/server/server.cpp b/examples/server/server.cpp
+index 0403853..5e78e4d 100644
+--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
+@@ -5,6 +5,9 @@
+ #include "../llava/clip.h"
+ 
+ #include "stb_image.h"
+#if defined(LLAMA_SERVER_LIBRARY)
+#include "server.h"
+#endif
+ 
+ #ifndef NDEBUG
+ // crash the server in debug mode, otherwise send an http 500 error
+@@ -2643,6 +2646,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
+     }
+ }
+ 
+#ifndef LLAMA_SERVER_LIBRARY
+ int main(int argc, char **argv)
+ {
+ #if SERVER_VERBOSE != 1
+@@ -3123,3 +3127,278 @@ int main(int argc, char **argv)
+     llama_backend_free();
+     return 0;
+ }
+
+#else // LLAMA_SERVER_LIBRARY
+// Expose the llama server as a callable extern "C" API
+llama_server_context *llama = NULL;
+std::atomic<bool> ext_server_running(false);
+std::thread ext_server_thread;
+
+// Build the global llama_server_context from caller-supplied parameters and load the model.
+// On any failure err->id is set to -1, err->msg describes it, and the context is freed again.
+void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err)
+{
+#if SERVER_VERBOSE != 1
+    log_disable();
+#endif
+    assert(err != NULL && sparams != NULL);
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        llama = new llama_server_context;
+        log_set_target(stdout);
+        gpt_params params;
+        params.n_ctx = sparams->n_ctx;
+        params.n_batch = sparams->n_batch;
+        if (sparams->n_threads > 0) { // otherwise keep the gpt_params default
+            params.n_threads = sparams->n_threads;
+        }
+        params.n_parallel = sparams->n_parallel;
+        params.rope_freq_base = sparams->rope_freq_base;
+        params.rope_freq_scale = sparams->rope_freq_scale;
+        // memory_f16 picks the element type for both the K and the V cache
+        if (sparams->memory_f16)  {
+            params.cache_type_k = "f16";
+            params.cache_type_v = "f16";
+        } else {
+            params.cache_type_k = "f32";
+            params.cache_type_v = "f32";
+        }
+        params.n_gpu_layers = sparams->n_gpu_layers;
+        params.main_gpu = sparams->main_gpu;
+        params.use_mlock = sparams->use_mlock;
+        params.use_mmap = sparams->use_mmap;
+        params.numa = sparams->numa;
+        params.embedding = sparams->embedding;
+        if (sparams->model != NULL) {
+            params.model = sparams->model;
+        }
+        // lora_adapters is a caller-owned linked list terminated by a NULL next pointer
+        for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+            params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+        }
+        if (sparams->mmproj != NULL) {
+            params.mmproj = std::string(sparams->mmproj);
+        }
+        llama_backend_init(params.numa);
+        // load the model
+        if (!llama->load_model(params)) {
+            // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages
+            // and pass them back to the caller for better UX
+            err->id = -1;
+            snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+        } else {
+            llama->initialize();
+        }
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server");
+    }
+    if (err->id != 0) { // failed: do not leak the context or leave a half-initialized global behind
+        delete llama;
+        llama = NULL;
+    }
+}
+
+// Start the background thread that drives the server: it calls update_slots() until the running flag is cleared.
+void llama_server_start() {
+    assert(llama != NULL);
+    // TODO mutex to protect thread creation
+    ext_server_running = true; // set before spawning: a llama_server_stop() issued right after start must not be overwritten by the new thread
+    ext_server_thread = std::thread([&]()
+    {
+        try {
+            LOG_TEE("llama server main loop starting\n");
+            ggml_time_init();
+            while (ext_server_running.load())
+            {
+                if (!llama->update_slots()) {
+                    LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n");
+                    break;
+                }
+            }
+        } catch (std::exception &e) {
+            LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
+        } catch (...) {
+            LOG_TEE("caught unknown exception in llama server main loop\n");
+        }
+        LOG_TEE("\nllama server shutting down\n");
+        llama_backend_free();
+    });
+}
+
+void llama_server_stop() {
+    assert(llama != NULL);
+    // Stops the main loop and frees the context. TODO - logging is too verbose, remove once things are solid
+    LOG_TEE("requesting llama server shutdown\n");
+    ext_server_running = false; // the loop started by llama_server_start() polls this flag
+    if (ext_server_thread.joinable()) ext_server_thread.join(); // join() throws std::system_error if start() never ran
+    delete llama;
+    llama = NULL;
+    LOG_TEE("llama server shutdown complete\n");
+}
+
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp) { // queues a completion task parsed from json_req
+    assert(llama != NULL && json_req != NULL && resp != NULL);
+    resp->id = -1; // stays -1 unless request_completion hands back a task id
+    resp->msg[0] = '\0';
+    try {
+        json data = json::parse(json_req);
+        resp->id = llama->request_completion(data, false, false, -1);
+    } catch (std::exception &e) {
+        snprintf(resp->msg, resp->msg_len, "exception %s", e.what()); // id is still -1 here, which marks the failure
+    } catch (...) {
+        snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+    }
+}
+
+// Fetch the next result for task_id and hand it back as a heap-allocated, NUL-terminated JSON string.
+// The buffer stored in resp->json_resp must be freed with llama_server_release_task_result().
+void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) {
+    assert(llama != NULL && resp != NULL);
+    resp->id = -1;
+    resp->stop = false;
+    resp->error = false;
+    resp->json_resp = NULL;
+    std::string result_json;
+    try {
+        task_result result = llama->next_result(task_id);
+        result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        resp->id = result.id;
+        resp->stop = result.stop;
+        resp->error = result.error;
+        if (result.error || result.stop) { // either way the task is finished
+            llama->request_cancel(task_id);
+        }
+    } catch (std::exception &e) {
+        resp->error = true;
+        resp->id = -1;
+        // serialize through json so quotes or backslashes in what() cannot produce an invalid document
+        result_json = json{{"error", "exception " + std::string(e.what())}}.dump(-1, ' ', false, json::error_handler_t::replace);
+    } catch (...) {
+        resp->error = true;
+        resp->id = -1;
+        result_json = "{\"error\":\"Unknown exception during completion\"}";
+    }
+    const std::string::size_type size = result_json.size() + 1;
+    resp->json_resp = new char[size];
+    snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+// Free the JSON buffer allocated by llama_server_completion_next_result; a NULL result is ignored.
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+    if (result == NULL) return;
+    delete[] result->json_resp; // delete[] of NULL is a no-op
+    result->json_resp = NULL;   // a second release of the same result is now harmless instead of a double free
+}
+
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) { // forwards task_id to request_cancel
+    assert(llama != NULL && err != NULL);
+    err->id = 0; // 0 unless an exception is caught below
+    err->msg[0] = '\0';
+    try {
+        llama->request_cancel(task_id);
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server");
+    }
+}
+
+void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) { // tokenizes json_req["content"] into a JSON response
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL; // remains NULL on every error path
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json body = json::parse(json_req);
+        std::vector<llama_token> tokens;
+        if (body.count("content") != 0) // a request without "content" yields an empty token list
+        {
+            tokens = llama->tokenize(body["content"], false);
+        }
+        const json data = format_tokenizer_response(tokens);
+        std::string result_json = data.dump();
+        const std::string::size_type size = result_json.size() + 1; // +1 for the terminating NUL
+        *json_resp = new char[size]; // freed with llama_server_release_json_resp (delete[])
+        snprintf(*json_resp, size, "%s", result_json.c_str());
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+    }
+}
+
+// Free a buffer handed out through json_resp by tokenize/detokenize/embedding; NULL inputs are ignored.
+void llama_server_release_json_resp(char **json_resp) {
+    if (json_resp == NULL) return;
+    delete[] *json_resp; // delete[] of NULL is a no-op
+    *json_resp = NULL;   // a second release of the same pointer is now harmless instead of a double free
+}
+
+void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) { // turns json_req["tokens"] back into text
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL; // remains NULL on every error path
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json body = json::parse(json_req);
+        std::string content;
+        if (body.count("tokens") != 0) // a request without "tokens" yields empty content
+        {
+            const std::vector<llama_token> tokens = body["tokens"];
+            content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
+        }
+        const json data = format_detokenized_response(content);
+        std::string result_json = data.dump();
+        const std::string::size_type size = result_json.size() + 1; // +1 for the terminating NUL
+        *json_resp = new char[size]; // freed with llama_server_release_json_resp (delete[])
+        snprintf(*json_resp, size, "%s", result_json.c_str());
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+    }
+}
+
+void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) { // runs an embedding request for json_req["content"]
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL; // remains NULL on every error path
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json body = json::parse(json_req);
+        json prompt;
+        if (body.count("content") != 0)
+        {
+            prompt = body["content"];
+        }
+        else
+        {
+            prompt = ""; // no "content" key: fall back to the empty string
+        }
+        const int task_id = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); // third flag is true here but false in llama_server_completion - presumably selects embedding mode, TODO confirm
+        task_result result = llama->next_result(task_id); // NOTE(review): result.error is not inspected; the JSON is returned as-is
+        std::string result_json = result.result_json.dump();
+        const std::string::size_type size = result_json.size() + 1; // +1 for the terminating NUL
+        *json_resp = new char[size]; // freed with llama_server_release_json_resp (delete[])
+        snprintf(*json_resp, size, "%s", result_json.c_str());
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+    }
+}
+
+#endif // LLAMA_SERVER_LIBRARY
+\ No newline at end of file
+diff --git a/examples/server/server.h b/examples/server/server.h
+new file mode 100644
+index 0000000..d22f1b6
+--- /dev/null
+++ b/examples/server/server.h
+@@ -0,0 +1,89 @@
+#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+// This exposes extern C entrypoints into the llama_server 
+// To enable the server compile with LLAMA_SERVER_LIBRARY
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+    typedef struct ext_server_resp {
+        int id; // < 0 on error
+        size_t msg_len; // caller must allocate msg and set msg_len
+        char *msg; // the server writes a NUL-terminated message here with snprintf(msg, msg_len, ...)
+    } ext_server_resp_t;
+
+    // Allocated and freed by caller
+    typedef struct ext_server_lora_adapter {
+        char *adapter; // pushed together with scale onto gpt_params.lora_adapter by llama_server_init
+        float scale;
+        struct ext_server_lora_adapter *next; // next list node; NULL ends the list
+    } ext_server_lora_adapter_t;
+
+    // Allocated and freed by caller
+    typedef struct ext_server_params
+    {
+        char *model;            // copied into gpt_params.model when non-NULL
+        uint32_t n_ctx;         // text context, 0 = from model
+        uint32_t n_batch;       // prompt processing maximum batch size
+        uint32_t n_threads;     // number of threads to use for generation (only applied when > 0)
+        int32_t n_parallel;     // number of parallel sequences to decode
+        float rope_freq_base;   // RoPE base frequency, 0 = from model
+        float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        bool memory_f16;        // use f16 instead of f32 for memory kv
+        int32_t n_gpu_layers;   // number of layers to store in VRAM (-1 - use default)
+        int32_t main_gpu;       // the GPU that is used for scratch and small tensors
+        bool use_mlock;         // force system to keep model in RAM
+        bool use_mmap;          // use mmap if possible
+        bool numa;              // attempt optimizations that help on some NUMA systems
+        bool embedding;         // get only sentence embedding
+        ext_server_lora_adapter_t* lora_adapters; // head of a linked list, may be NULL
+        char *mmproj;           // copied into gpt_params.mmproj when non-NULL
+    } ext_server_params_t;
+
+    typedef struct ext_server_task_result
+    {
+        int id; // id of the underlying task result; -1 when an exception was caught
+        bool stop; // copied from the underlying task result
+        bool error; // copied from the underlying task result, or true when an exception was caught
+        char* json_resp; // null terminated, memory managed by ext_server
+    } ext_server_task_result_t;
+
+    // Initialize the server once per process
+    // err->id = 0 for success and err->msg[0] = '\0'
+    // err->id != 0 for failure, and err->msg contains error message
+    void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+
+    // Run the main loop, called once per init
+    void llama_server_start();
+    // Stop the main loop and free up resources allocated in init and start.  Init must be called again to reuse
+    void llama_server_stop();
+
+    // json_req null terminated string, memory managed by caller
+    // resp->id >= 0 on success (task ID)
+    // resp->id < 0 on error, and resp->msg contains error message
+    void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+    // Caller must call llama_server_release_task_result to free resp->json_resp
+    void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result);
+    void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+    void llama_server_release_task_result(ext_server_task_result_t *result);
+
+    // Caller must call llama_server_release_json_resp to free json_resp on success (err->id == 0); it is NULL on error
+    void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+    void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+    void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err);
+    void llama_server_release_json_resp(char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif // LLAMA_SERVER_LIBRARY
+\ No newline at end of file
+diff --git a/ggml-cuda.cu b/ggml-cuda.cu
+index f20846f..9640cf3 100644
+--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
+@@ -6757,6 +6757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
+         CUDA_CHECK(cudaGetDevice(&id));
+         src_ptr = (char *) extra->data_device[id];
+     } else {
+        fprintf(stderr, "ggml_cuda_cpy_tensor_2d assert: backend: %d\n", src->backend);
+         GGML_ASSERT(false);
+     }
+     char * dst_ptr = (char *) dst;
+-- 
+2.39.3 (Apple Git-145)
+
--- a/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
+++ b/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
-From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald <brucewmacdonald@gmail.com>
-Date: Mon, 28 Aug 2023 18:08:12 -0400
-Subject: [PATCH] add detokenize endpoint
-
---
- examples/server/server.cpp | 21 +++++++++++++++++++++
- 1 file changed, 21 insertions(+)
-
-diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 9966045..5014691 100644
--- a/examples/server/server.cpp
-+++ b/examples/server/server.cpp
-@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-         {"tokens", tokens}};
- }
- 
-+static json format_detokenized_response(std::string content)
-+{
-+    return json{
-+        {"content", content}};
-+}
-+
- static void parse_options_completion(const json &body, llama_server_context &llama)
- {
-     gpt_params default_params;
-@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
-         const json data = format_tokenizer_response(tokens);
-         return res.set_content(data.dump(), "application/json"); });
- 
-+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
-+             {
-+        auto lock = llama.lock();
-+
-+        const json body = json::parse(req.body);
-+        std::string content;
-+        if (body.count("tokens") != 0)
-+        {
-+            const std::vector<llama_token> tokens = body["tokens"];
-+            content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
-+        }
-+
-+        const json data = format_detokenized_response(content);
-+        return res.set_content(data.dump(), "application/json"); });
-+
-     svr.Post("/embedding", [&llama](const Request &req, Response &res)
-              {
-         auto lock = llama.lock();
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
+++ b/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch
-From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Wed, 20 Sep 2023 14:19:52 -0700
-Subject: [PATCH] copy cuda runtime libraries
-
---
- CMakeLists.txt | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
-             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
-         endif()
- 
-+        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
-+        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
-+        configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
-+
-     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-         # 52 == lowest CUDA 12 standard
-         # 60 == f16 CUDA intrinsics
-- 
-2.42.0
-
--- a/llm/llama.cpp/patches/0001-update-default-log-target.patch
+++ b/llm/llama.cpp/patches/0001-update-default-log-target.patch
-From 6465fec6290f0a7f5d4d0fbe6bcf634e4810dde6 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 23 Oct 2023 10:39:34 -0700
-Subject: [PATCH] default log stderr
-
---
- common/log.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/common/log.h b/common/log.h
-index b8953fd..25522cd 100644
--- a/common/log.h
-+++ b/common/log.h
-@@ -90,7 +90,7 @@
- //  }
- //
- #ifndef LOG_TARGET
-    #define LOG_TARGET log_handler()
-+    #define LOG_TARGET nullptr
- #endif
- 
- #ifndef LOG_TEE_TARGET
-- 
-2.42.0
-
--- a/llm/llama.cpp/patches/0002-34B-model-support.patch
+++ b/llm/llama.cpp/patches/0002-34B-model-support.patch
-From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald <brucewmacdonald@gmail.com>
-Date: Mon, 28 Aug 2023 18:08:53 -0400
-Subject: [PATCH] 34B model support
-
---
- llama.cpp | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/llama.cpp b/llama.cpp
-index f2cbe76..62c5cdf 100644
--- a/llama.cpp
-+++ b/llama.cpp
-@@ -79,6 +79,7 @@ enum e_model {
-     MODEL_7B,
-     MODEL_13B,
-     MODEL_30B,
-+    MODEL_34B,
-     MODEL_65B,
-     MODEL_70B,
- };
-@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
-         { MODEL_7B,   ((size_t) n_ctx / 16ull + 100ull) * MB },
-         { MODEL_13B,  ((size_t) n_ctx / 12ull + 120ull) * MB },
-         { MODEL_30B,  ((size_t) n_ctx /  9ull + 160ull) * MB },
-+        { MODEL_34B,  ((size_t) n_ctx / 9ull + 160ull) * MB },
-         { MODEL_65B,  ((size_t) n_ctx /  6ull + 256ull) * MB }, // guess
-         { MODEL_70B,  ((size_t) n_ctx /  7ull + 164ull) * MB },
-     };
-@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
-         { MODEL_7B,  160ull * MB },
-         { MODEL_13B, 192ull * MB },
-         { MODEL_30B, 256ull * MB },
-+        { MODEL_34B, 256ull * MB },
-         { MODEL_65B, 384ull * MB }, // guess
-         { MODEL_70B, 304ull * MB },
-     };
-@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
-         { MODEL_7B,  10ull * MB },
-         { MODEL_13B, 12ull * MB },
-         { MODEL_30B, 16ull * MB },
-+        { MODEL_34B, 16ull * MB },
-         { MODEL_65B, 24ull * MB }, // guess
-         { MODEL_70B, 24ull * MB },
-     };
-@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
-         { MODEL_7B,   512ull * kB },
-         { MODEL_13B,  640ull * kB },
-         { MODEL_30B,  768ull * kB },
-+        { MODEL_34B,  768ull * kB },
-         { MODEL_65B, 1280ull * kB },
-         { MODEL_70B, 1280ull * kB },
-     };
-@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
-         { MODEL_7B,  128ull },
-         { MODEL_13B, 160ull },
-         { MODEL_30B, 208ull },
-+        { MODEL_34B, 208ull },
-         { MODEL_65B, 256ull },
-         { MODEL_70B, 256ull },
-     };
-@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
-         case MODEL_7B: return "7B";
-         case MODEL_13B: return "13B";
-         case MODEL_30B: return "30B";
-+        case MODEL_34B: return "34B";
-         case MODEL_65B: return "65B";
-         case MODEL_70B: return "70B";
-         default: LLAMA_ASSERT(false);
-@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
-             case 26: model.type = e_model::MODEL_3B; break;
-             case 32: model.type = e_model::MODEL_7B; break;
-             case 40: model.type = e_model::MODEL_13B; break;
-+            case 48: model.type = e_model::MODEL_34B; break;
-             case 60: model.type = e_model::MODEL_30B; break;
-             case 80: model.type = e_model::MODEL_65B; break;
-             default:
-@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
-             LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-             model.type = e_model::MODEL_70B;
-             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
-+        } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
-+            hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
-         }
- 
-         hparams.rope_freq_base  = rope_freq_base;
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+++ b/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
-From: Shouzheng Liu <lshzh.hi@gmail.com>
-Date: Mon, 21 Aug 2023 06:59:29 -0400
-Subject: [PATCH] metal : fix synchronization in new matrix multiplication
- kernel (#2686)
-
---
- ggml-metal.metal | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/ggml-metal.metal b/ggml-metal.metal
-index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
-+++ b/ggml-metal.metal
-@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
-         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
-                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-         for (int i = 0; i < 8; i++) {
-+            threadgroup_barrier(mem_flags::mem_device);
-             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-         }
- 
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-+        threadgroup_barrier(mem_flags::mem_device);
-         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-         if (sgitg==0) {
-             for (int i = 0; i < n_rows; i++) {
-- 
-2.41.0
-