Merge pull request #65 from jmorganca/bindings

call llama.cpp directly from go

Merge pull request #65 from jmorganca/bindings
call llama.cpp directly from go
62620914 · Michael Yang · GitHub · abaf7d3b · 442dec1c · 62620914
Unverified Commit 62620914 authored Jul 11, 2023 by Michael Yang Committed by GitHub Jul 11, 2023
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -3,8 +3,5 @@
 .env
 .venv
 *.spec
-build
 dist
-__pycache__
 ollama
-ggml-metal.metal
--- a/Makefile
+++ b/Makefile
-default: ollama
-
-.PHONY: llama
-llama:
-	cmake -S llama -B llama/build -DLLAMA_METAL=on
-	cmake --build llama/build
-
-.PHONY: ollama
-ollama: llama
-	go build .
-
-.PHONY: app
-app: ollama
-	npm install --prefix app
-	npm run --prefix app make:sign
-
-clean:
-	go clean
-	rm -rf llama/build
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ ollama run ~/Downloads/vicuna-7b-v1.3.ggmlv3.q4_1.bin
 ## Building

 ```
-make
+go build .
 ```

 To run it start the server:

--- a/api/types.go
+++ b/api/types.go
 package api

+import "runtime"
+
 type PullRequest struct {
 	Model string `json:"model"`
 }
@@ -14,93 +16,76 @@ type GenerateRequest struct {
 	Model  string `json:"model"`
 	Prompt string `json:"prompt"`

-	ModelOptions   *ModelOptions   `json:"model_opts,omitempty"`
-	PredictOptions *PredictOptions `json:"predict_opts,omitempty"`
+	Options `json:"options"`
 }

-type ModelOptions struct {
-	ContextSize int    `json:"context_size,omitempty"`
-	Seed        int    `json:"seed,omitempty"`
-	NBatch      int    `json:"n_batch,omitempty"`
-	F16Memory   bool   `json:"memory_f16,omitempty"`
-	MLock       bool   `json:"mlock,omitempty"`
-	MMap        bool   `json:"mmap,omitempty"`
-	VocabOnly   bool   `json:"vocab_only,omitempty"`
-	LowVRAM     bool   `json:"low_vram,omitempty"`
-	Embeddings  bool   `json:"embeddings,omitempty"`
-	NUMA        bool   `json:"numa,omitempty"`
-	NGPULayers  int    `json:"gpu_layers,omitempty"`
-	MainGPU     string `json:"main_gpu,omitempty"`
-	TensorSplit string `json:"tensor_split,omitempty"`
+type GenerateResponse struct {
+	Response string `json:"response"`
 }

-type PredictOptions struct {
-	Seed        int     `json:"seed,omitempty"`
-	Threads     int     `json:"threads,omitempty"`
-	Tokens      int     `json:"tokens,omitempty"`
-	TopK        int     `json:"top_k,omitempty"`
-	Repeat      int     `json:"repeat,omitempty"`
-	Batch       int     `json:"batch,omitempty"`
-	NKeep       int     `json:"nkeep,omitempty"`
-	TopP        float64 `json:"top_p,omitempty"`
-	Temperature float64 `json:"temp,omitempty"`
-	Penalty     float64 `json:"penalty,omitempty"`
-	F16KV       bool
-	DebugMode   bool
-	StopPrompts []string
-	IgnoreEOS   bool `json:"ignore_eos,omitempty"`
-
-	TailFreeSamplingZ float64 `json:"tfs_z,omitempty"`
-	TypicalP          float64 `json:"typical_p,omitempty"`
-	FrequencyPenalty  float64 `json:"freq_penalty,omitempty"`
-	PresencePenalty   float64 `json:"pres_penalty,omitempty"`
-	Mirostat          int     `json:"mirostat,omitempty"`
-	MirostatETA       float64 `json:"mirostat_lr,omitempty"`
-	MirostatTAU       float64 `json:"mirostat_ent,omitempty"`
-	PenalizeNL        bool    `json:"penalize_nl,omitempty"`
-	LogitBias         string  `json:"logit_bias,omitempty"`
-
-	PathPromptCache string
-	MLock           bool `json:"mlock,omitempty"`
-	MMap            bool `json:"mmap,omitempty"`
-	PromptCacheAll  bool
-	PromptCacheRO   bool
-	MainGPU         string
-	TensorSplit     string
-}
+type Options struct {
+	Seed int `json:"seed,omitempty"`

-var DefaultModelOptions ModelOptions = ModelOptions{
-	ContextSize: 512,
-	Seed:        0,
-	F16Memory:   true,
-	MLock:       false,
-	Embeddings:  true,
-	MMap:        true,
-	LowVRAM:     false,
-}
+	// Backend options
+	UseNUMA bool `json:"numa,omitempty"`
+
+	// Model options
+	NumCtx        int  `json:"num_ctx,omitempty"`
+	NumBatch      int  `json:"num_batch,omitempty"`
+	NumGPU        int  `json:"num_gpu,omitempty"`
+	MainGPU       int  `json:"main_gpu,omitempty"`
+	LowVRAM       bool `json:"low_vram,omitempty"`
+	F16KV         bool `json:"f16_kv,omitempty"`
+	LogitsAll     bool `json:"logits_all,omitempty"`
+	VocabOnly     bool `json:"vocab_only,omitempty"`
+	UseMMap       bool `json:"use_mmap,omitempty"`
+	UseMLock      bool `json:"use_mlock,omitempty"`
+	EmbeddingOnly bool `json:"embedding_only,omitempty"`
+
+	// Predict options
+	RepeatLastN      int     `json:"repeat_last_n,omitempty"`
+	RepeatPenalty    float32 `json:"repeat_penalty,omitempty"`
+	FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
+	PresencePenalty  float32 `json:"presence_penalty,omitempty"`
+	Temperature      float32 `json:"temperature,omitempty"`
+	TopK             int     `json:"top_k,omitempty"`
+	TopP             float32 `json:"top_p,omitempty"`
+	TFSZ             float32 `json:"tfs_z,omitempty"`
+	TypicalP         float32 `json:"typical_p,omitempty"`
+	Mirostat         int     `json:"mirostat,omitempty"`
+	MirostatTau      float32 `json:"mirostat_tau,omitempty"`
+	MirostatEta      float32 `json:"mirostat_eta,omitempty"`

-var DefaultPredictOptions PredictOptions = PredictOptions{
-	Seed:              -1,
-	Threads:           -1,
-	Tokens:            512,
-	Penalty:           1.1,
-	Repeat:            64,
-	Batch:             512,
-	NKeep:             64,
-	TopK:              90,
-	TopP:              0.86,
-	TailFreeSamplingZ: 1.0,
-	TypicalP:          1.0,
-	Temperature:       0.8,
-	FrequencyPenalty:  0.0,
-	PresencePenalty:   0.0,
-	Mirostat:          0,
-	MirostatTAU:       5.0,
-	MirostatETA:       0.1,
-	MMap:              true,
-	StopPrompts:       []string{"llama"},
+	NumThread int `json:"num_thread,omitempty"`
 }

-type GenerateResponse struct {
-	Response string `json:"response"`
+func DefaultOptions() Options {
+	return Options{
+		Seed: -1,
+
+		UseNUMA: false,
+
+		NumCtx:   512,
+		NumBatch: 512,
+		NumGPU:   1,
+		LowVRAM:  false,
+		F16KV:    true,
+		UseMMap:  true,
+		UseMLock: false,
+
+		RepeatLastN:      512,
+		RepeatPenalty:    1.1,
+		FrequencyPenalty: 0.0,
+		PresencePenalty:  0.0,
+		Temperature:      0.8,
+		TopK:             40,
+		TopP:             0.9,
+		TFSZ:             1.0,
+		TypicalP:         1.0,
+		Mirostat:         0,
+		MirostatTau:      5.0,
+		MirostatEta:      0.1,
+
+		NumThread: runtime.NumCPU(),
+	}
 }
--- a/go.mod
+++ b/go.mod
@@ -39,6 +39,7 @@ require (
 	golang.org/x/arch v0.3.0 // indirect
 	golang.org/x/crypto v0.10.0 // indirect
 	golang.org/x/net v0.10.0 // indirect
+	golang.org/x/sync v0.3.0
 	golang.org/x/sys v0.10.0 // indirect
 	golang.org/x/term v0.10.0
 	golang.org/x/text v0.10.0 // indirect

--- a/go.sum
+++ b/go.sum
@@ -99,6 +99,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

--- a/llama/.gitignore
+++ b/llama/.gitignore
-build
\ No newline at end of file
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
-cmake_minimum_required(VERSION 3.12)
-project(binding)
-
-include(FetchContent)
-
-FetchContent_Declare(
-    llama_cpp
-    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-    GIT_TAG        55dbb91
-)
-
-FetchContent_MakeAvailable(llama_cpp)
-
-add_library(binding ${CMAKE_CURRENT_SOURCE_DIR}/binding/binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp)
-target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR}/examples)
-target_link_libraries(binding llama ggml_static)
-
-if (LLAMA_METAL)
-    configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_CURRENT_BINARY_DIR}/../../ggml-metal.metal COPYONLY)
-endif()
-
-add_custom_target(copy_libllama ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:llama> ${CMAKE_CURRENT_BINARY_DIR})
-add_custom_target(copy_libggml_static ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:ggml_static> ${CMAKE_CURRENT_BINARY_DIR})
--- a/llama/binding/binding.cpp
+++ b/llama/binding/binding.cpp
--- a/llama/binding/binding.h
+++ b/llama/binding/binding.h
-// MIT License
-
-// Copyright (c) 2023 go-skynet authors
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all
-// copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#ifdef __cplusplus
-
-extern "C" {
-
-#endif
-
-#include <stdbool.h>
-
-extern unsigned char tokenCallback(void *, char *);
-
-int load_state(void *ctx, char *statefile, char *modes);
-
-int eval(void *params_ptr, void *ctx, char *text);
-
-void save_state(void *ctx, char *dst, char *modes);
-
-void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
-                 bool mlock, bool embeddings, bool mmap, bool low_vram,
-                 bool vocab_only, int n_gpu, int n_batch, const char *maingpu,
-                 const char *tensorsplit, bool numa);
-
-int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings);
-
-int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
-                         int tokenSize, float *res_embeddings);
-
-void *llama_allocate_params(
-    const char *prompt, int seed, int threads, int tokens, int top_k,
-    float top_p, float temp, float repeat_penalty, int repeat_last_n,
-    bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
-    const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
-    float frequency_penalty, float presence_penalty, int mirostat,
-    float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
-    const char *tensorsplit);
-
-void llama_free_params(void *params_ptr);
-
-void llama_binding_free_model(void *state);
-
-int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug);
-
-#ifdef __cplusplus
-
-}
-
-#endif
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_CUDA_MAX_DEVICES       16
+
+void   ggml_init_cublas(void);
+void   ggml_cuda_set_tensor_split(const float * tensor_split);
+
+void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+// TODO: export these with GGML_API
+void * ggml_cuda_host_malloc(size_t size);
+void   ggml_cuda_host_free(void * ptr);
+
+void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+void   ggml_cuda_set_main_device(int main_device);
+void   ggml_cuda_set_scratch_size(size_t scratch_size);
+void   ggml_cuda_free_scratch(void);
+bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/llama/ggml-metal.h
+++ b/llama/ggml-metal.h
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// An interface for computing a ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_metal_context;
+
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size,
+                           size_t   max_size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/llama/ggml-metal.m
+++ b/llama/ggml-metal.m
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
--- a/llama/ggml.c
+++ b/llama/ggml.c
--- a/llama/ggml.h
+++ b/llama/ggml.h
--- a/llama/k_quants.c
+++ b/llama/k_quants.c
--- a/llama/k_quants.h
+++ b/llama/k_quants.h
+/**
+ * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml.h"
+
+#include <stdint.h>
+#include <assert.h>
+#include <stddef.h>
+
+// Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
+
+//
+// Super-block quantization structures
+//
+
+// 2-bit quantization
+// weight is represented as x = a * q + b
+// 16 blocks of 16 elements each
+// Effectively 2.625 bits per weight
+typedef struct {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    ggml_fp16_t d;           // super-block scale for quantized scales
+    ggml_fp16_t dmin;        // super-block scale for quantized mins
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+// 3-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[2];
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[12];        // scales, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
+
+// 4-bit quantization
+// 8 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d[2];          // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4-bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
+typedef struct {
+    ggml_fp16_t d;             // super-block scale for quantized scales
+    ggml_fp16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
+
+// 5-bit quantization
+// 8 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d;               // super-block scale
+    int8_t  scales[QK_K/16];     // 8-bit block scales
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+    ggml_fp16_t d;               // super-block scale for quantized scales
+    ggml_fp16_t dmin;            // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
+
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    ggml_fp16_t d;           // super-block scale
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
+
+// This is only used for intermediate quantization and dot products
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK_K];       // quants
+    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
+} block_q8_K;
+static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
+
+
+// Quantization
+void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
+void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
+void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
+void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
+void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
+void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+
+void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+
+// Dequantization
+void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
+void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
+void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
+void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
+void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+
+// Dot product
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+// Quantization with histogram collection
+size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
--- a/llama/llama-util.h
+++ b/llama/llama-util.h