update ollama_use_funcs

7926888f · wangkx1 · 28e95ed5 · 28e95ed5 · 28e95ed5 · 28e95ed5
Commit 7926888f authored Aug 28, 2024 by wangkx1
20 changed files
--- a/ollama/llm/ggml.go
+++ b/ollama/llm/ggml.go
-package llm
-import (
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-	"strings"
-	"github.com/ollama/ollama/util/bufioutil"
-)
-type GGML struct {
-	container
-	model
-}
-type model interface {
-	KV() KV
-	Tensors() Tensors
-}
-type KV map[string]any
-func (kv KV) u64(key string) uint64 {
-	switch v := kv[key].(type) {
-	case uint64:
-		return v
-	case uint32:
-		return uint64(v)
-	case float64:
-		return uint64(v)
-	default:
-		return 0
-	}
-}
-func (kv KV) Architecture() string {
-	if s, ok := kv["general.architecture"].(string); ok {
-		return s
-	}
-	return "unknown"
-}
-func (kv KV) ParameterCount() uint64 {
-	return kv.u64("general.parameter_count")
-}
-func (kv KV) FileType() fileType {
-	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
-	}
-	return fileTypeUnknown
-}
-func (kv KV) BlockCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
-}
-func (kv KV) HeadCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
-}
-func (kv KV) HeadCountKV() uint64 {
-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
-		return headCountKV
-	}
-	return 1
-}
-func (kv KV) EmbeddingHeadCount() uint64 {
-	if heads := kv.HeadCount(); heads > 0 {
-		return kv.EmbeddingLength() / kv.HeadCount()
-	}
-	return 0
-}
-func (kv KV) EmbeddingHeadCountK() uint64 {
-	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
-		return k
-	}
-	return kv.EmbeddingHeadCount()
-}
-func (kv KV) EmbeddingHeadCountV() uint64 {
-	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
-		return v
-	}
-	return kv.EmbeddingHeadCount()
-}
-func (kv KV) GQA() uint64 {
-	return kv.HeadCount() / kv.HeadCountKV()
-}
-func (kv KV) EmbeddingLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
-}
-func (kv KV) ContextLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
-}
-func (kv KV) ChatTemplate() string {
-	s, _ := kv["tokenizer.chat_template"].(string)
-	return s
-}
-type Tensors struct {
-	Items  []*Tensor
-	Offset uint64
-}
-func (ts Tensors) Layers() map[string]Layer {
-	layers := make(map[string]Layer)
-	for _, t := range ts.Items {
-		parts := strings.Split(t.Name, ".")
-		if parts[0] == "blk" {
-			// join first and second part, e.g. blk.%d
-			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
-		}
-		if _, ok := layers[parts[0]]; !ok {
-			layers[parts[0]] = make(Layer)
-		}
-		layers[parts[0]][strings.Join(parts[1:], ".")] = t
-	}
-	return layers
-}
-type Layer map[string]*Tensor
-func (l Layer) size() (size uint64) {
-	for _, t := range l {
-		size += t.Size()
-	}
-	return size
-}
-type Tensor struct {
-	Name   string `json:"name"`
-	Kind   uint32 `json:"kind"`
-	Offset uint64 `json:"-"`
-	// Shape is the number of elements in each dimension
-	Shape []uint64 `json:"shape"`
-	io.WriterTo `json:"-"`
-}
-func (t Tensor) blockSize() uint64 {
-	switch t.Kind {
-	case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
-		return 1
-	case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL
-		return 32
-	default: // All others
-		return 256
-	}
-}
-func (t Tensor) typeSize() uint64 {
-	blockSize := t.blockSize()
-	switch t.Kind {
-	case 0: // FP32
-		return 4
-	case 1: // FP16
-		return 2
-	case 2: // Q4_0
-		return 2 + blockSize/2
-	case 3: // Q4_1
-		return 2 + 2 + blockSize/2
-	case 6: // Q5_0
-		return 2 + 4 + blockSize/2
-	case 7: // Q5_1
-		return 2 + 2 + 4 + blockSize/2
-	case 8: // Q8_0
-		return 2 + blockSize
-	case 9: // Q8_1
-		return 4 + 4 + blockSize
-	case 10: // Q2_K
-		return blockSize/16 + blockSize/4 + 2 + 2
-	case 11: // Q3_K
-		return blockSize/8 + blockSize/4 + 12 + 2
-	case 12: // Q4_K
-		return 2 + 2 + 12 + blockSize/2
-	case 13: // Q5_K
-		return 2 + 2 + 12 + blockSize/8 + blockSize/2
-	case 14: // Q6_K
-		return blockSize/2 + blockSize/4 + blockSize/16 + 2
-	case 15: // Q8_K
-		return 2 + blockSize + 2*blockSize/16
-	case 16: // IQ2_XXS
-		return 2 + 2*blockSize/8
-	case 17: // IQ2_XS
-		return 2 + 2*blockSize/8 + blockSize/32
-	case 18: // IQ3_XXS
-		return 2 + blockSize/4 + blockSize/8
-	case 19: // IQ1_S
-		return 2 + blockSize/8 + blockSize/16
-	case 20: // IQ4_NL
-		return 2 + blockSize/2
-	case 21: // IQ3_S
-		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
-	case 22: // IQ2_S
-		return 2 + blockSize/4 + blockSize/16
-	case 23: // IQ4_XS
-		return 2 + 2 + blockSize/2 + blockSize/64
-	case 24: // I8
-		return 1
-	case 25: // I16
-		return 2
-	case 26: // I32
-		return 4
-	case 27: // I64
-		return 8
-	case 28: // F64
-		return 8
-	case 29: // IQ1_M
-		return blockSize/8 + blockSize/16 + blockSize/32
-	default:
-		return 0
-	}
-}
-func (t Tensor) parameters() uint64 {
-	var count uint64 = 1
-	for _, n := range t.Shape {
-		count *= n
-	}
-	return count
-}
-func (t Tensor) Size() uint64 {
-	return t.parameters() * t.typeSize() / t.blockSize()
-}
-type container interface {
-	Name() string
-	Decode(io.ReadSeeker) (model, error)
-}
-const (
-	// Magic constant for `ggml` files (unversioned).
-	FILE_MAGIC_GGML = 0x67676d6c
-	// Magic constant for `ggml` files (versioned, ggmf).
-	FILE_MAGIC_GGMF = 0x67676d66
-	// Magic constant for `ggml` files (versioned, ggjt).
-	FILE_MAGIC_GGJT = 0x67676a74
-	// Magic constant for `ggla` files (LoRA adapter).
-	FILE_MAGIC_GGLA = 0x67676C61
-	// Magic constant for `gguf` files (versioned, gguf)
-	FILE_MAGIC_GGUF_LE = 0x46554747
-	FILE_MAGIC_GGUF_BE = 0x47475546
-)
-var ErrUnsupportedFormat = errors.New("unsupported model format")
-func DetectGGMLType(b []byte) string {
-	switch binary.LittleEndian.Uint32(b[:4]) {
-	case FILE_MAGIC_GGML:
-		return "ggml"
-	case FILE_MAGIC_GGMF:
-		return "ggmf"
-	case FILE_MAGIC_GGJT:
-		return "ggjt"
-	case FILE_MAGIC_GGLA:
-		return "ggla"
-	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
-		return "gguf"
-	default:
-		return ""
-	}
-}
-// DecodeGGML decodes a GGML model from the given reader.
-//
-// It collects array values for arrays with a size less than or equal to
-// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
-// the maxArraySize is negative, all arrays are collected.
-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
-	if maxArraySize == 0 {
-		maxArraySize = 1024
-	}
-	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
-	var magic uint32
-	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
-	}
-	var c container
-	switch magic {
-	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
-	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
-	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
-	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
-	default:
-		return nil, 0, errors.New("invalid file magic")
-	}
-	model, err := c.Decode(rs)
-	if err != nil {
-		return nil, 0, err
-	}
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return nil, 0, err
-	}
-	// final model type
-	return &GGML{
-		container: c,
-		model:     model,
-	}, offset, nil
-}
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
-	embeddingHeads := llm.KV().EmbeddingHeadCount()
-	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	layers := llm.Tensors().Layers()
-	switch llm.KV().Architecture() {
-	case "llama":
-		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
-		partialOffload = 4 * batch * embedding
-		partialOffload += max(
-			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
-			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-		)
-		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
-			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
-			partialOffload = max(
-				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
-				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
-			)
-		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
-			// mixtral 8x7b
-			ffnGateWeight1 := ffnGateWeight.Shape[1]
-			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
-			partialOffload = max(
-				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
-				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
-			)
-		}
-	case "gemma", "gemma2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
-		)
-		partialOffload = max(
-			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
-			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
-				4*embeddingHeadsK*context*8+
-				embedding*embeddingHeadsK*heads*9/16,
-		)
-	case "command-r":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(2+4*embedding+context*(1+heads)),
-		)
-		partialOffload = max(
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
-		)
-	case "qwen2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(1+2*embedding+context+context*heads),
-		)
-		partialOffload = max(
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
-		)
-	case "phi2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(1+4*embedding+context+context*heads),
-		)
-		partialOffload = max(
-			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2+3*embedding+context+context*heads),
-		)
-	case "stablelm":
-		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
-		partialOffload = max(
-			4*batch*(vocab+2*embedding),
-			fullOffload,
-		)
-	case "deepseek2":
-		fullOffload = max(
-			4*batch*(3*embedding+vocab),
-			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
-		)
-		partialOffload = max(
-			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
-		)
-	case "chatglm":
-		fullOffload = 4 * batch * (embedding + vocab)
-		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
-		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
-			fullOffload = max(
-				fullOffload,
-				4*batch*(2+
-					2*embedding+
-					context+
-					context*heads+
-					embeddingHeadsK*heads+
-					qkvBias.Shape[0]),
-			)
-			partialOffload = max(
-				partialOffload,
-				4*batch*(1+
-					2*embedding+
-					embeddingHeadsK*heads+
-					context+
-					context*heads)+
-					4*embeddingHeadsK*context+
-					4*context*embeddingHeadsK+
-					4*qkvBias.Shape[0],
-			)
-		}
-	}
-	return
-}
--- a/ollama/llm/ggml_test.go
+++ b/ollama/llm/ggml_test.go
-package llm
--- a/ollama/llm/gguf.go
+++ b/ollama/llm/gguf.go
-package llm
-import (
-	"bytes"
-	"cmp"
-	"encoding/binary"
-	"encoding/json"
-	"fmt"
-	"io"
-	"log/slog"
-	"slices"
-	"strings"
-	"golang.org/x/exp/maps"
-)
-type containerGGUF struct {
-	ByteOrder binary.ByteOrder
-	Version uint32
-	V1 struct {
-		NumTensor uint32
-		NumKV     uint32
-	}
-	V2 struct {
-		NumTensor uint64
-		NumKV     uint64
-	}
-	V3 struct {
-		NumTensor uint64
-		NumKV     uint64
-	}
-	maxArraySize int
-}
-func (c *containerGGUF) canCollectArray(size int) bool {
-	return c.maxArraySize < 0 || size <= c.maxArraySize
-}
-func (c *containerGGUF) Name() string {
-	return "gguf"
-}
-func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
-	if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil {
-		return nil, err
-	}
-	var err error
-	switch c.Version {
-	case 1:
-		err = binary.Read(rs, c.ByteOrder, &c.V1)
-	case 2:
-		err = binary.Read(rs, c.ByteOrder, &c.V2)
-	default:
-		err = binary.Read(rs, c.ByteOrder, &c.V3)
-	}
-	if err != nil {
-		return nil, err
-	}
-	model := newGGUF(c)
-	if err := model.Decode(rs); err != nil {
-		return nil, err
-	}
-	return model, nil
-}
-const (
-	ggufTypeUint8 uint32 = iota
-	ggufTypeInt8
-	ggufTypeUint16
-	ggufTypeInt16
-	ggufTypeUint32
-	ggufTypeInt32
-	ggufTypeFloat32
-	ggufTypeBool
-	ggufTypeString
-	ggufTypeArray
-	ggufTypeUint64
-	ggufTypeInt64
-	ggufTypeFloat64
-)
-type gguf struct {
-	*containerGGUF
-	kv      KV
-	tensors []*Tensor
-	parameters   uint64
-	tensorOffset uint64
-	scratch [16 << 10]byte
-}
-func newGGUF(container *containerGGUF) *gguf {
-	return &gguf{
-		containerGGUF: container,
-		kv:            make(KV),
-	}
-}
-func (llm *gguf) KV() KV {
-	return llm.kv
-}
-func (llm *gguf) Tensors() Tensors {
-	return Tensors{
-		Items:  llm.tensors,
-		Offset: llm.tensorOffset,
-	}
-}
-func (llm *gguf) numTensor() uint64 {
-	switch llm.Version {
-	case 1:
-		return uint64(llm.V1.NumTensor)
-	case 2:
-		return llm.V2.NumTensor
-	default:
-		return llm.V3.NumTensor
-	}
-}
-func (llm *gguf) numKV() uint64 {
-	switch llm.Version {
-	case 1:
-		return uint64(llm.V1.NumKV)
-	case 2:
-		return llm.V2.NumKV
-	default:
-		return llm.V3.NumKV
-	}
-}
-func (llm *gguf) Decode(rs io.ReadSeeker) error {
-	// decode key-values
-	for i := 0; uint64(i) < llm.numKV(); i++ {
-		k, err := readGGUFString(llm, rs)
-		if err != nil {
-			return err
-		}
-		t, err := readGGUF[uint32](llm, rs)
-		if err != nil {
-			return err
-		}
-		var v any
-		switch t {
-		case ggufTypeUint8:
-			v, err = readGGUF[uint8](llm, rs)
-		case ggufTypeInt8:
-			v, err = readGGUF[int8](llm, rs)
-		case ggufTypeUint16:
-			v, err = readGGUF[uint16](llm, rs)
-		case ggufTypeInt16:
-			v, err = readGGUF[int16](llm, rs)
-		case ggufTypeUint32:
-			v, err = readGGUF[uint32](llm, rs)
-		case ggufTypeInt32:
-			v, err = readGGUF[int32](llm, rs)
-		case ggufTypeUint64:
-			v, err = readGGUF[uint64](llm, rs)
-		case ggufTypeInt64:
-			v, err = readGGUF[int64](llm, rs)
-		case ggufTypeFloat32:
-			v, err = readGGUF[float32](llm, rs)
-		case ggufTypeFloat64:
-			v, err = readGGUF[float64](llm, rs)
-		case ggufTypeBool:
-			v, err = readGGUF[bool](llm, rs)
-		case ggufTypeString:
-			v, err = readGGUFString(llm, rs)
-		case ggufTypeArray:
-			v, err = readGGUFArray(llm, rs)
-		default:
-			return fmt.Errorf("invalid type: %d", t)
-		}
-		if err != nil {
-			return err
-		}
-		llm.kv[k] = v
-	}
-	// decode tensors
-	for range llm.numTensor() {
-		name, err := readGGUFString(llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor name: %w", err)
-		}
-		// dims is the number of dimensions in the tensor
-		dims, err := readGGUF[uint32](llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor dimensions: %w", err)
-		}
-		shape := make([]uint64, dims)
-		for i := 0; uint32(i) < dims; i++ {
-			shape[i], err = readGGUF[uint64](llm, rs)
-			if err != nil {
-				return fmt.Errorf("failed to read tensor shape: %w", err)
-			}
-		}
-		kind, err := readGGUF[uint32](llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor kind: %w", err)
-		}
-		offset, err := readGGUF[uint64](llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor offset: %w", err)
-		}
-		tensor := Tensor{
-			Name:   name,
-			Kind:   kind,
-			Offset: offset,
-			Shape:  shape[:],
-		}
-		llm.tensors = append(llm.tensors, &tensor)
-		llm.parameters += tensor.parameters()
-	}
-	// patch KV with parameter count
-	llm.kv["general.parameter_count"] = llm.parameters
-	alignment, ok := llm.kv["general.alignment"].(uint32)
-	if !ok {
-		alignment = 32
-	}
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-	padding := ggufPadding(offset, int64(alignment))
-	llm.tensorOffset = uint64(offset + padding)
-	for _, tensor := range llm.tensors {
-		offset, err := rs.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return fmt.Errorf("failed to get current offset: %w", err)
-		}
-		padding := ggufPadding(offset, int64(alignment))
-		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-			return fmt.Errorf("failed to seek to init padding: %w", err)
-		}
-		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
-			return fmt.Errorf("failed to seek to tensor: %w", err)
-		}
-	}
-	return nil
-}
-func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
-	var t T
-	err := binary.Read(r, llm.ByteOrder, &t)
-	return t, err
-}
-func writeGGUF[V any](w io.Writer, t uint32, v V) error {
-	if err := binary.Write(w, binary.LittleEndian, t); err != nil {
-		return err
-	}
-	return binary.Write(w, binary.LittleEndian, v)
-}
-func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
-	var length uint64
-	if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
-		return "", err
-	}
-	var b bytes.Buffer
-	if _, err := io.CopyN(&b, r, int64(length)); err != nil {
-		return "", err
-	}
-	// gguf v1 strings are null-terminated
-	b.Truncate(b.Len() - 1)
-	return b.String(), nil
-}
-func discardGGUFString(llm *gguf, r io.Reader) error {
-	buf := llm.scratch[:8]
-	_, err := io.ReadFull(r, buf)
-	if err != nil {
-		return err
-	}
-	size := int(llm.ByteOrder.Uint64(buf))
-	for size > 0 {
-		n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
-		if err != nil {
-			return err
-		}
-		size -= n
-	}
-	return nil
-}
-func readGGUFString(llm *gguf, r io.Reader) (string, error) {
-	if llm.Version == 1 {
-		return readGGUFV1String(llm, r)
-	}
-	buf := llm.scratch[:8]
-	_, err := io.ReadFull(r, buf)
-	if err != nil {
-		return "", err
-	}
-	length := int(llm.ByteOrder.Uint64(buf))
-	if length > len(llm.scratch) {
-		buf = make([]byte, length)
-	} else {
-		buf = llm.scratch[:length]
-	}
-	clear(buf)
-	_, err = io.ReadFull(r, buf)
-	if err != nil {
-		return "", err
-	}
-	return string(buf), nil
-}
-func writeGGUFString(w io.Writer, s string) error {
-	if err := binary.Write(w, binary.LittleEndian, ggufTypeString); err != nil {
-		return err
-	}
-	if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
-		return err
-	}
-	_, err := io.Copy(w, strings.NewReader(s))
-	return err
-}
-type array struct {
-	size   int
-	values []any
-}
-func (a *array) MarshalJSON() ([]byte, error) {
-	return json.Marshal(a.values)
-}
-func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
-	t, err := readGGUF[uint32](llm, r)
-	if err != nil {
-		return nil, err
-	}
-	n, err := readGGUF[uint32](llm, r)
-	if err != nil {
-		return nil, err
-	}
-	a := &array{size: int(n)}
-	if llm.canCollectArray(int(n)) {
-		a.values = make([]any, 0, int(n))
-	}
-	for i := range n {
-		var e any
-		switch t {
-		case ggufTypeUint8:
-			e, err = readGGUF[uint8](llm, r)
-		case ggufTypeInt8:
-			e, err = readGGUF[int8](llm, r)
-		case ggufTypeUint16:
-			e, err = readGGUF[uint16](llm, r)
-		case ggufTypeInt16:
-			e, err = readGGUF[int16](llm, r)
-		case ggufTypeUint32:
-			e, err = readGGUF[uint32](llm, r)
-		case ggufTypeInt32:
-			e, err = readGGUF[int32](llm, r)
-		case ggufTypeUint64:
-			e, err = readGGUF[uint64](llm, r)
-		case ggufTypeInt64:
-			e, err = readGGUF[int64](llm, r)
-		case ggufTypeFloat32:
-			e, err = readGGUF[float32](llm, r)
-		case ggufTypeFloat64:
-			e, err = readGGUF[float64](llm, r)
-		case ggufTypeBool:
-			e, err = readGGUF[bool](llm, r)
-		case ggufTypeString:
-			e, err = readGGUFV1String(llm, r)
-		default:
-			return nil, fmt.Errorf("invalid array type: %d", t)
-		}
-		if err != nil {
-			return nil, err
-		}
-		if a.values != nil {
-			a.values[i] = e
-		}
-	}
-	return a, nil
-}
-func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
-	if llm.Version == 1 {
-		return readGGUFV1Array(llm, r)
-	}
-	t, err := readGGUF[uint32](llm, r)
-	if err != nil {
-		return nil, err
-	}
-	n, err := readGGUF[uint64](llm, r)
-	if err != nil {
-		return nil, err
-	}
-	a := &array{size: int(n)}
-	if llm.canCollectArray(int(n)) {
-		a.values = make([]any, int(n))
-	}
-	for i := range n {
-		var e any
-		switch t {
-		case ggufTypeUint8:
-			e, err = readGGUF[uint8](llm, r)
-		case ggufTypeInt8:
-			e, err = readGGUF[int8](llm, r)
-		case ggufTypeUint16:
-			e, err = readGGUF[uint16](llm, r)
-		case ggufTypeInt16:
-			e, err = readGGUF[int16](llm, r)
-		case ggufTypeUint32:
-			e, err = readGGUF[uint32](llm, r)
-		case ggufTypeInt32:
-			e, err = readGGUF[int32](llm, r)
-		case ggufTypeUint64:
-			e, err = readGGUF[uint64](llm, r)
-		case ggufTypeInt64:
-			e, err = readGGUF[int64](llm, r)
-		case ggufTypeFloat32:
-			e, err = readGGUF[float32](llm, r)
-		case ggufTypeFloat64:
-			e, err = readGGUF[float64](llm, r)
-		case ggufTypeBool:
-			e, err = readGGUF[bool](llm, r)
-		case ggufTypeString:
-			if a.values != nil {
-				e, err = readGGUFString(llm, r)
-			} else {
-				err = discardGGUFString(llm, r)
-			}
-		default:
-			return nil, fmt.Errorf("invalid array type: %d", t)
-		}
-		if err != nil {
-			return nil, err
-		}
-		if a.values != nil {
-			a.values[i] = e
-		}
-	}
-	return a, nil
-}
-// writeGGUFArray writes a slice s of type E to the write with a gguf type of t
-func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
-	if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil {
-		return err
-	}
-	if err := binary.Write(w, binary.LittleEndian, t); err != nil {
-		return err
-	}
-	if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
-		return err
-	}
-	return binary.Write(w, binary.LittleEndian, s)
-}
-func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
-	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
-		return err
-	}
-	keys := maps.Keys(kv)
-	slices.Sort(keys)
-	for _, key := range keys {
-		if err := ggufWriteKV(ws, key, kv[key]); err != nil {
-			return err
-		}
-	}
-	slices.SortFunc(ts, func(a, b Tensor) int {
-		var i, j int
-		if n, err := fmt.Sscanf(a.Name, "blk.%d", &i); err != nil || n != 1 {
-			return cmp.Compare(a.Name, b.Name)
-		} else if n, err := fmt.Sscanf(b.Name, "blk.%d", &j); err != nil || n != 1 {
-			return cmp.Compare(a.Name, b.Name)
-		}
-		return cmp.Compare(i, j)
-	})
-	var s uint64
-	for _, t := range ts {
-		t.Offset = s
-		if err := ggufWriteTensorInfo(ws, t); err != nil {
-			return err
-		}
-		s += t.Size()
-	}
-	var alignment int64 = 32
-	for _, t := range ts {
-		if err := ggufWriteTensor(ws, t, alignment); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
-	slog.Debug(k, "type", fmt.Sprintf("%T", v))
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(k))); err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, []byte(k)); err != nil {
-		return err
-	}
-	var err error
-	switch v := v.(type) {
-	case uint32:
-		err = writeGGUF(ws, ggufTypeUint32, v)
-	case float32:
-		err = writeGGUF(ws, ggufTypeFloat32, v)
-	case bool:
-		err = writeGGUF(ws, ggufTypeBool, v)
-	case string:
-		err = writeGGUFString(ws, v)
-	case []int32:
-		err = writeGGUFArray(ws, ggufTypeInt32, v)
-	case []uint32:
-		err = writeGGUFArray(ws, ggufTypeUint32, v)
-	case []float32:
-		err = writeGGUFArray(ws, ggufTypeFloat32, v)
-	case []string:
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
-			return err
-		}
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
-			return err
-		}
-		if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, e := range v {
-			if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
-				return err
-			}
-			if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
-				return err
-			}
-		}
-	default:
-		return fmt.Errorf("improper type for '%s'", k)
-	}
-	return err
-}
-func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
-	slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Shape))); err != nil {
-		return err
-	}
-	for i := range len(t.Shape) {
-		if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil {
-			return err
-		}
-	}
-	if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil {
-		return err
-	}
-	return binary.Write(ws, binary.LittleEndian, t.Offset)
-}
-func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
-	offset, err := ws.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-	if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
-		return err
-	}
-	_, err = t.WriteTo(ws)
-	return err
-}
-func ggufPadding(offset, align int64) int64 {
-	return (align - offset%align) % align
-}
--- a/ollama/llm/llama.cpp/.clang-tidy
+++ b/ollama/llm/llama.cpp/.clang-tidy
---
-Checks: >
-    bugprone-*,
-    -bugprone-easily-swappable-parameters,
-    -bugprone-implicit-widening-of-multiplication-result,
-    -bugprone-misplaced-widening-cast,
-    -bugprone-narrowing-conversions,
-    readability-*,
-    -readability-avoid-unconditional-preprocessor-if,
-    -readability-function-cognitive-complexity,
-    -readability-identifier-length,
-    -readability-implicit-bool-conversion,
-    -readability-magic-numbers,
-    -readability-uppercase-literal-suffix,
-    -readability-simplify-boolean-expr,
-    clang-analyzer-*,
-    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
-    performance-*,
-    portability-*,
-    misc-*,
-    -misc-const-correctness,
-    -misc-non-private-member-variables-in-classes,
-    -misc-no-recursion,
-FormatStyle: none
--- a/ollama/llm/llama.cpp/.devops/cloud-v-pipeline
+++ b/ollama/llm/llama.cpp/.devops/cloud-v-pipeline
-node('x86_runner1'){            // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
-    stage('Cleanup'){
-        cleanWs()               // Cleaning previous CI build in workspace
-    }
-    stage('checkout repo'){
-        retry(5){               // Retry if the cloning fails due to some reason
-            checkout scm        // Clone the repo on Runner
-        }
-    }
-    stage('Compiling llama.cpp'){
-        sh'''#!/bin/bash
-            make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
-        '''
-    }
-    stage('Running llama.cpp'){
-        sh'''#!/bin/bash
-            module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc
-            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
-            cat llama_log.txt                   # Printing results
-        '''
-    }
-}
--- a/ollama/llm/llama.cpp/.devops/full-cuda.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/full-cuda.Dockerfile
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-WORKDIR /app
-COPY . .
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN make -j$(nproc)
-ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/ollama/llm/llama.cpp/.devops/full-rocm.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/full-rocm.Dockerfile
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-WORKDIR /app
-COPY . .
-# Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV GGML_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-RUN make -j$(nproc)
-ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/ollama/llm/llama.cpp/.devops/full.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/full.Dockerfile
-ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION AS build
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-WORKDIR /app
-COPY . .
-ENV LLAMA_CURL=1
-RUN make -j$(nproc)
-ENV LC_ALL=C.utf8
-ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/ollama/llm/llama.cpp/.devops/llama-cli-cuda.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-cli-cuda.Dockerfile
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-RUN apt-get update && \
-    apt-get install -y build-essential git
-WORKDIR /app
-COPY . .
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-RUN make -j$(nproc) llama-cli
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-RUN apt-get update && \
-    apt-get install -y libgomp1
-COPY --from=build /app/llama-cli /llama-cli
-ENTRYPOINT [ "/llama-cli" ]
--- a/ollama/llm/llama.cpp/.devops/llama-cli-intel.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-cli-intel.Dockerfile
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-WORKDIR /app
-COPY . .
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
-    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-    cmake --build build --config Release --target llama-cli
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
-ENV LC_ALL=C.utf8
-ENTRYPOINT [ "/llama-cli" ]
--- a/ollama/llm/llama.cpp/.devops/llama-cli-rocm.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-cli-rocm.Dockerfile
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-# Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-WORKDIR /app
-COPY . .
-# Set the GPU target architectures for the ROCm build
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV GGML_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make -j$(nproc) llama-cli
-ENTRYPOINT [ "/app/llama-cli" ]
--- a/ollama/llm/llama.cpp/.devops/llama-cli-vulkan.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-cli-vulkan.Dockerfile
-ARG UBUNTU_VERSION=jammy
-FROM ubuntu:$UBUNTU_VERSION AS build
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget libgomp1
-# Install Vulkan SDK
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk
-# Build it
-WORKDIR /app
-COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
-    cmake --build build --config Release --target llama-cli
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
-    rm -rf /app
-ENV LC_ALL=C.utf8
-ENTRYPOINT [ "/llama-cli" ]
--- a/ollama/llm/llama.cpp/.devops/llama-cli.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-cli.Dockerfile
-ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION AS build
-RUN apt-get update && \
-    apt-get install -y build-essential git
-WORKDIR /app
-COPY . .
-RUN make -j$(nproc) llama-cli
-FROM ubuntu:$UBUNTU_VERSION AS runtime
-RUN apt-get update && \
-    apt-get install -y libgomp1
-COPY --from=build /app/llama-cli /llama-cli
-ENV LC_ALL=C.utf8
-ENTRYPOINT [ "/llama-cli" ]
--- a/ollama/llm/llama.cpp/.devops/llama-cpp-cuda.srpm.spec
+++ b/ollama/llm/llama.cpp/.devops/llama-cpp-cuda.srpm.spec
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-Name:           llama.cpp-cuda
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
-Requires:       cuda-toolkit
-URL:            https://github.com/ggerganov/llama.cpp
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-%description
-CPU inference for Meta's Llama2 models using default options.
-%prep
-%setup -n llama.cpp-master
-%build
-make -j GGML_CUDA=1
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-[Install]
-WantedBy=default.target
-EOF
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-%files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
-%config /etc/sysconfig/llama
-%pre
-%post
-%preun
-%postun
-%changelog
--- a/ollama/llm/llama.cpp/.devops/llama-cpp.srpm.spec
+++ b/ollama/llm/llama.cpp/.devops/llama-cpp.srpm.spec
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-#    In the meantime, YYYYMMDD format will be used.
-# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-Name:           llama.cpp
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
-Requires:       libstdc++
-URL:            https://github.com/ggerganov/llama.cpp
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-%description
-CPU inference for Meta's Llama2 models using default options.
-Models are not included in this package and must be downloaded separately.
-%prep
-%setup -n llama.cpp-master
-%build
-make -j
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-[Install]
-WantedBy=default.target
-EOF
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-%files
-%{_bindir}/llama-cli
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
-/usr/lib/systemd/system/llama.service
-%config /etc/sysconfig/llama
-%pre
-%post
-%preun
-%postun
-%changelog
--- a/ollama/llm/llama.cpp/.devops/llama-server-cuda.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-server-cuda.Dockerfile
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
-WORKDIR /app
-COPY . .
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN make -j$(nproc) llama-server
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-COPY --from=build /app/llama-server /llama-server
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
--- a/ollama/llm/llama.cpp/.devops/llama-server-intel.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-server-intel.Dockerfile
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-ARG GGML_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-WORKDIR /app
-COPY . .
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target llama-server
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev curl
-COPY --from=build /app/build/bin/llama-server /llama-server
-ENV LC_ALL=C.utf8
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
--- a/ollama/llm/llama.cpp/.devops/llama-server-rocm.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-server-rocm.Dockerfile
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-# Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-WORKDIR /app
-COPY . .
-# Set the GPU target architectures for the ROCm build
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV GGML_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-# Enable cURL
-ENV LLAMA_CURL=1
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev curl
-RUN make -j$(nproc) llama-server
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/app/llama-server" ]
--- a/ollama/llm/llama.cpp/.devops/llama-server-vulkan.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-server-vulkan.Dockerfile
-ARG UBUNTU_VERSION=jammy
-FROM ubuntu:$UBUNTU_VERSION AS build
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
-# Build it
-WORKDIR /app
-COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target llama-server
-# Clean up
-WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
-    rm -rf /app
-ENV LC_ALL=C.utf8
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]
--- a/ollama/llm/llama.cpp/.devops/llama-server.Dockerfile
+++ b/ollama/llm/llama.cpp/.devops/llama-server.Dockerfile
-ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION AS build
-RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
-WORKDIR /app
-COPY . .
-ENV LLAMA_CURL=1
-RUN make -j$(nproc) llama-server
-FROM ubuntu:$UBUNTU_VERSION AS runtime
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1 curl
-COPY --from=build /app/llama-server /llama-server
-ENV LC_ALL=C.utf8
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-ENTRYPOINT [ "/llama-server" ]