llama.go 5.15 KB
Newer Older
1
2
3
4
5
6
7
package convert

import (
	"encoding/binary"
	"fmt"
	"io"
	"log/slog"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/nlpodyssey/gopickle/pytorch"
	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"
	"github.com/x448/float16"

	"github.com/ollama/ollama/llm"
)

// LlamaModel wraps ModelData with llama-architecture-specific conversion
// logic: attention-tensor repacking, vocabulary loading, and GGUF output.
type LlamaModel struct {
	ModelData
}

Patrick Devine's avatar
Patrick Devine committed
25
// llamaTorchLayerHandler repacks a single attention weight tensor (attn_q
// or attn_k) from a PyTorch checkpoint into the element ordering GGUF
// expects, converting float32 storage to float16 bit patterns on the way,
// and writes the result to w using the tensor's byte order.
func llamaTorchLayerHandler(w io.Writer, r torchWriterTo) error {
	slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name))

	// Guard the storage type assertion: an unexpected storage type would
	// otherwise panic instead of surfacing a usable error to the caller.
	hs, ok := r.storage.(*pytorch.HalfStorage)
	if !ok {
		return fmt.Errorf("unexpected storage type %T for layer '%s'", r.storage, r.t.Name)
	}

	// Convert each float32 value to its IEEE 754 half-precision bit pattern.
	data := hs.Data
	tData := make([]uint16, len(data))
	for i, v := range data {
		tData[i] = uint16(float16.Fromfloat32(v))
	}

	// The head count drives the repack reshape: query tensors use the full
	// attention head count; key tensors use the key/value head count when
	// set (grouped-query attention), falling back to the attention head
	// count when it is zero.
	var heads uint32
	switch {
	case strings.Contains(r.t.Name, "attn_q"):
		heads = uint32(r.params.AttentionHeads)
	case strings.Contains(r.t.Name, "attn_k"):
		heads = uint32(r.params.KeyValHeads)
		if heads == 0 {
			heads = uint32(r.params.AttentionHeads)
		}
	default:
		return fmt.Errorf("unknown layer type")
	}

	slog.Debug(fmt.Sprintf("heads = %d", heads))

	tData, err := llamaRepack(tData, int(heads), r.t.Shape)
	if err != nil {
		return err
	}

	return binary.Write(w, r.bo, tData)
}

// llamaRepack reorders a packed (shape[0] x shape[1]) attention weight
// matrix for the given head count into the layout GGUF expects. data holds
// float16 bit patterns; the returned slice has the same length as data.
func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
	n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
	origShape := n.Shape().Clone()

	// Split the row dimension into (heads, 2, rows/heads/2) and swap axes
	// 1 and 2 to unpack the interleaved layout, then restore the original
	// 2-D shape and transpose.
	if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
		return nil, err
	}

	if err := n.T(0, 2, 1, 3); err != nil {
		return nil, err
	}

	if err := n.Reshape(origShape...); err != nil {
		return nil, err
	}

	if err := n.Transpose(); err != nil {
		return nil, err
	}

	rows, err := native.SelectU16(n, 1)
	if err != nil {
		return nil, err
	}

	// Flatten the row views back into one contiguous slice; pre-size to
	// the known final length to avoid repeated growth copies.
	out := make([]uint16, 0, len(data))
	for _, row := range rows {
		out = append(out, row...)
	}
	return out, nil
}

func (m *LlamaModel) GetTensors() error {
	t, err := m.Format.GetTensors(m.Path, m.Params)
	if err != nil {
		return err
	}

	m.Tensors = []llm.Tensor{}

	pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
	re, err := regexp.Compile(pattern)
	if err != nil {
		return err
	}

	for _, l := range t {
		matches := re.FindAllStringSubmatch(l.Name, -1)
		if len(matches) > 0 {
			slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
Patrick Devine's avatar
Patrick Devine committed
110
111
			switch m.Format.(type) {
			case *TorchFormat:
Patrick Devine's avatar
Patrick Devine committed
112
113
114
				wt := l.WriterTo.(torchWriterTo)
				wt.handler = llamaTorchLayerHandler
				l.WriterTo = wt
Patrick Devine's avatar
Patrick Devine committed
115
			case *SafetensorFormat:
Patrick Devine's avatar
Patrick Devine committed
116
117
118
119
				wt := l.WriterTo.(safetensorWriterTo)
				wt.handler = mistralLayerHandler
				l.WriterTo = wt
			}
120
121
122
123
124
125
126
127
		}
		m.Tensors = append(m.Tensors, l)
	}

	return nil
}

func (m *LlamaModel) LoadVocab() error {
Patrick Devine's avatar
Patrick Devine committed
128
129
130
131
	v := &Vocab{
		Tokens: []string{},
		Types:  []int32{},
		Merges: []string{},
132
133
	}

Patrick Devine's avatar
Patrick Devine committed
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
	tokpath := filepath.Join(m.Path, "tokenizer.json")
	slog.Debug(fmt.Sprintf("looking for %s", tokpath))
	if _, err := os.Stat(tokpath); !os.IsNotExist(err) {
		t, err := newTokenizer(tokpath)
		if err != nil {
			return err
		}

		for _, tok := range t.Model.Tokens {
			v.Tokens = append(v.Tokens, tok.Content)
			var tokType int32
			switch {
			case tok.Special:
				tokType = 3
			case tok.UserDefined:
				tokType = 4
			default:
				tokType = 1
			}
			v.Types = append(v.Types, tokType)
		}
		v.Merges = t.Model.Merges
	} else {
		slog.Debug("loading sentence piece vocab")
		v, err = LoadSentencePieceTokens(m.Path, m.Params)
		if err != nil {
			return err
		}

		slog.Debug("vocab loaded")
164

Patrick Devine's avatar
Patrick Devine committed
165
	}
166
	m.Vocab = v
Patrick Devine's avatar
Patrick Devine committed
167

168
169
170
	return nil
}

Michael Yang's avatar
Michael Yang committed
171
func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
172
173
174
175
176
177
178
179
	kv := llm.KV{
		"general.architecture":                   "llama",
		"general.name":                           m.Name,
		"llama.vocab_size":                       uint32(len(m.Vocab.Tokens)),
		"llama.context_length":                   uint32(m.Params.ContextSize),
		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
		"llama.block_count":                      uint32(m.Params.HiddenLayers),
		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
Patrick Devine's avatar
Patrick Devine committed
180
		"llama.rope.freq_base":                   float32(m.Params.RopeFrequencyBase),
181
182
183
184
		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
Patrick Devine's avatar
Patrick Devine committed
185
186
187
188
		//"general.file_type":                      uint32(1),
		"general.file_type": uint32(2),
		//"tokenizer.ggml.model":                   "llama",
		"tokenizer.ggml.model": "gpt2",
189
190
191
192
193
194
195

		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
		"tokenizer.ggml.token_type": m.Vocab.Types,

		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
		"tokenizer.ggml.unknown_token_id": uint32(0),
Patrick Devine's avatar
Patrick Devine committed
196
197
198
199
200
201
202
203
		//"tokenizer.ggml.add_bos_token":    true,
		//"tokenizer.ggml.add_eos_token":    false,
	}

	if len(m.Vocab.Merges) > 0 {
		kv["tokenizer.ggml.merges"] = m.Vocab.Merges
	} else {
		kv["tokenizer.ggml.scores"] = m.Vocab.Scores
204
205
	}

Michael Yang's avatar
Michael Yang committed
206
	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
207
}