gguf_writer.py 34.1 KB
Newer Older
mashun1's avatar
v1  
mashun1 committed
1
2
3
4
5
6
7
from __future__ import annotations

import logging
import os
import shutil
import struct
import tempfile
xuxzh1's avatar
init  
xuxzh1 committed
8
from dataclasses import dataclass
mashun1's avatar
v1  
mashun1 committed
9
from enum import Enum, auto
xuxzh1's avatar
init  
xuxzh1 committed
10
11
from math import prod
from pathlib import Path
mashun1's avatar
v1  
mashun1 committed
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits

import numpy as np

from .constants import (
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
    GGMLQuantizationType,
    GGUFEndian,
    GGUFValueType,
    Keys,
    RopeScalingType,
    PoolingType,
    TokenType,
)

from .quants import quant_shape_from_byte_shape

logger = logging.getLogger(__name__)


xuxzh1's avatar
init  
xuxzh1 committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"


@dataclass
class TensorInfo:
    shape: Sequence[int]
    dtype: GGMLQuantizationType
    nbytes: int
    tensor: np.ndarray[Any, Any] | None = None


@dataclass
class GGUFValue:
    value: Any
    type: GGUFValueType


mashun1's avatar
v1  
mashun1 committed
53
class WriterState(Enum):
xuxzh1's avatar
init  
xuxzh1 committed
54
    NO_FILE = auto()
mashun1's avatar
v1  
mashun1 committed
55
56
57
58
    EMPTY   = auto()
    HEADER  = auto()
    KV_DATA = auto()
    TI_DATA = auto()
xuxzh1's avatar
init  
xuxzh1 committed
59
    WEIGHTS = auto()
mashun1's avatar
v1  
mashun1 committed
60
61
62


class GGUFWriter:
xuxzh1's avatar
init  
xuxzh1 committed
63
64
    fout: list[BufferedWriter] | None
    path: Path | None
mashun1's avatar
v1  
mashun1 committed
65
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
xuxzh1's avatar
init  
xuxzh1 committed
66
67
68
    tensors: list[dict[str, TensorInfo]]
    kv_data: list[dict[str, GGUFValue]]
    state: WriterState
mashun1's avatar
v1  
mashun1 committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
    _simple_value_packing = {
        GGUFValueType.UINT8:   "B",
        GGUFValueType.INT8:    "b",
        GGUFValueType.UINT16:  "H",
        GGUFValueType.INT16:   "h",
        GGUFValueType.UINT32:  "I",
        GGUFValueType.INT32:   "i",
        GGUFValueType.FLOAT32: "f",
        GGUFValueType.UINT64:  "Q",
        GGUFValueType.INT64:   "q",
        GGUFValueType.FLOAT64: "d",
        GGUFValueType.BOOL:    "?",
    }

    def __init__(
xuxzh1's avatar
init  
xuxzh1 committed
84
85
        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
mashun1's avatar
v1  
mashun1 committed
86
    ):
xuxzh1's avatar
init  
xuxzh1 committed
87
88
        self.fout = None
        self.path = Path(path) if path else None
mashun1's avatar
v1  
mashun1 committed
89
90
91
92
93
        self.arch = arch
        self.endianess = endianess
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.use_temp_file = use_temp_file
        self.temp_file = None
xuxzh1's avatar
init  
xuxzh1 committed
94
95
96
97
98
99
        self.tensors = [{}]
        self.kv_data = [{}]
        self.split_max_tensors = split_max_tensors
        self.split_max_size = split_max_size
        self.dry_run = dry_run
        self.small_first_shard = small_first_shard
mashun1's avatar
v1  
mashun1 committed
100
101
102
        logger.info("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
xuxzh1's avatar
init  
xuxzh1 committed
103
104
105
106
        self.state = WriterState.NO_FILE

        if self.small_first_shard:
            self.tensors.append({})
mashun1's avatar
v1  
mashun1 committed
107
108
109

        self.add_architecture()

xuxzh1's avatar
init  
xuxzh1 committed
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
    def get_total_parameter_count(self) -> tuple[int, int, int, int]:
        total_params = 0
        shared_params = 0
        expert_params = 0

        expert_sum = 0
        n_expert_tensors = 0

        last_lora_a: tuple[str, TensorInfo] | None = None

        for tensors in self.tensors:
            for name, info in tensors.items():

                shape = info.shape

                if name.endswith(".lora_a"):
                    last_lora_a = (name, info)
                    continue
                elif name.endswith(".lora_b"):
                    if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
                        # Bail when the LoRA pair can't be found trivially
                        logger.warning("can't measure LoRA size correctly, tensor order is unusual")
                        return 0, 0, 0, 0
                    else:
                        shape = (*shape[:-1], last_lora_a[1].shape[-1])

                size = prod(shape)

                if "_exps." in name:
                    expert_params += (size // shape[-3])
                    expert_sum += shape[-3]
                    n_expert_tensors += 1
                else:
                    shared_params += size

                total_params += size

        # Hopefully this should work even for variable-expert-count models
        expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0

        # Negate the total to signal it's likely not exact
        if last_lora_a is not None:
            total_params = -total_params

        # NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
        return total_params, shared_params, expert_params, expert_count

    def format_shard_names(self, path: Path) -> list[Path]:
        if len(self.tensors) == 1:
            return [path]
        return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]

    def open_output_file(self, path: Path | None = None) -> None:
        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
            # allow calling this multiple times as long as the path is the same
            return

        if self.state is not WriterState.NO_FILE:
            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

        if path is not None:
            self.path = path

        if self.path is not None:
            filenames = self.print_plan()
            self.fout = [open(filename, "wb") for filename in filenames]
            self.state = WriterState.EMPTY

    def print_plan(self) -> list[Path]:
        logger.info("Writing the following files:")
        assert self.path is not None
        filenames = self.format_shard_names(self.path)
        assert len(filenames) == len(self.tensors)
        for name, tensors in zip(filenames, self.tensors):
            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")

        if self.dry_run:
            logger.info("Dry run, not writing files")
            for name in filenames:
                print(name)  # noqa: NP100
            exit()

        return filenames

    def add_shard_kv_data(self) -> None:
        if len(self.tensors) == 1:
            return

        total_tensors = sum(len(t) for t in self.tensors)
        assert self.fout is not None
        total_splits = len(self.fout)
        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
        for i, kv_data in enumerate(self.kv_data):
            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)

    def write_header_to_file(self, path: Path | None = None) -> None:
        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
            logger.warning("Model fails split requirements, not splitting")

        self.open_output_file(path)

mashun1's avatar
v1  
mashun1 committed
213
214
215
        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

xuxzh1's avatar
init  
xuxzh1 committed
216
217
218
219
220
221
222
223
224
225
226
227
        assert self.fout is not None
        assert len(self.fout) == len(self.tensors)
        assert len(self.kv_data) == 1

        self.add_shard_kv_data()

        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
            fout.write(self._pack("I", GGUF_VERSION))
            fout.write(self._pack("Q", len(tensors)))
            fout.write(self._pack("Q", len(kv_data)))
            fout.flush()
mashun1's avatar
v1  
mashun1 committed
228
229
230
231
232
        self.state = WriterState.HEADER

    def write_kv_data_to_file(self) -> None:
        if self.state is not WriterState.HEADER:
            raise ValueError(f'Expected output file to contain the header, got {self.state}')
xuxzh1's avatar
init  
xuxzh1 committed
233
234
235
236
237
238
239
240
241
242
        assert self.fout is not None

        for fout, kv_data in zip(self.fout, self.kv_data):
            kv_bytes = bytearray()

            for key, val in kv_data.items():
                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)

            fout.write(kv_bytes)
mashun1's avatar
v1  
mashun1 committed
243
244
245
246
247
248
249

        self.flush()
        self.state = WriterState.KV_DATA

    def write_ti_data_to_file(self) -> None:
        if self.state is not WriterState.KV_DATA:
            raise ValueError(f'Expected output file to contain KV data, got {self.state}')
xuxzh1's avatar
init  
xuxzh1 committed
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
        assert self.fout is not None

        for fout, tensors in zip(self.fout, self.tensors):
            ti_data = bytearray()
            offset_tensor = 0

            for name, ti in tensors.items():
                ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
                n_dims = len(ti.shape)
                ti_data += self._pack("I", n_dims)
                for j in range(n_dims):
                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
                ti_data += self._pack("I", ti.dtype)
                ti_data += self._pack("Q", offset_tensor)
                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)

            fout.write(ti_data)
            fout.flush()
mashun1's avatar
v1  
mashun1 committed
268
269
        self.state = WriterState.TI_DATA

xuxzh1's avatar
init  
xuxzh1 committed
270
271
272
273
274
    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
        if any(key in kv_data for kv_data in self.kv_data):
            raise ValueError(f'Duplicated key name {key!r}')

        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
mashun1's avatar
v1  
mashun1 committed
275
276

    def add_uint8(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
277
        self.add_key_value(key,val, GGUFValueType.UINT8)
mashun1's avatar
v1  
mashun1 committed
278
279

    def add_int8(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
280
        self.add_key_value(key, val, GGUFValueType.INT8)
mashun1's avatar
v1  
mashun1 committed
281
282

    def add_uint16(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
283
        self.add_key_value(key, val, GGUFValueType.UINT16)
mashun1's avatar
v1  
mashun1 committed
284
285

    def add_int16(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
286
        self.add_key_value(key, val, GGUFValueType.INT16)
mashun1's avatar
v1  
mashun1 committed
287
288

    def add_uint32(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
289
        self.add_key_value(key, val, GGUFValueType.UINT32)
mashun1's avatar
v1  
mashun1 committed
290
291

    def add_int32(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
292
        self.add_key_value(key, val, GGUFValueType.INT32)
mashun1's avatar
v1  
mashun1 committed
293
294

    def add_float32(self, key: str, val: float) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
295
        self.add_key_value(key, val, GGUFValueType.FLOAT32)
mashun1's avatar
v1  
mashun1 committed
296
297

    def add_uint64(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
298
        self.add_key_value(key, val, GGUFValueType.UINT64)
mashun1's avatar
v1  
mashun1 committed
299
300

    def add_int64(self, key: str, val: int) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
301
        self.add_key_value(key, val, GGUFValueType.INT64)
mashun1's avatar
v1  
mashun1 committed
302
303

    def add_float64(self, key: str, val: float) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
304
        self.add_key_value(key, val, GGUFValueType.FLOAT64)
mashun1's avatar
v1  
mashun1 committed
305
306

    def add_bool(self, key: str, val: bool) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
307
        self.add_key_value(key, val, GGUFValueType.BOOL)
mashun1's avatar
v1  
mashun1 committed
308
309
310
311

    def add_string(self, key: str, val: str) -> None:
        if not val:
            return
xuxzh1's avatar
init  
xuxzh1 committed
312
        self.add_key_value(key, val, GGUFValueType.STRING)
mashun1's avatar
v1  
mashun1 committed
313
314

    def add_array(self, key: str, val: Sequence[Any]) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
315
316
317
        if len(val) == 0:
            return
        self.add_key_value(key, val, GGUFValueType.ARRAY)
mashun1's avatar
v1  
mashun1 committed
318
319
320
321
322
323
324
325
326

    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

    def add_tensor_info(
        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
        tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
327
328
        if self.state is not WriterState.NO_FILE:
            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
mashun1's avatar
v1  
mashun1 committed
329

xuxzh1's avatar
init  
xuxzh1 committed
330
331
        if any(name in tensors for tensors in self.tensors):
            raise ValueError(f'Duplicated tensor name {name!r}')
mashun1's avatar
v1  
mashun1 committed
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353

        if raw_dtype is None:
            if tensor_dtype == np.float16:
                dtype = GGMLQuantizationType.F16
            elif tensor_dtype == np.float32:
                dtype = GGMLQuantizationType.F32
            elif tensor_dtype == np.float64:
                dtype = GGMLQuantizationType.F64
            elif tensor_dtype == np.int8:
                dtype = GGMLQuantizationType.I8
            elif tensor_dtype == np.int16:
                dtype = GGMLQuantizationType.I16
            elif tensor_dtype == np.int32:
                dtype = GGMLQuantizationType.I32
            elif tensor_dtype == np.int64:
                dtype = GGMLQuantizationType.I64
            else:
                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
        else:
            dtype = raw_dtype
            if tensor_dtype == np.uint8:
                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
xuxzh1's avatar
init  
xuxzh1 committed
354
355
356
357
358
359
360
361
362
363
364
365
366

        # make sure there is at least one tensor before splitting
        if len(self.tensors[-1]) > 0:
            if (  # split when over tensor limit
                self.split_max_tensors != 0
                and len(self.tensors[-1]) >= self.split_max_tensors
            ) or (   # split when over size limit
                self.split_max_size != 0
                and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
            ):
                self.tensors.append({})

        self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
mashun1's avatar
v1  
mashun1 committed
367
368
369
370
371
372
373
374
375
376
377
378
379

    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
        raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
            fp.seek(0)
            self.temp_file = fp

        shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
xuxzh1's avatar
init  
xuxzh1 committed
380
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
mashun1's avatar
v1  
mashun1 committed
381
382

        if self.temp_file is None:
xuxzh1's avatar
init  
xuxzh1 committed
383
            self.tensors[-1][name].tensor = tensor
mashun1's avatar
v1  
mashun1 committed
384
385
386
387
388
389
390
391
392
393
394
            return

        tensor.tofile(self.temp_file)
        self.write_padding(self.temp_file, tensor.nbytes)

    def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
        pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
        if pad != 0:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
395
396
397
        if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
            raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}')
        assert self.fout is not None
mashun1's avatar
v1  
mashun1 committed
398
399
400

        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
xuxzh1's avatar
init  
xuxzh1 committed
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420

        file_id = -1
        for i, tensors in enumerate(self.tensors):
            if len(tensors) > 0:
                file_id = i
                break

        fout = self.fout[file_id]

        # pop the first tensor info
        # TODO: cleaner way to get the first key
        first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
        ti = self.tensors[file_id].pop(first_tensor_name)
        assert ti.nbytes == tensor.nbytes

        self.write_padding(fout, fout.tell())
        tensor.tofile(fout)
        self.write_padding(fout, tensor.nbytes)

        self.state = WriterState.WEIGHTS
mashun1's avatar
v1  
mashun1 committed
421
422
423
424

    def write_tensors_to_file(self, *, progress: bool = False) -> None:
        self.write_ti_data_to_file()

xuxzh1's avatar
init  
xuxzh1 committed
425
426
427
428
        assert self.fout is not None

        for fout in self.fout:
            self.write_padding(fout, fout.tell())
mashun1's avatar
v1  
mashun1 committed
429
430

        if self.temp_file is None:
xuxzh1's avatar
init  
xuxzh1 committed
431
432
            shard_bar = None
            bar = None
mashun1's avatar
v1  
mashun1 committed
433
434
435
436

            if progress:
                from tqdm import tqdm

xuxzh1's avatar
init  
xuxzh1 committed
437
                total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())
mashun1's avatar
v1  
mashun1 committed
438

xuxzh1's avatar
init  
xuxzh1 committed
439
440
                if len(self.fout) > 1:
                    shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
mashun1's avatar
v1  
mashun1 committed
441
442
                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)

xuxzh1's avatar
init  
xuxzh1 committed
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
            for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
                if shard_bar is not None:
                    shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
                    total = sum(ti.nbytes for ti in tensors.values())
                    shard_bar.reset(total=(total if total > 0 else None))

                # relying on the fact that Python dicts preserve insertion order (since 3.7)
                for ti in tensors.values():
                    assert ti.tensor is not None  # can only iterate once over the tensors
                    assert ti.tensor.nbytes == ti.nbytes
                    ti.tensor.tofile(fout)
                    if shard_bar is not None:
                        shard_bar.update(ti.nbytes)
                    if bar is not None:
                        bar.update(ti.nbytes)
                    self.write_padding(fout, ti.nbytes)
                    ti.tensor = None
        else:
            self.temp_file.seek(0)
mashun1's avatar
v1  
mashun1 committed
462

xuxzh1's avatar
init  
xuxzh1 committed
463
464
465
            shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
            self.flush()
            self.temp_file.close()
mashun1's avatar
v1  
mashun1 committed
466

xuxzh1's avatar
init  
xuxzh1 committed
467
        self.state = WriterState.WEIGHTS
mashun1's avatar
v1  
mashun1 committed
468
469

    def flush(self) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
470
471
472
        assert self.fout is not None
        for fout in self.fout:
            fout.flush()
mashun1's avatar
v1  
mashun1 committed
473
474

    def close(self) -> None:
xuxzh1's avatar
init  
xuxzh1 committed
475
476
477
478
479
480
481
        if self.fout is not None:
            for fout in self.fout:
                fout.close()
            self.fout = None

    def add_type(self, type_name: str) -> None:
        self.add_string(Keys.General.TYPE, type_name)
mashun1's avatar
v1  
mashun1 committed
482
483
484
485

    def add_architecture(self) -> None:
        self.add_string(Keys.General.ARCHITECTURE, self.arch)

xuxzh1's avatar
init  
xuxzh1 committed
486
487
488
489
490
491
492
493
494
495
496
497
498
    def add_quantization_version(self, quantization_version: int) -> None:
        self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int) -> None:
        self.data_alignment = alignment
        self.add_uint32(Keys.General.ALIGNMENT, alignment)

    def add_file_type(self, ftype: int) -> None:
        self.add_uint32(Keys.General.FILE_TYPE, ftype)

    def add_name(self, name: str) -> None:
        self.add_string(Keys.General.NAME, name)

mashun1's avatar
v1  
mashun1 committed
499
500
501
502
503
504
    def add_author(self, author: str) -> None:
        self.add_string(Keys.General.AUTHOR, author)

    def add_version(self, version: str) -> None:
        self.add_string(Keys.General.VERSION, version)

xuxzh1's avatar
init  
xuxzh1 committed
505
506
    def add_organization(self, organization: str) -> None:
        self.add_string(Keys.General.ORGANIZATION, organization)
mashun1's avatar
v1  
mashun1 committed
507

xuxzh1's avatar
init  
xuxzh1 committed
508
509
510
511
512
    def add_finetune(self, finetune: str) -> None:
        self.add_string(Keys.General.FINETUNE, finetune)

    def add_basename(self, basename: str) -> None:
        self.add_string(Keys.General.BASENAME, basename)
mashun1's avatar
v1  
mashun1 committed
513
514
515
516

    def add_description(self, description: str) -> None:
        self.add_string(Keys.General.DESCRIPTION, description)

xuxzh1's avatar
init  
xuxzh1 committed
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
    def add_quantized_by(self, quantized: str) -> None:
        self.add_string(Keys.General.QUANTIZED_BY, quantized)

    def add_size_label(self, size_label: str) -> None:
        self.add_string(Keys.General.SIZE_LABEL, size_label)

    def add_license(self, license: str) -> None:
        self.add_string(Keys.General.LICENSE, license)

    def add_license_name(self, license: str) -> None:
        self.add_string(Keys.General.LICENSE_NAME, license)

    def add_license_link(self, license: str) -> None:
        self.add_string(Keys.General.LICENSE_LINK, license)

    def add_url(self, url: str) -> None:
        self.add_string(Keys.General.URL, url)

    def add_doi(self, doi: str) -> None:
        self.add_string(Keys.General.DOI, doi)

    def add_uuid(self, uuid: str) -> None:
        self.add_string(Keys.General.UUID, uuid)

    def add_repo_url(self, repo_url: str) -> None:
        self.add_string(Keys.General.REPO_URL, repo_url)
mashun1's avatar
v1  
mashun1 committed
543
544
545
546

    def add_source_url(self, url: str) -> None:
        self.add_string(Keys.General.SOURCE_URL, url)

xuxzh1's avatar
init  
xuxzh1 committed
547
548
    def add_source_doi(self, doi: str) -> None:
        self.add_string(Keys.General.SOURCE_DOI, doi)
mashun1's avatar
v1  
mashun1 committed
549

xuxzh1's avatar
init  
xuxzh1 committed
550
551
    def add_source_uuid(self, uuid: str) -> None:
        self.add_string(Keys.General.SOURCE_UUID, uuid)
mashun1's avatar
v1  
mashun1 committed
552

xuxzh1's avatar
init  
xuxzh1 committed
553
554
    def add_source_repo_url(self, repo_url: str) -> None:
        self.add_string(Keys.General.SOURCE_REPO_URL, repo_url)
mashun1's avatar
v1  
mashun1 committed
555

xuxzh1's avatar
init  
xuxzh1 committed
556
557
    def add_base_model_count(self, source_count: int) -> None:
        self.add_uint32(Keys.General.BASE_MODEL_COUNT, source_count)
mashun1's avatar
v1  
mashun1 committed
558

xuxzh1's avatar
init  
xuxzh1 committed
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
    def add_base_model_name(self, source_id: int, name: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_NAME.format(id=source_id), name)

    def add_base_model_author(self, source_id: int, author: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_AUTHOR.format(id=source_id), author)

    def add_base_model_version(self, source_id: int, version: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version)

    def add_base_model_organization(self, source_id: int, organization: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)

    def add_base_model_url(self, source_id: int, url: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)

    def add_base_model_doi(self, source_id: int, doi: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_DOI.format(id=source_id), doi)

    def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_UUID.format(id=source_id), uuid)

    def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)

    def add_tags(self, tags: Sequence[str]) -> None:
        self.add_array(Keys.General.TAGS, tags)

    def add_languages(self, languages: Sequence[str]) -> None:
        self.add_array(Keys.General.LANGUAGES, languages)

    def add_datasets(self, datasets: Sequence[str]) -> None:
        self.add_array(Keys.General.DATASETS, datasets)

    def add_tensor_data_layout(self, layout: str) -> None:
        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
mashun1's avatar
v1  
mashun1 committed
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609

    def add_vocab_size(self, size: int) -> None:
        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)

    def add_context_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

    def add_leading_dense_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)

xuxzh1's avatar
init  
xuxzh1 committed
610
611
612
613
614
    def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
        if isinstance(length, int):
            self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
        else:
            self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
mashun1's avatar
v1  
mashun1 committed
615
616
617
618

    def add_expert_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

xuxzh1's avatar
init  
xuxzh1 committed
619
620
621
    def add_expert_shared_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

mashun1's avatar
v1  
mashun1 committed
622
623
624
    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

xuxzh1's avatar
init  
xuxzh1 committed
625
626
    def add_decoder_start_token_id(self, id: int) -> None:
        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
mashun1's avatar
v1  
mashun1 committed
627

xuxzh1's avatar
init  
xuxzh1 committed
628
629
630
631
632
633
634
635
636
637
638
    def add_head_count(self, count: int | Sequence[int]) -> None:
        if isinstance(count, int):
            self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
        else:
            self.add_array(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int | Sequence[int]) -> None:
        if isinstance(count, int):
            self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
        else:
            self.add_array(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
mashun1's avatar
v1  
mashun1 committed
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654

    def add_key_length(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)

    def add_value_length(self, length: int) -> None:
        self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)

    def add_max_alibi_bias(self, bias: float) -> None:
        self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float) -> None:
        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

    def add_logit_scale(self, value: float) -> None:
        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)

xuxzh1's avatar
init  
xuxzh1 committed
655
656
657
658
659
660
    def add_attn_logit_softcapping(self, value: float) -> None:
        self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)

    def add_final_logit_softcapping(self, value: float) -> None:
        self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)

mashun1's avatar
v1  
mashun1 committed
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
    def add_expert_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)

    def add_expert_used_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)

    def add_expert_shared_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)

    def add_expert_weights_scale(self, value: float) -> None:
        self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_causal_attention(self, value: bool) -> None:
        self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)

    def add_q_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)

    def add_kv_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

xuxzh1's avatar
init  
xuxzh1 committed
688
689
690
691
692
693
    def add_relative_attn_buckets_count(self, value: int) -> None:
        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)

    def add_sliding_window(self, value: int) -> None:
        self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)

mashun1's avatar
v1  
mashun1 committed
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
    def add_pooling_type(self, value: PoolingType) -> None:
        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

    def add_rope_dimension_count(self, count: int) -> None:
        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_freq_base(self, value: float) -> None:
        self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)

    def add_rope_scaling_type(self, value: RopeScalingType) -> None:
        self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value)

    def add_rope_scaling_factor(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)

xuxzh1's avatar
init  
xuxzh1 committed
709
    def add_rope_scaling_attn_factors(self, value: float) -> None:
mashun1's avatar
v1  
mashun1 committed
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)

    def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
        self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)

    def add_rope_scaling_finetuned(self, value: bool) -> None:
        self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)

    def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)

    def add_ssm_conv_kernel(self, value: int) -> None:
        self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)

    def add_ssm_inner_size(self, value: int) -> None:
        self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value)

    def add_ssm_state_size(self, value: int) -> None:
        self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value)

    def add_ssm_time_step_rank(self, value: int) -> None:
        self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str) -> None:
        self.add_string(Keys.Tokenizer.MODEL, model)

    def add_tokenizer_pre(self, pre: str) -> None:
        self.add_string(Keys.Tokenizer.PRE, pre)

    def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
        self.add_array(Keys.Tokenizer.LIST, tokens)

    def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
        self.add_array(Keys.Tokenizer.MERGES, merges)

    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
        self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)

    def add_token_type_count(self, value: int) -> None:
        self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)

    def add_token_scores(self, scores: Sequence[float]) -> None:
        self.add_array(Keys.Tokenizer.SCORES, scores)

    def add_bos_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.BOS_ID, id)

    def add_eos_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOS_ID, id)

    def add_unk_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.UNK_ID, id)

    def add_sep_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.SEP_ID, id)

    def add_pad_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.PAD_ID, id)

    def add_cls_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.CLS_ID, id)

    def add_mask_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.MASK_ID, id)

    def add_add_bos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_BOS, value)

    def add_add_eos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_EOS, value)

    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

xuxzh1's avatar
init  
xuxzh1 committed
784
785
786
787
788
789
    def add_remove_extra_whitespaces(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)

    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)

mashun1's avatar
v1  
mashun1 committed
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
        if not isinstance(value, str):
            template_default = None
            template_names = set()

            for choice in value:
                name = choice.get('name', '')
                template = choice.get('template')

                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
                name = ''.join((c if c in ascii_letters + digits else '_' for c in name))

                if name and template is not None:
                    if name == 'default':
                        template_default = template
                    else:
                        template_names.add(name)
                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)

            if template_names:
                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))

            if template_default is None:
                return

            value = template_default

        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

    def add_prefix_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)

    def add_suffix_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)

    def add_middle_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)

    def add_eot_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOT_ID, id)

xuxzh1's avatar
init  
xuxzh1 committed
831
832
833
    def add_eom_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOM_ID, id)

mashun1's avatar
v1  
mashun1 committed
834
835
836
837
838
839
    def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
        pack_prefix = ''
        if not skip_pack_prefix:
            pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
        return struct.pack(f'{pack_prefix}{fmt}', value)

xuxzh1's avatar
init  
xuxzh1 committed
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
        kv_data = bytearray()

        if add_vtype:
            kv_data += self._pack("I", vtype)

        pack_fmt = self._simple_value_packing.get(vtype)
        if pack_fmt is not None:
            kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:

            if not isinstance(val, Sequence):
                raise ValueError("Invalid GGUF metadata array, expecting sequence")

            if len(val) == 0:
                raise ValueError("Invalid GGUF metadata array. Empty array")

            if isinstance(val, bytes):
                ltype = GGUFValueType.UINT8
            else:
                ltype = GGUFValueType.get_type(val[0])
                if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                    raise ValueError("All items in a GGUF array should be of the same type")
            kv_data += self._pack("I", ltype)
            kv_data += self._pack("Q", len(val))
            for item in val:
                kv_data += self._pack_val(item, ltype, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type or value")

        return kv_data

    @staticmethod
    def format_n_bytes_to_str(num: int) -> str:
        if num == 0:
            return "negligible - metadata only"
        fnum = float(num)
        for unit in ("", "K", "M", "G"):
            if abs(fnum) < 1000.0:
                return f"{fnum:3.1f}{unit}"
            fnum /= 1000.0
        return f"{fnum:.1f}T - over 1TB, split recommended"