Fix GGUF dequantize for `gguf==0.9.1` (#32298)

* fix gguf dequantize for gguf==0.9.1 * fix old version * make style

Fix GGUF dequantize for `gguf==0.9.1` (#32298)
* fix gguf dequantize for gguf==0.9.1 * fix old version * make style
934fe150 · Isotr0py · GitHub · 3e8106d2 · 934fe150 · 934fe150
Unverified Commit 934fe150 authored Jul 30, 2024 by Isotr0py Committed by GitHub Jul 30, 2024
Showing with 25 additions and 23 deletions

src/transformers/integrations/ggml.py src/transformers/integrations/ggml.py +22 -22

src/transformers/modeling_gguf_pytorch_utils.py src/transformers/modeling_gguf_pytorch_utils.py +3 -1

No files found.
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -217,13 +217,13 @@ def _gguf_parse_value(_value, data_type):
    return _value
-def dequantize_q4_k(data):
+def dequantize_q4_k(data, n_bytes: int):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
    block_size = GGML_BLOCK_SIZES["Q4_K"]
-    num_blocks = len(data) // block_size
+    num_blocks = n_bytes // block_size
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
@@ -248,13 +248,13 @@ def dequantize_q4_k(data):
    return factors * qs2 - offsets
-def dequantize_q4_0(data):
+def dequantize_q4_0(data, n_bytes: int):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1086
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L11
    block_size = GGML_BLOCK_SIZES["Q4_0"]
-    num_blocks = len(data) // block_size
+    num_blocks = n_bytes // block_size
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
@@ -274,13 +274,13 @@ def dequantize_q4_0(data):
    return (scales * quants).astype(np.float32)
-def dequantize_q6_k(data):
+def dequantize_q6_k(data, n_bytes: int):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152
    block_size = GGML_BLOCK_SIZES["Q6_K"]
-    num_blocks = len(data) // block_size
+    num_blocks = n_bytes // block_size
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
@@ -327,11 +327,11 @@ def dequantize_q6_k(data):
    )
-def dequantize_q8_0(data):
+def dequantize_q8_0(data, n_bytes: int):
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
    block_size = GGML_BLOCK_SIZES["Q8_0"]
-    num_blocks = len(data) // block_size
+    num_blocks = n_bytes // block_size
    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32)
    qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
@@ -339,12 +339,12 @@ def dequantize_q8_0(data):
    return scales * qs
-def dequantize_q2_k(data):
+def dequantize_q2_k(data, n_bytes: int):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74
-    num_blocks = len(data) // GGML_BLOCK_SIZES["Q2_K"]
+    num_blocks = n_bytes // GGML_BLOCK_SIZES["Q2_K"]
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"])
@@ -379,12 +379,12 @@ def dequantize_q2_k(data):
    return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
-def dequantize_q3_k(data):
+def dequantize_q3_k(data, n_bytes: int):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95
-    num_blocks = len(data) // GGML_BLOCK_SIZES["Q3_K"]
+    num_blocks = n_bytes // GGML_BLOCK_SIZES["Q3_K"]
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"])
@@ -428,12 +428,12 @@ def dequantize_q3_k(data):
    )
-def dequantize_q5_k(data):
+def dequantize_q5_k(data, n_bytes: int):
    # C implementation
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129
    # C struct definition
    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138
-    num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_K"]
+    num_blocks = n_bytes // GGML_BLOCK_SIZES["Q5_K"]
    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2)
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"])
@@ -487,25 +487,25 @@ def dequantize_q5_k(data):
    )
-def load_dequant_gguf_tensor(shape, ggml_type, data):
+def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes):
    if ggml_type == GGML_TYPES["F32"]:
        values = data
    elif ggml_type == GGML_TYPES["F16"]:
        values = data
    elif ggml_type == GGML_TYPES["Q8_0"]:
-        values = dequantize_q8_0(data)
+        values = dequantize_q8_0(data, n_bytes)
    elif ggml_type == GGML_TYPES["Q4_0"]:
-        values = dequantize_q4_0(data)
+        values = dequantize_q4_0(data, n_bytes)
    elif ggml_type == GGML_TYPES["Q4_K"]:
-        values = dequantize_q4_k(data)
+        values = dequantize_q4_k(data, n_bytes)
    elif ggml_type == GGML_TYPES["Q6_K"]:
-        values = dequantize_q6_k(data)
+        values = dequantize_q6_k(data, n_bytes)
    elif ggml_type == GGML_TYPES["Q2_K"]:
-        values = dequantize_q2_k(data)
+        values = dequantize_q2_k(data, n_bytes)
    elif ggml_type == GGML_TYPES["Q3_K"]:
-        values = dequantize_q3_k(data)
+        values = dequantize_q3_k(data, n_bytes)
    elif ggml_type == GGML_TYPES["Q5_K"]:
-        values = dequantize_q5_k(data)
+        values = dequantize_q5_k(data, n_bytes)
    else:
        raise NotImplementedError(
            f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose"

--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -145,7 +145,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
            shape = tensor.shape
            name = tensor.name
-            weights = load_dequant_gguf_tensor(shape=shape, ggml_type=tensor.tensor_type, data=tensor.data)
+            weights = load_dequant_gguf_tensor(
+                shape=shape, ggml_type=tensor.tensor_type, data=tensor.data, n_bytes=int(tensor.n_bytes)
+            )
            if architecture == "llama" and (".attn_k." in name or ".attn_q." in name):
                num_heads = parsed_parameters["config"]["num_attention_heads"]