"tests/encoder_decoder/test_modeling_encoder_decoder.py" did not exist on "bc0d26d1dea73b23f6e388c18709287d5423a2d8"
Unverified Commit 934fe150 authored by Isotr0py's avatar Isotr0py Committed by GitHub
Browse files

Fix GGUF dequantize for `gguf==0.9.1` (#32298)

* fix gguf dequantize for gguf==0.9.1

* fix old version

* make style
parent 3e8106d2
...@@ -217,13 +217,13 @@ def _gguf_parse_value(_value, data_type): ...@@ -217,13 +217,13 @@ def _gguf_parse_value(_value, data_type):
return _value return _value
def dequantize_q4_k(data): def dequantize_q4_k(data, n_bytes: int):
# C implementation # C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
block_size = GGML_BLOCK_SIZES["Q4_K"] block_size = GGML_BLOCK_SIZES["Q4_K"]
num_blocks = len(data) // block_size num_blocks = n_bytes // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
...@@ -248,13 +248,13 @@ def dequantize_q4_k(data): ...@@ -248,13 +248,13 @@ def dequantize_q4_k(data):
return factors * qs2 - offsets return factors * qs2 - offsets
def dequantize_q4_0(data): def dequantize_q4_0(data, n_bytes: int):
# C implementation # C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1086 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1086
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L11 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L11
block_size = GGML_BLOCK_SIZES["Q4_0"] block_size = GGML_BLOCK_SIZES["Q4_0"]
num_blocks = len(data) // block_size num_blocks = n_bytes // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
...@@ -274,13 +274,13 @@ def dequantize_q4_0(data): ...@@ -274,13 +274,13 @@ def dequantize_q4_0(data):
return (scales * quants).astype(np.float32) return (scales * quants).astype(np.float32)
def dequantize_q6_k(data): def dequantize_q6_k(data, n_bytes: int):
# C implementation # C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152
block_size = GGML_BLOCK_SIZES["Q6_K"] block_size = GGML_BLOCK_SIZES["Q6_K"]
num_blocks = len(data) // block_size num_blocks = n_bytes // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
...@@ -327,11 +327,11 @@ def dequantize_q6_k(data): ...@@ -327,11 +327,11 @@ def dequantize_q6_k(data):
) )
def dequantize_q8_0(data): def dequantize_q8_0(data, n_bytes: int):
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
block_size = GGML_BLOCK_SIZES["Q8_0"] block_size = GGML_BLOCK_SIZES["Q8_0"]
num_blocks = len(data) // block_size num_blocks = n_bytes // block_size
scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32) scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32)
qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:] qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
...@@ -339,12 +339,12 @@ def dequantize_q8_0(data): ...@@ -339,12 +339,12 @@ def dequantize_q8_0(data):
return scales * qs return scales * qs
def dequantize_q2_k(data): def dequantize_q2_k(data, n_bytes: int):
# C implementation # C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74
num_blocks = len(data) // GGML_BLOCK_SIZES["Q2_K"] num_blocks = n_bytes // GGML_BLOCK_SIZES["Q2_K"]
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2) data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"]) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"])
...@@ -379,12 +379,12 @@ def dequantize_q2_k(data): ...@@ -379,12 +379,12 @@ def dequantize_q2_k(data):
return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4) return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
def dequantize_q3_k(data): def dequantize_q3_k(data, n_bytes: int):
# C implementation # C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95
num_blocks = len(data) // GGML_BLOCK_SIZES["Q3_K"] num_blocks = n_bytes // GGML_BLOCK_SIZES["Q3_K"]
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2) data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"]) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"])
...@@ -428,12 +428,12 @@ def dequantize_q3_k(data): ...@@ -428,12 +428,12 @@ def dequantize_q3_k(data):
) )
def dequantize_q5_k(data): def dequantize_q5_k(data, n_bytes: int):
# C implementation # C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129
# C struct definition # C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138 # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138
num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_K"] num_blocks = n_bytes // GGML_BLOCK_SIZES["Q5_K"]
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2) data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"]) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"])
...@@ -487,25 +487,25 @@ def dequantize_q5_k(data): ...@@ -487,25 +487,25 @@ def dequantize_q5_k(data):
) )
def load_dequant_gguf_tensor(shape, ggml_type, data): def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes):
if ggml_type == GGML_TYPES["F32"]: if ggml_type == GGML_TYPES["F32"]:
values = data values = data
elif ggml_type == GGML_TYPES["F16"]: elif ggml_type == GGML_TYPES["F16"]:
values = data values = data
elif ggml_type == GGML_TYPES["Q8_0"]: elif ggml_type == GGML_TYPES["Q8_0"]:
values = dequantize_q8_0(data) values = dequantize_q8_0(data, n_bytes)
elif ggml_type == GGML_TYPES["Q4_0"]: elif ggml_type == GGML_TYPES["Q4_0"]:
values = dequantize_q4_0(data) values = dequantize_q4_0(data, n_bytes)
elif ggml_type == GGML_TYPES["Q4_K"]: elif ggml_type == GGML_TYPES["Q4_K"]:
values = dequantize_q4_k(data) values = dequantize_q4_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q6_K"]: elif ggml_type == GGML_TYPES["Q6_K"]:
values = dequantize_q6_k(data) values = dequantize_q6_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q2_K"]: elif ggml_type == GGML_TYPES["Q2_K"]:
values = dequantize_q2_k(data) values = dequantize_q2_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q3_K"]: elif ggml_type == GGML_TYPES["Q3_K"]:
values = dequantize_q3_k(data) values = dequantize_q3_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q5_K"]: elif ggml_type == GGML_TYPES["Q5_K"]:
values = dequantize_q5_k(data) values = dequantize_q5_k(data, n_bytes)
else: else:
raise NotImplementedError( raise NotImplementedError(
f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose" f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose"
......
...@@ -145,7 +145,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ...@@ -145,7 +145,9 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
shape = tensor.shape shape = tensor.shape
name = tensor.name name = tensor.name
weights = load_dequant_gguf_tensor(shape=shape, ggml_type=tensor.tensor_type, data=tensor.data) weights = load_dequant_gguf_tensor(
shape=shape, ggml_type=tensor.tensor_type, data=tensor.data, n_bytes=int(tensor.n_bytes)
)
if architecture == "llama" and (".attn_k." in name or ".attn_q." in name): if architecture == "llama" and (".attn_k." in name or ".attn_q." in name):
num_heads = parsed_parameters["config"]["num_attention_heads"] num_heads = parsed_parameters["config"]["num_attention_heads"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment