OpenDAS / ollama

Commit ef378ad6 (unverified)
gemma3 quantization (#9776)
Authored Mar 14, 2025 by Patrick Devine; committed by GitHub, Mar 14, 2025
Parent: 2d2247e5
Showing 5 changed files, with 149 additions and 0 deletions:

llama/llama.cpp/src/llama-arch.cpp             +19  -0
llama/llama.cpp/src/llama-arch.h                +1  -0
llama/llama.cpp/src/llama-model.cpp             +7  -0
llama/llama.cpp/src/llama-quant.cpp             +9  -0
llama/patches/0021-gemma3-quantization.patch  +113  -0
llama/llama.cpp/src/llama-arch.cpp

@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3,        "minicpm3"   },
     { LLM_ARCH_GEMMA,           "gemma"      },
     { LLM_ARCH_GEMMA2,          "gemma2"     },
+    { LLM_ARCH_GEMMA3,          "gemma3"     },
     { LLM_ARCH_STARCODER2,      "starcoder2" },
     { LLM_ARCH_MAMBA,           "mamba"      },
     { LLM_ARCH_XVERSE,          "xverse"     },
@@ -804,6 +805,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
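The `%d` in each pattern above is a layer-index placeholder: when a tensor is resolved, the block number is substituted and a suffix such as "weight" is appended. As a rough illustration only (`format_tensor_name` below is a hypothetical stand-in, not llama.cpp's actual tn()/LLM_TN helper):

```cpp
// Simplified stand-in for how a "blk.%d...." pattern plus a suffix becomes a
// concrete tensor name. Hypothetical helper for illustration; llama.cpp uses
// its own LLM_TN/tn() machinery for this.
#include <cstdio>
#include <string>

std::string format_tensor_name(const char * pattern, int layer, const char * suffix) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, layer);  // "blk.%d.attn_q" -> "blk.0.attn_q"
    return std::string(buf) + "." + suffix;           // append ".weight" / ".bias"
}

// e.g. format_tensor_name("blk.%d.attn_q", 0, "weight") -> "blk.0.attn_q.weight"
```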
llama/llama.cpp/src/llama-arch.h

@@ -41,6 +41,7 @@ enum llm_arch {
     LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
+    LLM_ARCH_GEMMA3,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
llama/llama.cpp/src/llama-model.cpp

@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2537,6 +2540,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_GEMMA3:
+            {
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4029,6 +4035,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_GEMMA3:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
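Both new `case LLM_ARCH_GEMMA3:` bodies are intentionally empty: they make the loader accept the architecture without reading any gemma3-specific hyperparameters or tensors, which is presumably all this quantization-focused change requires. A minimal sketch of the stub-case pattern, with hypothetical types rather than the real loader:

```cpp
// Sketch of the stub-case pattern, under assumed (not llama.cpp's real)
// types: an explicit empty case accepts the new architecture, while an
// unlisted architecture still falls through to the default error path.
#include <stdexcept>

enum llm_arch { LLM_ARCH_GEMMA2, LLM_ARCH_GEMMA3, LLM_ARCH_UNKNOWN };

void load_hparams(llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_GEMMA2:
            // ... read gemma2-specific keys ...
            break;
        case LLM_ARCH_GEMMA3:
            {
                // intentionally empty: nothing extra needed for quantization
            } break;
        default:
            throw std::runtime_error("unsupported model architecture");
    }
}
```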
llama/llama.cpp/src/llama-quant.cpp

@@ -737,6 +737,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

+        // don't quantize vision stuff
+        quantize &= name.find("v.blk.") == std::string::npos;
+
+        quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos;
+        quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos;
+        quantize &= name.find("v.patch_embedding.weight") == std::string::npos;
+        quantize &= name.find("v.position_embedding.weight") == std::string::npos;
+        quantize &= name.find("v.post_layernorm.weight") == std::string::npos;
+
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
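Pulled out as a standalone predicate for illustration (`should_quantize` is our name; the real code sets a local `quantize` flag inline, as above), the filter amounts to: keep only tensors whose name ends in "weight", skip the gemma3 vision tower and multimodal projector, and quantize only 2D/3D tensors.

```cpp
// Standalone sketch of the name filter added above. Extracted for
// illustration; should_quantize is not a llama.cpp function.
#include <array>
#include <string>

bool should_quantize(const std::string & name, int n_dims) {
    // ends with "weight"? (rfind is used upstream to avoid <regex>)
    bool quantize = name.size() >= 6 && name.rfind("weight") == name.size() - 6;

    // skip vision-tower and multimodal-projector tensors
    static const std::array<const char *, 6> skip = {
        "v.blk.",
        "mm.mm_input_projection.weight",
        "mm.mm_soft_emb_norm.weight",
        "v.patch_embedding.weight",
        "v.position_embedding.weight",
        "v.post_layernorm.weight",
    };
    for (const char * s : skip) {
        quantize &= name.find(s) == std::string::npos;
    }

    // quantize only 2D and 3D tensors (experts)
    quantize &= n_dims >= 2;
    return quantize;
}

// e.g. should_quantize("blk.0.attn_q.weight", 2)   -> true
//      should_quantize("v.blk.0.attn_q.weight", 2) -> false (vision tower)
```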
llama/patches/0021-gemma3-quantization.patch (new file, mode 100644)
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Patrick Devine <patrick@infrahq.com>
Date: Fri, 14 Mar 2025 16:33:23 -0700
Subject: [PATCH] gemma3 quantization
---
src/llama-arch.cpp | 19 +++++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 7 +++++++
src/llama-quant.cpp | 9 +++++++++
4 files changed, 36 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b6f20286..b443fcd3 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MINICPM3, "minicpm3" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -804,6 +805,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
+ {
+ LLM_ARCH_GEMMA3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
{
LLM_ARCH_STARCODER2,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index ec742224..aad92a5d 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -41,6 +41,7 @@ enum llm_arch {
LLM_ARCH_MINICPM3,
LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2,
+ LLM_ARCH_GEMMA3,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ab1a07d1..70183041 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_GEMMA3:
+ {
+ } break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2537,6 +2540,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_GEMMA3:
+ {
+ } break;
case LLM_ARCH_STARCODER2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4029,6 +4035,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHIMOE:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
+ case LLM_ARCH_GEMMA3:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 6eb1da08..d2f3a510 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -737,6 +737,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+ // don't quantize vision stuff
+ quantize &= name.find("v.blk.") == std::string::npos;
+
+ quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos;
+ quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos;
+ quantize &= name.find("v.patch_embedding.weight") == std::string::npos;
+ quantize &= name.find("v.position_embedding.weight") == std::string::npos;
+ quantize &= name.find("v.post_layernorm.weight") == std::string::npos;
+
// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);