change/sglang · Commit 2695ab05 (Unverified)

Fix loading KV quantization scale; Enable modelopt kv cache (#4686)

Authored Apr 08, 2025 by Yun Dai; committed via GitHub on Apr 08, 2025.
Co-authored-by: qingquansong <ustcsqq@gmail.com>
Parent: 88d6fd9a
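The point of the change: each model's attention layer constructs a RadixAttention, and until now most constructors never forwarded quant_config, so checkpoint-side KV-cache scaling factors (such as those in NVIDIA ModelOpt FP8 exports) had no parameters to load into. Below is a minimal sketch of the mechanism, assuming illustrative names (ToyQuantConfig, ToyKVCacheMethod, ToyAttention); it is not sglang's actual implementation:

import torch
from torch import nn


class ToyKVCacheMethod:
    """Illustrative stand-in for a per-layer KV-cache quantization method."""

    def create_weights(self, layer: nn.Module) -> None:
        # Register the scale parameters that the checkpoint loader will fill.
        # A default of 1.0 means "no scaling" when the checkpoint has none.
        layer.k_scale = nn.Parameter(torch.tensor(1.0), requires_grad=False)
        layer.v_scale = nn.Parameter(torch.tensor(1.0), requires_grad=False)


class ToyQuantConfig:
    """Illustrative stand-in for a quantization config (e.g. ModelOpt FP8)."""

    def get_quant_method(self, layer: nn.Module, prefix: str):
        return ToyKVCacheMethod()


class ToyAttention(nn.Module):
    """Without quant_config, k_scale / v_scale are never created, so a
    checkpoint entry like 'model.layers.0.self_attn.attn.k_scale' has no
    parameter to load into; that is the gap this commit closes."""

    def __init__(self, layer_id: int, quant_config=None, prefix: str = ""):
        super().__init__()
        self.layer_id = layer_id
        if quant_config is not None:
            method = quant_config.get_quant_method(self, prefix=prefix)
            if method is not None:
                method.create_weights(self)


attn = ToyAttention(layer_id=0, quant_config=ToyQuantConfig())
print(attn.k_scale.item(), attn.v_scale.item())  # scales exist for loading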
Showing 18 changed files with 18 additions and 5 deletions (+18 −5).
python/sglang/srt/models/internlm2.py       +1 −0
python/sglang/srt/models/llama.py           +1 −0
python/sglang/srt/models/minicpm.py         +1 −0
python/sglang/srt/models/minicpm3.py        +2 −0
python/sglang/srt/models/mixtral.py         +1 −0
python/sglang/srt/models/mixtral_quant.py   +1 −0
python/sglang/srt/models/mllama.py          +1 −0
python/sglang/srt/models/olmo.py            +1 −0
python/sglang/srt/models/olmo2.py           +1 −0
python/sglang/srt/models/olmoe.py           +1 −0
python/sglang/srt/models/phi3_small.py      +1 −0
python/sglang/srt/models/qwen.py            +1 −0
python/sglang/srt/models/qwen2.py           +1 −0
python/sglang/srt/models/qwen2_moe.py       +1 −0
python/sglang/srt/models/stablelm.py        +1 −0
python/sglang/srt/models/xverse.py          +1 −0
python/sglang/srt/models/xverse_moe.py      +1 −0
python/sglang/test/test_utils.py            +0 −5
python/sglang/srt/models/internlm2.py

@@ -145,6 +145,7 @@ class InternLM2Attention(nn.Module):
             self.scaling,
             self.num_kv_heads,
             layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
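The sixteen model files that follow repeat this one-line addition; only the surrounding keyword arguments differ from class to class (and minicpm3.py applies it twice, once for the regular attention and once for the MLA variant). The shared shape of the change:

self.attn = RadixAttention(
    ...,                                # per-model positional arguments
    layer_id=layer_id,
    quant_config=quant_config,          # the line this commit adds
    prefix=add_prefix("attn", prefix),
)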
python/sglang/srt/models/llama.py

@@ -170,6 +170,7 @@ class LlamaAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/minicpm.py

@@ -146,6 +146,7 @@ class MiniCPMAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/minicpm3.py

@@ -192,6 +192,7 @@ class MiniCPM3Attention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )

@@ -343,6 +344,7 @@ class MiniCPM3AttentionMLA(nn.Module):
             num_kv_heads=1,
             layer_id=layer_id,
             v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/mixtral.py

@@ -169,6 +169,7 @@ class MixtralAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/mixtral_quant.py

@@ -232,6 +232,7 @@ class MixtralAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/mllama.py

@@ -535,6 +535,7 @@ class MllamaTextCrossAttention(nn.Module):
             self.num_local_key_value_heads,
             layer_id=layer_id,
             is_cross_attention=True,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/olmo.py

@@ -93,6 +93,7 @@ class OlmoAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/olmo2.py

@@ -118,6 +118,7 @@ class Olmo2Attention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/olmoe.py

@@ -170,6 +170,7 @@ class OlmoeAttention(nn.Module):
             self.scaling,
             layer_id=layer_id,
             num_kv_heads=self.num_kv_heads,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/phi3_small.py

@@ -202,6 +202,7 @@ class Phi3SmallSelfAttention(nn.Module):
             self.scale,
             num_kv_heads=self.num_kv_heads_per_partion,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/qwen.py

@@ -133,6 +133,7 @@ class QWenAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/qwen2.py

@@ -154,6 +154,7 @@ class Qwen2Attention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/qwen2_moe.py

@@ -231,6 +231,7 @@ class Qwen2MoeAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/stablelm.py

@@ -149,6 +149,7 @@ class StablelmAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_key_value_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/xverse.py

@@ -153,6 +153,7 @@ class XverseAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/srt/models/xverse_moe.py

@@ -252,6 +252,7 @@ class XverseAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
python/sglang/test/test_utils.py

@@ -37,11 +37,6 @@ DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
 DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
-# TODO(yundai424): right now specifying to an older revision since the latest one
-# carries kv cache quantization which doesn't work yet
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
-    "13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
-)
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
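The test change is the payoff: the removed TODO had pinned the ModelOpt FP8 checkpoint to an older revision because newer revisions carry KV-cache scales that previously failed to load. With the fix, the pin and its constant can go. A hypothetical smoke run, assuming sglang's offline Engine API and its kv_cache_dtype argument, neither of which is part of this diff:

import sglang as sgl

# Latest revision of the ModelOpt FP8 checkpoint, no pinned commit hash.
llm = sgl.Engine(
    model_path="nvidia/Llama-3.1-8B-Instruct-FP8",
    kv_cache_dtype="fp8_e4m3",  # consume the checkpoint's k/v scales
)
out = llm.generate("The capital of France is", {"max_new_tokens": 8})
print(out)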