Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
1693e754
Commit
1693e754
authored
Nov 14, 2025
by
yiqa
Browse files
使用groupgemm完成高吞吐模式适配 (Use grouped GEMM to implement high-throughput mode adaptation)
parent
ce363e89
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
6 additions
and
6 deletions
+6
-6
python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_marlin.py
...ntization/compressed_tensors/compressed_tensors_marlin.py
+6
-6
No files found.
python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_marlin.py
View file @
1693e754
...
...
@@ -42,7 +42,7 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
sparsity_ignore_list
:
list
[
str
],
kv_cache_scheme
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
config
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
packed_modules_mapping
:
Optional
[
dict
[
str
,
list
[
str
]]]
=
None
,
packed_modules_mapping
:
Optional
[
dict
[
str
,
list
[
str
]]]
=
None
,
):
super
().
__init__
(
target_scheme_map
,
...
...
@@ -52,10 +52,10 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
sparsity_ignore_list
,
kv_cache_scheme
,
config
,
packed_modules_mapping
,
packed_modules_mapping
,
)
@
classmethod
def
override_quantization_method
(
cls
,
hf_quant_cfg
,
user_quant
)
->
Optional
[
str
]:
...
...
@@ -73,7 +73,7 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
prefix
:
str
,
)
->
Optional
[
"QuantizeMethodBase"
]:
from
sglang.srt.layers.moe.fused_moe_triton.layer
import
FusedMoE
# Avoid circular import
#
from sglang.srt.layers.radix_attention import RadixAttention
from
sglang.srt.layers.radix_attention
import
RadixAttention
# Check if the layer is skipped for quantization.
if
should_ignore_layer
(
prefix
,
ignore
=
self
.
ignore
,
...
...
@@ -85,8 +85,8 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
return
UnquantizedEmbeddingMethod
()
#UnquantizedLinearMethod()
layer
.
scheme
=
scheme
return
CompressedTensorsLinearMethod
(
self
)
#
if isinstance(layer, RadixAttention):
#
return CompressedTensorsKVCacheMethod(self)
if
isinstance
(
layer
,
RadixAttention
):
return
CompressedTensorsKVCacheMethod
(
self
)
if
isinstance
(
layer
,
FusedMoE
):
return
CompressedTensorsMarlinMoEMethod
.
get_moe_method
(
self
,
layer
)
return
None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment