"vscode:/vscode.git/clone" did not exist on "8d32dc603d03fa2d9b2fe538fbc53ef049433762"
Unverified Commit 07728bf5 authored by JartX's avatar JartX Committed by GitHub
Browse files

[BugFix] add select_gemm_impl on CompressedTensorsWNA16MoEMethod to support LoRA (#31453)


Signed-off-by: JartX <sagformas@epdcenter.es>
parent 3f52fa5a
...@@ -1996,6 +1996,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): ...@@ -1996,6 +1996,29 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
block_shape=[0, self.group_size], block_shape=[0, self.group_size],
) )
def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    """Select the experts implementation for ``layer``.

    Only the LoRA-enabled configuration is handled here. In that case the
    packed weights on ``layer`` are re-exposed under the ``w13_weight`` /
    ``w2_weight`` attribute names and a ``TritonExperts`` instance built from
    ``self.moe_quant_config`` is returned.

    Args:
        prepare_finalize: Not used by this implementation.
        layer: Module that carries ``w13_weight_packed`` and
            ``w2_weight_packed``. It is mutated in place: ``w13_weight`` and
            ``w2_weight`` are set to reference those packed tensors.

    Returns:
        A ``TritonExperts`` constructed with ``self.moe_quant_config``.

    Raises:
        NotImplementedError: If LoRA is not enabled for this MoE, or if LoRA
            is enabled but Triton is unavailable.
        AssertionError: If LoRA is enabled but ``self.moe_quant_config`` has
            not been set.
    """
    if not self.moe.is_lora_enabled:
        # No modular implementation is provided outside the LoRA path; say so
        # explicitly instead of raising a message-less NotImplementedError.
        raise NotImplementedError(
            f"{type(self).__name__}.select_gemm_impl is only implemented "
            "when LoRA is enabled for MoE."
        )

    assert self.moe_quant_config is not None, (
        "moe_quant_config must be set before selecting a GEMM implementation"
    )

    # Imported lazily, as in the original code, so the Triton-dependent
    # modules are only touched on the LoRA path.
    from vllm.triton_utils import HAS_TRITON

    if not HAS_TRITON:
        raise NotImplementedError(
            "TritonExperts requires Triton. "
            "Install triton or disable LoRA for MoE."
        )

    from vllm.model_executor.layers.fused_moe import TritonExperts

    # NOTE(review): presumably the Triton/LoRA path looks the weights up as
    # `w13_weight` / `w2_weight`; confirm against TritonExperts' consumers.
    layer.w13_weight = layer.w13_weight_packed
    layer.w2_weight = layer.w2_weight_packed
    return TritonExperts(quant_config=self.moe_quant_config)
def apply( def apply(
self, self,
layer: FusedMoE, layer: FusedMoE,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment