[Bugfix] Fix W4A8_FP8 MoE tp>1 correctness and view() TypeError (#40310)

Signed-off-by: EdalatiAli <aliedalati@cohere.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>

[Bugfix] Fix W4A8_FP8 MoE tp>1 correctness and view() TypeError (#40310)
Signed-off-by: EdalatiAli <aliedalati@cohere.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
f946659f · EdalatiAli · GitHub · f90aa446 · f946659f · f946659f
Unverified Commit f946659f authored Apr 21, 2026 by EdalatiAli Committed by GitHub Apr 21, 2026
2 changed files
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py
@@ -198,11 +198,15 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
        # encode and reorder weight tensors, and get the layout to pass to
        # the grouped gemm kernel. `b_strides1/2` specifies the entire layout
        convert_packed_uint4b8_to_signed_int4_inplace(layer.w13_weight_packed)
+        # mirror the sync in CutlassW4A8LinearKernel; required for tp>1 correctness
+        torch.accelerator.synchronize()
        w13_weight_shuffled, self.b_strides1 = (
            ops.cutlass_encode_and_reorder_int4b_grouped(layer.w13_weight_packed)
        )
        replace_parameter(layer, "w13_weight_packed", w13_weight_shuffled)
        convert_packed_uint4b8_to_signed_int4_inplace(layer.w2_weight_packed)
+        # mirror the sync in CutlassW4A8LinearKernel; required for tp>1 correctness
+        torch.accelerator.synchronize()
        w2_weight_shuffled, self.b_strides2 = (
            ops.cutlass_encode_and_reorder_int4b_grouped(layer.w2_weight_packed)
        )

--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -818,7 +818,7 @@ def convert_bf16_scales_to_fp8(
    # restore original shape
    fp8_scales = fp8_scales.view(orig_shape)
-    chan_scales = chan_scales.view(orig_shape[:-1], -1)
+    chan_scales = chan_scales.view(*orig_shape[:-1], -1)
    return fp8_scales, chan_scales