Merge branch 'v0.8.2-ori-wm' into 'v0.8.2-ori'

[fix]修复模型注册失败及其他报错 See merge request dcutoolkit/deeplearing/vllm!92

Merge branch 'v0.8.2-ori-wm' into 'v0.8.2-ori'
[fix]修复模型注册失败及其他报错 See merge request dcutoolkit/deeplearing/vllm!92
77f7bb45 · zhuwenwen · 31f6b24f · 1a397b82 · 77f7bb45 · 77f7bb45
Commit 77f7bb45 authored Apr 01, 2025 by zhuwenwen
3 changed files
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -815,7 +815,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
            expert_ids,
            num_tokens_post_padded,
            B.shape[1] if not use_nn_moe else B.shape[2],
-            A.shape[2],
+            A.shape[1],
            EM,
            topk_ids.numel(),
            A.stride(0),
@@ -1178,7 +1178,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
                          w2: torch.Tensor,
                          topk_weights: torch.Tensor,
                          topk_ids: torch.Tensor,
-                          activation: str = "silu",
+                          activation: Optional[str] = None,
                          use_fp8_w8a8: bool = False,
                          use_int8_w8a16: bool = False,
                          use_int4_w4a16: bool = False,
@@ -1205,7 +1205,7 @@ def inplace_fused_experts_fake(
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
-        activation: str = "silu",
+        activation: Optional[str] = None,
        use_fp8_w8a8: bool = False,
        use_int8_w8a16: bool = False,
        use_int4_w4a16: bool = False,
@@ -1218,7 +1218,7 @@ def inplace_fused_experts_fake(
        a1_scale: Optional[torch.Tensor] = None,
        a2_scale: Optional[torch.Tensor] = None,
        block_shape: Optional[List[int]] = None,
-        use_nn_moe: Optional[bool] = False,) -> None:
+        use_nn_moe: Optional[bool] = False) -> None:
    pass


@@ -1236,7 +1236,7 @@ def outplace_fused_experts(
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
-        activation: str = "silu",
+        activation: Optional[str] = None,
        use_fp8_w8a8: bool = False,
        use_int8_w8a16: bool = False,
        use_int4_w4a16: bool = False,
@@ -1263,7 +1263,7 @@ def outplace_fused_experts_fake(
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
-        activation: str = "silu",
+        activation: Optional[str] = None,
        use_fp8_w8a8: bool = False,
        use_int8_w8a16: bool = False,
        use_int4_w4a16: bool = False,

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -685,7 +685,7 @@ class FusedMoE(torch.nn.Module):
        # is_transposed: if the dim to shard the weight
        # should be flipped. Required by GPTQ, compressed-tensors
        # should be whatever dimension intermediate_size_per_partition is
-        s_transposed = getattr(param, "is_transposed", False) or self.use_nn_moe
+        is_transposed  = getattr(param, "is_transposed", False) or self.use_nn_moe
        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
        if is_transposed:
            shard_dim = int(not shard_dim)

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -61,12 +61,12 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
 }

 # Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
-if "HIP_VISIBLE_DEVICES" in os.environ:
-    val = os.environ["HIP_VISIBLE_DEVICES"]
-    if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
-        assert val == cuda_val
-    else:
-        os.environ["CUDA_VISIBLE_DEVICES"] = val
+# if "HIP_VISIBLE_DEVICES" in os.environ:
+#     val = os.environ["HIP_VISIBLE_DEVICES"]
+#     if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
+#         assert val == cuda_val
+#     else:
+#         os.environ["CUDA_VISIBLE_DEVICES"] = val

 # AMDSMI utils
 # Note that NVML is not affected by `{CUDA/HIP}_VISIBLE_DEVICES`,