Update DeepSeek-V2 AWQ handling and MoE int4 benchmark

96802ca0 · zhuwenwen · 4d4c6fe3 · 96802ca0 · 96802ca0
Commit 96802ca0 authored Feb 26, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 8 additions and 10 deletions

benchmarks/kernels/benchmark_moe_int4.py +5 -5

vllm/model_executor/models/deepseek_v2.py +3 -5

No files found.
--- a/benchmarks/kernels/benchmark_moe_int4.py
+++ b/benchmarks/kernels/benchmark_moe_int4.py
@@ -161,7 +161,7 @@ def benchmark_config(
        nn_moe = False 
        block_shape=[0, group_size]
    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
-    print(f"shape: {x.shape[0]} | config: {config}")
+
    def prepare(i: int):
        input_gating.copy_(gating_output[i])

@@ -187,6 +187,7 @@ def benchmark_config(
                a2_scale=a2_scale,
                use_nn_moe=nn_moe,
                block_shape=block_shape,
+                moe_ep_size=1,
            )

    # JIT compilation & warmup
@@ -221,8 +222,7 @@ def benchmark_config(
        end_event.record()
        end_event.synchronize()
        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    print(f"avg: {avg}")
+    avg = sum(latencies) / (num_iters) * 1000  # us

    # graph.reset()
    return avg
@@ -694,7 +694,7 @@ if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser.add_argument("--model",
                        type=str,
-                        default="/home/yang/llm-models/vllm-awq-models/DeepSeek-R1-AWQ/")
+                        default="")
    parser.add_argument("--tp-size",
                        "-tp",
                        "--tensor-parallel-size",

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -666,8 +666,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
-        # 暂时awq不支持cutlass
-        envs.VLLM_USE_TRITON_AWQ = True
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config

@@ -875,13 +873,13 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
                    weight.data.copy_(_weight)
                    
                    weight.data=weight.data.reshape(ori_shape[1],-1)
-        # 暂时不支持TN   
-        if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
+
+        if self.config.quantization_config["quant_method"] == "awq" and not envs.VLLM_USE_TRITON_AWQ:
            lay_key_words = [
                "self_attn.q_a_proj.qweight",
                "self_attn.q_b_proj.qweight",
-                "self_attn.kv_a_proj_with_mqa.qweight",
                "self_attn.kv_b_proj.qweight",
+                "self_attn.kv_a_proj_with_mqa.qweight",
                "self_attn.o_proj.qweight",
                "mlp.gate_up_proj.qweight",
                "mlp.down_proj.qweight",