Commit 96802ca0 authored by zhuwenwen
Browse files

update deepseek-v2

parent 4d4c6fe3
......@@ -161,7 +161,7 @@ def benchmark_config(
nn_moe = False
block_shape=[0, group_size]
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
print(f"shape: {x.shape[0]} | config: {config}")
def prepare(i: int):
input_gating.copy_(gating_output[i])
......@@ -187,6 +187,7 @@ def benchmark_config(
a2_scale=a2_scale,
use_nn_moe=nn_moe,
block_shape=block_shape,
moe_ep_size=1,
)
# JIT compilation & warmup
......@@ -221,8 +222,7 @@ def benchmark_config(
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000 # us
print(f"avg: {avg}")
avg = sum(latencies) / (num_iters) * 1000 # us
# graph.reset()
return avg
......@@ -694,7 +694,7 @@ if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--model",
type=str,
default="/home/yang/llm-models/vllm-awq-models/DeepSeek-R1-AWQ/")
default="")
parser.add_argument("--tp-size",
"-tp",
"--tensor-parallel-size",
......
......@@ -666,8 +666,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
# 暂时awq不支持cutlass
envs.VLLM_USE_TRITON_AWQ = True
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
......@@ -875,13 +873,13 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1],-1)
# 暂时不支持TN
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
if self.config.quantization_config["quant_method"] == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [
"self_attn.q_a_proj.qweight",
"self_attn.q_b_proj.qweight",
"self_attn.kv_a_proj_with_mqa.qweight",
"self_attn.kv_b_proj.qweight",
"self_attn.kv_a_proj_with_mqa.qweight",
"self_attn.o_proj.qweight",
"mlp.gate_up_proj.qweight",
"mlp.down_proj.qweight",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment