Commit 3d68ca03 authored by zhuwenwen
Browse files

Pass use_int8_w8a8 through the fused_moe / fused_experts call chain

parent 10cdc93d
...@@ -1570,7 +1570,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor, ...@@ -1570,7 +1570,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor,
start_expert: Optional[int] = -1, start_expert: Optional[int] = -1,
end_expert: Optional[int] = -1) -> None: end_expert: Optional[int] = -1) -> None:
fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
activation, use_fp8_w8a8, use_int8_w8a16, activation, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
use_int4_w4a16, global_num_experts, expert_map, use_int4_w4a16, global_num_experts, expert_map,
w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
block_shape, use_nn_moe, moe_ep_size=moe_ep_size, block_shape, use_nn_moe, moe_ep_size=moe_ep_size,
...@@ -1637,7 +1637,7 @@ def outplace_fused_experts( ...@@ -1637,7 +1637,7 @@ def outplace_fused_experts(
start_expert: Optional[int] = -1, start_expert: Optional[int] = -1,
end_expert: Optional[int] = -1) -> torch.Tensor: end_expert: Optional[int] = -1) -> torch.Tensor:
return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids,
False, activation, use_fp8_w8a8, use_int8_w8a16, False, activation, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16,
use_int4_w4a16, global_num_experts, expert_map, use_int4_w4a16, global_num_experts, expert_map,
w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale,
a2_scale, block_shape, a2_scale, block_shape,
...@@ -1708,7 +1708,7 @@ def fused_experts(hidden_states: torch.Tensor, ...@@ -1708,7 +1708,7 @@ def fused_experts(hidden_states: torch.Tensor,
if inplace: if inplace:
torch.ops.vllm.inplace_fused_experts( torch.ops.vllm.inplace_fused_experts(
hidden_states, w1, w2, topk_weights, topk_ids, activation, hidden_states, w1, w2, topk_weights, topk_ids, activation,
use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts,
expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
block_shape, block_shape,
use_nn_moe, use_nn_moe,
...@@ -1719,7 +1719,7 @@ def fused_experts(hidden_states: torch.Tensor, ...@@ -1719,7 +1719,7 @@ def fused_experts(hidden_states: torch.Tensor,
else: else:
return torch.ops.vllm.outplace_fused_experts( return torch.ops.vllm.outplace_fused_experts(
hidden_states, w1, w2, topk_weights, topk_ids, activation, hidden_states, w1, w2, topk_weights, topk_ids, activation,
use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, global_num_experts,
expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, a2_scale,
block_shape, block_shape,
use_nn_moe, use_nn_moe,
...@@ -2081,6 +2081,7 @@ def fused_moe( ...@@ -2081,6 +2081,7 @@ def fused_moe(
inplace=inplace, inplace=inplace,
activation=activation, activation=activation,
use_fp8_w8a8=use_fp8_w8a8, use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16, use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16, use_int4_w4a16=use_int4_w4a16,
global_num_experts=global_num_experts, global_num_experts=global_num_experts,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment