Unverified commit 4034c3d3, authored by Turner Jabbour and committed by GitHub
Browse files

[Core] Move test utility to test file (#35672)


Signed-off-by: Turner Jabbour <doubleujabbour@gmail.com>
parent 7560d674
...@@ -26,9 +26,10 @@ from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_co ...@@ -26,9 +26,10 @@ from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_co
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
triton_kernel_moe_forward, triton_kernel_moe_forward,
) )
from vllm.model_executor.layers.utils import shuffle_weight
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
from .utils import shuffle_weight
def deshuffle(w: torch.Tensor): def deshuffle(w: torch.Tensor):
first = w[..., ::2] first = w[..., ::2]
......
...@@ -33,11 +33,10 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK ...@@ -33,11 +33,10 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from vllm.model_executor.layers.fused_moe.prepare_finalize import ( from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP, MoEPrepareAndFinalizeNoEP,
) )
from vllm.model_executor.layers.utils import shuffle_weight
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
from .utils import make_dummy_moe_config from .utils import make_dummy_moe_config, shuffle_weight
MNK = [ MNK = [
(1, 512, 384), (1, 512, 384),
......
...@@ -33,6 +33,16 @@ from vllm.utils.deep_gemm import per_block_cast_to_fp8 ...@@ -33,6 +33,16 @@ from vllm.utils.deep_gemm import per_block_cast_to_fp8
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
    """Interleave the two halves of the last dimension of ``w``.

    The last axis is cut at its midpoint; element ``i`` of the first half
    is placed directly before element ``i`` of the second half. This is
    the adjacent-pair layout used with the Triton MoE / SwiGLU kernel.

    Example:
        ``[1, 2, 3, 4, 5, 6]`` becomes ``[1, 4, 2, 5, 3, 6]``.

    The returned tensor has the same shape as ``w``.
    """
    half = w.shape[-1] // 2
    # Pair up matching positions of both halves on a new trailing axis ...
    paired = torch.stack((w[..., :half], w[..., half:]), dim=-1)
    # ... then merge that pair axis back into the last dimension.
    return paired.flatten(start_dim=-2)
def make_dummy_moe_config( def make_dummy_moe_config(
num_experts: int = 1, num_experts: int = 1,
experts_per_token: int = 1, experts_per_token: int = 1,
......
...@@ -31,27 +31,6 @@ def is_layer_moe_router_gate(prefix: str) -> bool: ...@@ -31,27 +31,6 @@ def is_layer_moe_router_gate(prefix: str) -> bool:
return prefix.rsplit(".", 1)[-1] in MOE_LAYER_ROUTER_GATE_SUFFIXES return prefix.rsplit(".", 1)[-1] in MOE_LAYER_ROUTER_GATE_SUFFIXES
def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
    """Shuffle ``w`` along its last dimension so the two halves interleave.

    The weights are folded so that matching entries of the first and
    second half end up in adjacent locations. Used together with the
    Triton SwiGLU kernel.

    Example:
        input:
            [[1, 2, 3, 4, 5, 6],
             [7, 8, 9, 10, 11, 12]]
        output:
            [[1, 4, 2, 5, 3, 6],
             [7, 10, 8, 11, 9, 12]]
    """
    last_dim = w.shape[-1]
    split_at = last_dim // 2
    # Views over the leading and trailing halves of the last axis.
    head = w.narrow(-1, 0, split_at)
    tail = w.narrow(-1, split_at, last_dim - split_at)
    # Stacking on a new trailing axis puts head[i] right next to tail[i];
    # reshaping back to the input shape flattens those pairs in order.
    return torch.stack([head, tail], dim=-1).reshape(w.shape)
def get_token_bin_counts_and_mask( def get_token_bin_counts_and_mask(
tokens: torch.Tensor, tokens: torch.Tensor,
vocab_size: int, vocab_size: int,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment