Unverified commit 42135d68, authored by Robert Shaw and committed by GitHub
Browse files

[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)

parent e14467be
...@@ -634,6 +634,46 @@ steps: ...@@ -634,6 +634,46 @@ steps:
- pip install helion - pip install helion
- pytest -v -s kernels/helion/ - pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
gpu: h100
num_gpus: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
gpu: h100
num_gpus: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
gpu: b200
num_gpus: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
......
...@@ -9,6 +9,7 @@ but use different quantization strategies and backends. ...@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
...@@ -138,12 +139,13 @@ def bench_run( ...@@ -138,12 +139,13 @@ def bench_run(
fn = mk.FusedMoEModularKernel( fn = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8( CutlassExpertsFp8(
out_dtype=a.dtype, moe_config=make_dummy_moe_config(
e=num_experts, num_experts=num_experts,
n=n, hidden_dim=k,
k=k, intermediate_size_per_partition=n,
in_dtype=a.dtype,
),
quant_config=quant_config, quant_config=quant_config,
device=w1.device,
), ),
) )
......
...@@ -12,6 +12,7 @@ import torch ...@@ -12,6 +12,7 @@ import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
...@@ -198,8 +199,7 @@ def bench_run( ...@@ -198,8 +199,7 @@ def bench_run(
kernel = mk.FusedMoEModularKernel( kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(defer_input_quant=True), MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
CutlassExpertsFp4( CutlassExpertsFp4(
out_dtype=dtype, make_dummy_moe_config(),
max_experts_per_worker=e,
quant_config=quant_config, quant_config=quant_config,
), ),
) )
...@@ -244,8 +244,7 @@ def bench_run( ...@@ -244,8 +244,7 @@ def bench_run(
kernel = mk.FusedMoEModularKernel( kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(defer_input_quant=True), MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
CutlassExpertsFp4( CutlassExpertsFp4(
out_dtype=dtype, make_dummy_moe_config(),
max_experts_per_worker=e,
quant_config=quant_config, quant_config=quant_config,
), ),
) )
......
...@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark ...@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE from benchmark_shapes import WEIGHT_SHAPES_MOE
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
...@@ -134,13 +135,13 @@ def bench_run( ...@@ -134,13 +135,13 @@ def bench_run(
fn = mk.FusedMoEModularKernel( fn = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8( CutlassExpertsFp8(
out_dtype=a.dtype, moe_config=make_dummy_moe_config(
# NOTE(rob): w2 is shaped as [E, hidden, intermediate] num_experts=w2.shape[0],
e=w2.shape[0], hidden_dim=w2.shape[1],
n=w2.shape[2], intermediate_size_per_partition=w2.shape[2],
k=w2.shape[1], in_dtype=a.dtype,
),
quant_config=quant_config, quant_config=quant_config,
device=w1.device,
), ),
) )
...@@ -166,13 +167,13 @@ def bench_run( ...@@ -166,13 +167,13 @@ def bench_run(
fn = mk.FusedMoEModularKernel( fn = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8( CutlassExpertsFp8(
out_dtype=a.dtype, moe_config=make_dummy_moe_config(
# NOTE(rob): w2 is shaped as [E, hidden, intermediate] num_experts=w2.shape[0],
e=w2.shape[0], hidden_dim=w2.shape[1],
n=w2.shape[2], intermediate_size_per_partition=w2.shape[2],
k=w2.shape[1], in_dtype=a.dtype,
),
quant_config=quant_config, quant_config=quant_config,
device=w1.device,
), ),
) )
......
...@@ -16,10 +16,16 @@ import torch ...@@ -16,10 +16,16 @@ import torch
from ray.experimental.tqdm_ray import tqdm from ray.experimental.tqdm_ray import tqdm
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
FusedMoEQuantConfig, FusedMoEQuantConfig,
RoutingMethodType,
_get_config_dtype_str, _get_config_dtype_str,
) )
from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton from vllm.triton_utils import triton
...@@ -194,10 +200,33 @@ def benchmark_config( ...@@ -194,10 +200,33 @@ def benchmark_config(
block_shape=block_quant_shape, block_shape=block_quant_shape,
) )
deep_gemm_experts = mk.FusedMoEModularKernel(
prepare_finalize=MoEPrepareAndFinalizeNoEP(),
fused_experts=TritonOrDeepGemmExperts(
moe_config=FusedMoEConfig(
num_experts=num_experts,
experts_per_token=topk,
hidden_dim=hidden_size,
intermediate_size_per_partition=shard_intermediate_size,
num_local_experts=num_experts,
activation="silu",
parallel_config=FusedMoEParallelConfig.make_no_parallel(),
in_dtype=init_dtype,
routing_method=RoutingMethodType.TopK,
),
quant_config=quant_config,
),
)
with override_config(config): with override_config(config):
topk_weights, topk_ids, token_expert_indices = fused_topk( topk_weights, topk_ids, token_expert_indices = fused_topk(
x, input_gating, topk, renormalize=not use_deep_gemm x, input_gating, topk, renormalize=not use_deep_gemm
) )
if use_deep_gemm:
return deep_gemm_experts(
x, w1, w2, topk_weights, topk_ids, inplace=True
)
return fused_experts( return fused_experts(
x, x,
w1, w1,
...@@ -206,7 +235,6 @@ def benchmark_config( ...@@ -206,7 +235,6 @@ def benchmark_config(
topk_ids, topk_ids,
inplace=True, inplace=True,
quant_config=quant_config, quant_config=quant_config,
allow_deep_gemm=use_deep_gemm,
) )
# JIT compilation & warmup # JIT compilation & warmup
......
...@@ -85,10 +85,10 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels ...@@ -85,10 +85,10 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------| |--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | | triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | | deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | | cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
......
...@@ -43,7 +43,7 @@ from vllm.forward_context import get_forward_context, set_forward_context ...@@ -43,7 +43,7 @@ from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, QuantKey,
kFp8StaticTensorSym, kFp8StaticTensorSym,
kNvfp4Quant, kNvfp4Dynamic,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer from vllm.utils.flashinfer import has_flashinfer
...@@ -215,7 +215,7 @@ class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel): ...@@ -215,7 +215,7 @@ class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel): class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
"""Test model for AttentionNvfp4QuantPattern fusion.""" """Test model for AttentionNvfp4QuantPattern fusion."""
quant_key = kNvfp4Quant quant_key = kNvfp4Dynamic
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
...@@ -468,7 +468,7 @@ def test_attention_quant_pattern( ...@@ -468,7 +468,7 @@ def test_attention_quant_pattern(
# Note: for fp8, fully_replaced=False because query quant ops remain in graph. # Note: for fp8, fully_replaced=False because query quant ops remain in graph.
# Only output quant ops are fused into attention. # Only output quant ops are fused into attention.
test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Quant) test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Dynamic)
# access the underlying `AttnFusionPass` on the `LazyInitPass` # access the underlying `AttnFusionPass` on the `LazyInitPass`
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported) assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
......
...@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8 ...@@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, GroupShape,
kFp8StaticTensorSym, kFp8StaticTensorSym,
kNvfp4Quant, kNvfp4Dynamic,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -134,11 +134,11 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): ...@@ -134,11 +134,11 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
def ops_in_model_before(self): def ops_in_model_before(self):
return [ return [
SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul, SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
QUANT_OPS[kNvfp4Quant], QUANT_OPS[kNvfp4Dynamic],
] ]
def ops_in_model_after(self): def ops_in_model_after(self):
return [FUSED_OPS[kNvfp4Quant]] return [FUSED_OPS[kNvfp4Dynamic]]
class TestSiluMulGroupFp8QuantModel(torch.nn.Module): class TestSiluMulGroupFp8QuantModel(torch.nn.Module):
......
...@@ -3,3 +3,6 @@ accuracy_threshold: 0.92 ...@@ -3,3 +3,6 @@ accuracy_threshold: 0.92
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel" server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
env:
VLLM_USE_FLASHINFER_MOE_FP8: "0"
VLLM_USE_DEEP_GEMM: "0"
...@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab ...@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab
env: env:
VLLM_USE_DEEP_GEMM: "1" VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_DEEP_GEMM_MOE: "1" VLLM_USE_DEEP_GEMM_MOE: "1"
VLLM_USE_DEEP_GEMM_E8M0: "0"
...@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab ...@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab
env: env:
VLLM_USE_DEEP_GEMM: "1" VLLM_USE_DEEP_GEMM: "1"
VLLM_USE_DEEP_GEMM_MOE: "1" VLLM_USE_DEEP_GEMM_MOE: "1"
VLLM_USE_DEEP_GEMM_E8M0: "0"
...@@ -3,3 +3,5 @@ accuracy_threshold: 0.92 ...@@ -3,3 +3,5 @@ accuracy_threshold: 0.92
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env:
VLLM_USE_FLASHINFER_MOE_FP8: "0"
...@@ -4,7 +4,5 @@ num_questions: 1319 ...@@ -4,7 +4,5 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
VLLM_USE_FLASHINFER_MOE_FP8: "1" VLLM_USE_FLASHINFER_MOE_FP8: "1"
VLLM_FLASHINFER_MOE_BACKEND: "throughput" VLLM_FLASHINFER_MOE_BACKEND: "throughput"
...@@ -4,7 +4,5 @@ num_questions: 1319 ...@@ -4,7 +4,5 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
VLLM_USE_FLASHINFER_MOE_FP8: "1" VLLM_USE_FLASHINFER_MOE_FP8: "1"
VLLM_FLASHINFER_MOE_BACKEND: "latency" VLLM_FLASHINFER_MOE_BACKEND: "latency"
...@@ -4,6 +4,4 @@ num_questions: 1319 ...@@ -4,6 +4,4 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
VLLM_TEST_FORCE_FP8_MARLIN: "1" VLLM_TEST_FORCE_FP8_MARLIN: "1"
...@@ -4,5 +4,5 @@ num_questions: 1319 ...@@ -4,5 +4,5 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_FLASHINFER_MOE_FP8: "0"
VLLM_USE_DEEP_GEMM: "0" VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
...@@ -4,7 +4,5 @@ num_questions: 1319 ...@@ -4,7 +4,5 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
VLLM_USE_FLASHINFER_MOE_FP8: "1" VLLM_USE_FLASHINFER_MOE_FP8: "1"
VLLM_FLASHINFER_MOE_BACKEND: "throughput" VLLM_FLASHINFER_MOE_BACKEND: "throughput"
...@@ -4,6 +4,4 @@ num_questions: 1319 ...@@ -4,6 +4,4 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
VLLM_TEST_FORCE_FP8_MARLIN: "1" VLLM_TEST_FORCE_FP8_MARLIN: "1"
...@@ -4,5 +4,5 @@ num_questions: 1319 ...@@ -4,5 +4,5 @@ num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env: env:
VLLM_USE_FLASHINFER_MOE_FP8: "0"
VLLM_USE_DEEP_GEMM: "0" VLLM_USE_DEEP_GEMM: "0"
VLLM_USE_DEEP_GEMM_MOE: "0"
...@@ -3,3 +3,5 @@ accuracy_threshold: 0.88 ...@@ -3,3 +3,5 @@ accuracy_threshold: 0.88
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
env:
VLLM_USE_FLASHINFER_MOE_FP4: "0"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment