"vllm/vscode:/vscode.git/clone" did not exist on "9841d48a108b0e14da6572de6be0b47d70bbb641"
Commit f756a682, authored by Yongye Zhu, committed by GitHub
Browse files

[gpt-oss] guard import when triton kernel is not installed (#22529)


Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent f0964e29
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional from typing import TYPE_CHECKING, Any, Optional
import torch import torch
...@@ -8,13 +8,16 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk ...@@ -8,13 +8,16 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate) TopKWeightAndReduceDelegate)
from vllm.model_executor.layers.fused_moe.utils import extract_required_args from vllm.model_executor.layers.fused_moe.utils import extract_required_args
from vllm.utils import has_triton_kernels
if True: if has_triton_kernels():
import triton_kernels.swiglu import triton_kernels.swiglu
from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
PrecisionConfig, matmul_ogs)
from triton_kernels.routing import routing from triton_kernels.routing import routing
if TYPE_CHECKING:
from triton_kernels.matmul_ogs import PrecisionConfig
def triton_kernel_moe_forward( def triton_kernel_moe_forward(
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
...@@ -33,8 +36,8 @@ def triton_kernel_moe_forward( ...@@ -33,8 +36,8 @@ def triton_kernel_moe_forward(
w2_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None,
w1_bias: Optional[torch.Tensor] = None, w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None,
w1_precision=None, # PrecisionConfig or None w1_precision: Optional["PrecisionConfig"] = None,
w2_precision=None, # PrecisionConfig or None w2_precision: Optional["PrecisionConfig"] = None,
a1_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None, block_shape: Optional[list[int]] = None,
...@@ -90,8 +93,8 @@ def triton_kernel_fused_experts( ...@@ -90,8 +93,8 @@ def triton_kernel_fused_experts(
w2_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None,
w1_bias: Optional[torch.Tensor] = None, w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None,
w1_precision=None, # PrecisionConfig or None w1_precision: Optional["PrecisionConfig"] = None,
w2_precision=None, # PrecisionConfig or None w2_precision: Optional["PrecisionConfig"] = None,
a1_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None, block_shape: Optional[list[int]] = None,
...@@ -141,8 +144,14 @@ def triton_kernel_fused_experts( ...@@ -141,8 +144,14 @@ def triton_kernel_fused_experts(
class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
def __init__(self, quant_config, max_num_tokens: int, num_dispatchers: int, def __init__(
w1_precision: PrecisionConfig, w2_precision: PrecisionConfig): self,
quant_config,
max_num_tokens: int,
num_dispatchers: int,
w1_precision: "PrecisionConfig",
w2_precision: "PrecisionConfig",
):
super().__init__(quant_config) super().__init__(quant_config)
self.max_num_tokens = max_num_tokens self.max_num_tokens = max_num_tokens
self.num_dispatchers = num_dispatchers self.num_dispatchers = num_dispatchers
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment