Unverified commit 40b8f553, authored by Harry Mellor, committed by GitHub
Browse files

[Docs] Reduce time spent generating API docs (#34255)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 5045d5c9
...@@ -63,8 +63,9 @@ plugins: ...@@ -63,8 +63,9 @@ plugins:
- git-revision-date-localized: - git-revision-date-localized:
# exclude autogenerated files # exclude autogenerated files
exclude: exclude:
- argparse/* - api/*
- examples/* - examples/*
- generated/*
- minify: - minify:
minify_html: true minify_html: true
minify_js: true minify_js: true
...@@ -92,7 +93,6 @@ plugins: ...@@ -92,7 +93,6 @@ plugins:
- "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs - "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs
summary: summary:
modules: true modules: true
show_if_no_docstring: true
show_signature_annotations: true show_signature_annotations: true
separate_signature: true separate_signature: true
show_overloads: true show_overloads: true
......
...@@ -1557,6 +1557,7 @@ class ModelConfig: ...@@ -1557,6 +1557,7 @@ class ModelConfig:
@property @property
def attn_type(self) -> AttnTypeStr: def attn_type(self) -> AttnTypeStr:
"""Determine the attention type based on model configuration."""
if self.pooler_config is not None: if self.pooler_config is not None:
seq_pooling_type = self._model_info.default_seq_pooling_type seq_pooling_type = self._model_info.default_seq_pooling_type
if seq_pooling_type == "CLS": if seq_pooling_type == "CLS":
......
...@@ -4,3 +4,4 @@ ...@@ -4,3 +4,4 @@
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
AsyncLLMEngine = AsyncLLM # type: ignore AsyncLLMEngine = AsyncLLM # type: ignore
"""The `AsyncLLMEngine` class is an alias of [vllm.v1.engine.async_llm.AsyncLLM][]."""
...@@ -4,3 +4,4 @@ ...@@ -4,3 +4,4 @@
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
LLMEngine = V1LLMEngine # type: ignore LLMEngine = V1LLMEngine # type: ignore
"""The `LLMEngine` class is an alias of [vllm.v1.engine.llm_engine.LLMEngine][]."""
...@@ -298,6 +298,7 @@ which can be passed to ...@@ -298,6 +298,7 @@ which can be passed to
SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs
"""The inputs for a single encoder/decoder prompt."""
@dataclass @dataclass
......
...@@ -206,6 +206,8 @@ class SGLFusedMOE: ...@@ -206,6 +206,8 @@ class SGLFusedMOE:
class CPUFusedMOE: class CPUFusedMOE:
"""CPU-based fused MoE implementation."""
def __init__(self, layer: torch.nn.Module) -> None: def __init__(self, layer: torch.nn.Module) -> None:
use_grouped_gemm, isa = self.check_grouped_gemm(layer) use_grouped_gemm, isa = self.check_grouped_gemm(layer)
self.isa = isa self.isa = isa
......
...@@ -376,6 +376,8 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): ...@@ -376,6 +376,8 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
class CutlassExpertsFp8(CutlassExpertsFp8Base): class CutlassExpertsFp8(CutlassExpertsFp8Base):
"""CUTLASS FP8 fused MoE expert implementation."""
@staticmethod @staticmethod
def activation_format() -> mk.FusedMoEActivationFormat: def activation_format() -> mk.FusedMoEActivationFormat:
return mk.FusedMoEActivationFormat.Standard return mk.FusedMoEActivationFormat.Standard
...@@ -423,6 +425,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): ...@@ -423,6 +425,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
"""Batched CUTLASS FP8 fused MoE expert implementation."""
@staticmethod @staticmethod
def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
# BATCHED activation format works with EP because # BATCHED activation format works with EP because
...@@ -651,6 +655,8 @@ def run_cutlass_moe_fp4( ...@@ -651,6 +655,8 @@ def run_cutlass_moe_fp4(
class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
"""CUTLASS FP4 fused MoE expert implementation."""
@property @property
def expects_unquantized_inputs(self) -> bool: def expects_unquantized_inputs(self) -> bool:
return True return True
......
...@@ -113,6 +113,8 @@ def _valid_deep_gemm( ...@@ -113,6 +113,8 @@ def _valid_deep_gemm(
class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""DeepGemm-based fused MoE expert implementation."""
def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig): def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig):
super().__init__(moe_config=moe_config, quant_config=quant_config) super().__init__(moe_config=moe_config, quant_config=quant_config)
assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout() assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout()
......
...@@ -637,6 +637,8 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): ...@@ -637,6 +637,8 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
class MarlinExperts(MarlinExpertsBase): class MarlinExperts(MarlinExpertsBase):
"""Marlin-based fused MoE expert implementation."""
def supports_expert_map(self) -> bool: def supports_expert_map(self) -> bool:
return True return True
...@@ -738,6 +740,8 @@ class MarlinExperts(MarlinExpertsBase): ...@@ -738,6 +740,8 @@ class MarlinExperts(MarlinExpertsBase):
class BatchedMarlinExperts(MarlinExpertsBase): class BatchedMarlinExperts(MarlinExpertsBase):
"""Batched Marlin-based fused MoE expert implementation."""
def __init__( def __init__(
self, self,
moe_config: FusedMoEConfig, moe_config: FusedMoEConfig,
......
...@@ -1527,6 +1527,7 @@ def fused_experts( ...@@ -1527,6 +1527,7 @@ def fused_experts(
expert_map: torch.Tensor | None = None, expert_map: torch.Tensor | None = None,
quant_config: FusedMoEQuantConfig | None = None, quant_config: FusedMoEQuantConfig | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
"""Run fused MoE expert computation using Triton kernels."""
if quant_config is None: if quant_config is None:
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
...@@ -1879,6 +1880,8 @@ def fused_experts_impl( ...@@ -1879,6 +1880,8 @@ def fused_experts_impl(
class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""Triton-based fused MoE expert implementation."""
def __init__( def __init__(
self, self,
moe_config: FusedMoEConfig, moe_config: FusedMoEConfig,
......
...@@ -221,6 +221,7 @@ def triton_kernel_fused_experts( ...@@ -221,6 +221,7 @@ def triton_kernel_fused_experts(
intermediate_cache: torch.Tensor | None = None, intermediate_cache: torch.Tensor | None = None,
a1q_scale: torch.Tensor | None = None, a1q_scale: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
"""Triton implementation of fused expert computation using OAI kernels."""
if quant_config is None: if quant_config is None:
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
...@@ -444,6 +445,8 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): ...@@ -444,6 +445,8 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
class OAITritonExperts(BaseOAITritonExperts): class OAITritonExperts(BaseOAITritonExperts):
"""OAI Triton-based fused MoE expert implementation."""
@staticmethod @staticmethod
def activation_format() -> mk.FusedMoEActivationFormat: def activation_format() -> mk.FusedMoEActivationFormat:
return mk.FusedMoEActivationFormat.Standard return mk.FusedMoEActivationFormat.Standard
......
...@@ -63,6 +63,8 @@ def pplx_hidden_dim_scale_bytes( ...@@ -63,6 +63,8 @@ def pplx_hidden_dim_scale_bytes(
class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
"""PPLX-based prepare and finalize for expert parallelism."""
def __init__( def __init__(
self, self,
a2a: pplx.AllToAll, a2a: pplx.AllToAll,
......
...@@ -131,6 +131,8 @@ class MoEPrepareAndFinalizeNaiveEP(mk.FusedMoEPrepareAndFinalize): ...@@ -131,6 +131,8 @@ class MoEPrepareAndFinalizeNaiveEP(mk.FusedMoEPrepareAndFinalize):
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
"""MoE prepare and finalize without expert parallelism."""
@property @property
def activation_format(self) -> mk.FusedMoEActivationFormat: def activation_format(self) -> mk.FusedMoEActivationFormat:
return mk.FusedMoEActivationFormat.Standard return mk.FusedMoEActivationFormat.Standard
......
...@@ -192,6 +192,7 @@ def rocm_aiter_fused_experts( ...@@ -192,6 +192,7 @@ def rocm_aiter_fused_experts(
num_local_tokens: torch.Tensor | None = None, num_local_tokens: torch.Tensor | None = None,
output_dtype: torch.dtype | None = None, output_dtype: torch.dtype | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
"""ROCm AITER fused MoE expert computation."""
if quant_config is None: if quant_config is None:
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
......
...@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ...@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""TensorRT-LLM-based fused MoE expert implementation."""
def __init__( def __init__(
self, self,
moe_config: FusedMoEConfig, moe_config: FusedMoEConfig,
......
...@@ -680,6 +680,8 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): ...@@ -680,6 +680,8 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
"""W8A8 FP8 MoE quantization using compressed tensors."""
def __init__( def __init__(
self, self,
weight_quant: QuantizationArgs, weight_quant: QuantizationArgs,
......
...@@ -235,6 +235,8 @@ class Mxfp4Config(QuantizationConfig): ...@@ -235,6 +235,8 @@ class Mxfp4Config(QuantizationConfig):
class Mxfp4MoEMethod(FusedMoEMethodBase): class Mxfp4MoEMethod(FusedMoEMethodBase):
"""MXFP4 MoE quantization method."""
def __init__(self, moe: FusedMoEConfig): def __init__(self, moe: FusedMoEConfig):
super().__init__(moe) super().__init__(moe)
self.weight_dtype = "mxfp4" self.weight_dtype = "mxfp4"
......
...@@ -73,6 +73,7 @@ class Blip2ImageEmbeddingInputs(TensorSchema): ...@@ -73,6 +73,7 @@ class Blip2ImageEmbeddingInputs(TensorSchema):
Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs
"""Alias for supported BLIP-2 image input types."""
class Blip2QFormerMultiHeadAttention(nn.Module): class Blip2QFormerMultiHeadAttention(nn.Module):
......
...@@ -121,6 +121,7 @@ class LlavaImageEmbeddingInputs(TensorSchema): ...@@ -121,6 +121,7 @@ class LlavaImageEmbeddingInputs(TensorSchema):
LlavaImageInputs: TypeAlias = ( LlavaImageInputs: TypeAlias = (
LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs
) )
"""Alias for supported LLaVA image input types."""
class LlavaMultiModalProjector(nn.Module): class LlavaMultiModalProjector(nn.Module):
......
...@@ -78,6 +78,7 @@ class LlavaNextImageEmbeddingInputs(TensorSchema): ...@@ -78,6 +78,7 @@ class LlavaNextImageEmbeddingInputs(TensorSchema):
LlavaNextImageInputs: TypeAlias = ( LlavaNextImageInputs: TypeAlias = (
LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs
) )
"""Alias for supported LLaVA-NeXT image input types."""
class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
...@@ -106,6 +107,7 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): ...@@ -106,6 +107,7 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
image_width: int, image_width: int,
image_height: int, image_height: int,
) -> int: ) -> int:
"""Get the number of image tokens for the given image dimensions."""
hf_config = self.get_hf_config() hf_config = self.get_hf_config()
vision_encoder_info = self.get_vision_encoder_info() vision_encoder_info = self.get_vision_encoder_info()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment