vllm.model_executor.layers.fused_moe.fused_moe_method_base ¶

logger `module-attribute` ¶

logger = init_logger(__name__)

FusedMoEMethodBase ¶

Bases: QuantizeMethodBase

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

class FusedMoEMethodBase(QuantizeMethodBase):
    """Base interface for fused-MoE quantization methods.

    An instance keeps the ``FusedMoEConfig`` it was constructed with in
    ``self.moe`` and a ``self.moe_quant_config`` slot that starts out as
    ``None``. ``create_weights`` and ``get_fused_moe_quant_config`` are
    declared abstract; the remaining hooks either return a fixed default
    (``False`` / ``None`` / the class name) or raise ``NotImplementedError``
    until a subclass overrides them.
    """

    def __init__(self, moe: FusedMoEConfig):
        """Store ``moe`` and initialise ``moe_quant_config`` to ``None``."""
        super().__init__()
        self.moe: FusedMoEConfig = moe
        # Nothing in this class assigns a non-None value; presumably set
        # later from get_fused_moe_quant_config() — confirm against callers.
        self.moe_quant_config: FusedMoEQuantConfig | None = None

    @abstractmethod
    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Abstract hook; the base body only raises ``NotImplementedError``.

        Presumably implementations register the expert weights on ``layer``
        using the given sizes and ``params_dtype`` — confirm against a
        concrete subclass.
        """
        raise NotImplementedError

    def uses_weight_scale_2_pattern(self) -> bool:
        """
        Returns True if this quantization method uses 'weight_scale_2' pattern
        for per-tensor weight scales (e.g., FP4 variants), False otherwise.

        This method should be overridden by subclasses that use the
        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
        """
        return False

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> FusedMoEPrepareAndFinalize | None:
        """Delegate to ``all2all_utils.maybe_make_prepare_finalize``.

        Passes ``self.moe``, ``self.moe_quant_config`` and ``routing_tables``
        through positionally and returns the helper's result unchanged.
        """
        # Imported at call time rather than module level (presumably to
        # avoid an import cycle — confirm).
        from .all2all_utils import maybe_make_prepare_finalize

        return maybe_make_prepare_finalize(
            self.moe, self.moe_quant_config, routing_tables
        )

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> FusedMoEPermuteExpertsUnpermute:
        """Override hook: pick the gemm implementation for ``prepare_finalize``.

        Raises:
            NotImplementedError: Always, in this base implementation; the
                message names the concrete class.
        """
        # based on the all2all implementation, select the appropriate
        # gemm implementation
        raise NotImplementedError(
            f"{self.__class__.__name__} must select appropriate gemm "
            "implementation based on the prepare_finalize"
        )

    def prepare_dp_allgather_tensor(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
        raise NotImplementedError(
            "Method 'prepare_dp_allgather_tensor' is not implemented in "
            f"{self.__class__.__name__}."
        )

    @abstractmethod
    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Abstract hook; the base body only raises ``NotImplementedError``."""
        raise NotImplementedError

    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        """Dtype for top-k indices; ``None`` unless a subclass overrides it."""
        return None

    @property
    def supports_eplb(self) -> bool:
        """Whether EPLB is supported; ``False`` unless overridden."""
        return False

    @property
    def allow_inplace(self) -> bool:
        """Whether in-place operation is allowed; ``False`` unless overridden."""
        return False

    @property
    def method_name(self) -> str:
        """Name of the concrete class of this instance."""
        return self.__class__.__name__

    @property
    def is_monolithic(self) -> bool:
        """Whether the method is monolithic; ``False`` unless overridden."""
        return False

    # NOTE(review): deliberately not decorated with @abstractmethod (the
    # decorator was commented out). Presumably a subclass implements either
    # `apply` or `apply_monolithic` depending on `is_monolithic` — confirm
    # before re-enabling the decorator.
    def apply(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Override hook taking ``x`` plus ``topk_weights`` / ``topk_ids``.

        Raises:
            NotImplementedError: Always, in this base implementation. The
                message names the concrete class, matching the other hooks.
        """
        raise NotImplementedError(
            f"Method 'apply' is not implemented in {self.__class__.__name__}."
        )

    # NOTE(review): deliberately not decorated with @abstractmethod; see the
    # matching note on `apply`'s decorator status in the original source.
    def apply_monolithic(
        self,
        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Override hook taking ``x`` plus raw ``router_logits``.

        Raises:
            NotImplementedError: Always, in this base implementation. The
                message names the concrete class, matching the other hooks.
        """
        raise NotImplementedError(
            "Method 'apply_monolithic' is not implemented in "
            f"{self.__class__.__name__}."
        )

allow_inplace `property` ¶

allow_inplace: bool

is_monolithic `property` ¶

is_monolithic: bool

method_name `property` ¶

method_name: str

moe `instance-attribute` ¶

moe: FusedMoEConfig = moe

moe_quant_config `instance-attribute` ¶

moe_quant_config: FusedMoEQuantConfig | None = None

supports_eplb `property` ¶

supports_eplb: bool

topk_indices_dtype `property` ¶

topk_indices_dtype: dtype | None

__init__ ¶

__init__(moe: FusedMoEConfig)

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def __init__(self, moe: FusedMoEConfig):
    """Store ``moe`` and initialise ``moe_quant_config`` to ``None``."""
    super().__init__()
    self.moe: FusedMoEConfig = moe
    # Starts unset; presumably filled in later from
    # get_fused_moe_quant_config() — confirm against callers.
    self.moe_quant_config: FusedMoEQuantConfig | None = None

apply ¶

apply(
    layer: FusedMoE,
    x: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def apply(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    x: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    raise NotImplementedError

apply_monolithic ¶

apply_monolithic(
    layer: FusedMoE, x: Tensor, router_logits: Tensor
) -> Tensor | tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def apply_monolithic(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    x: torch.Tensor,
    router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    raise NotImplementedError

create_weights `abstractmethod` ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

@abstractmethod
def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    """Abstract hook; this base body does nothing except raise.

    Presumably implementations register the expert weights on ``layer``
    using the given sizes and ``params_dtype`` — confirm against a concrete
    subclass.

    Raises:
        NotImplementedError: Always, when the base body is reached.
    """
    raise NotImplementedError()

get_fused_moe_quant_config `abstractmethod` ¶

get_fused_moe_quant_config(
    layer: Module,
) -> FusedMoEQuantConfig | None

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

@abstractmethod
def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    """Abstract hook; this base body does nothing except raise.

    Raises:
        NotImplementedError: Always, when the base body is reached.
    """
    raise NotImplementedError()

maybe_make_prepare_finalize ¶

maybe_make_prepare_finalize(
    routing_tables: tuple[Tensor, Tensor, Tensor]
    | None = None,
) -> FusedMoEPrepareAndFinalize | None

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def maybe_make_prepare_finalize(
    self,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> FusedMoEPrepareAndFinalize | None:
    """Delegate to ``all2all_utils.maybe_make_prepare_finalize``.

    Forwards ``self.moe``, ``self.moe_quant_config`` and ``routing_tables``
    positionally and returns the helper's result unchanged.
    """
    # Imported at call time rather than module level (presumably to avoid
    # an import cycle — confirm).
    from .all2all_utils import maybe_make_prepare_finalize as _build

    prepare_finalize = _build(self.moe, self.moe_quant_config, routing_tables)
    return prepare_finalize

prepare_dp_allgather_tensor ¶

prepare_dp_allgather_tensor(
    layer: FusedMoE,
    hidden_states: Tensor,
    router_logits: Tensor,
) -> tuple[Tensor, list[Tensor]]

Hook to prepare tensors and extra tensors for DP allgather + EP dispatch.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def prepare_dp_allgather_tensor(
    self,
    layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
) -> tuple[torch.Tensor, list[torch.Tensor]]:
    """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch.

    Raises:
        NotImplementedError: Always, in this base implementation; the
            message names the concrete class.
    """
    owner = self.__class__.__name__
    message = f"Method 'prepare_dp_allgather_tensor' is not implemented in {owner}."
    raise NotImplementedError(message)

select_gemm_impl ¶

select_gemm_impl(
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: Module,
) -> FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def select_gemm_impl(
    self,
    prepare_finalize: FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> FusedMoEPermuteExpertsUnpermute:
    """Override hook: pick the gemm implementation for ``prepare_finalize``.

    Raises:
        NotImplementedError: Always, in this base implementation; the
            message names the concrete class.
    """
    # Subclasses choose the gemm implementation that matches the all2all
    # implementation in use.
    owner = self.__class__.__name__
    raise NotImplementedError(
        f"{owner} must select appropriate gemm "
        "implementation based on the prepare_finalize"
    )

uses_weight_scale_2_pattern ¶

uses_weight_scale_2_pattern() -> bool

Returns True if this quantization method uses 'weight_scale_2' pattern for per-tensor weight scales (e.g., FP4 variants), False otherwise.

This method should be overridden by subclasses that use the 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py

def uses_weight_scale_2_pattern(self) -> bool:
    """Report whether per-tensor weight scales use the 'weight_scale_2' pattern.

    The base implementation always answers ``False``, i.e. the standard
    'weight_scale' pattern. Subclasses that use 'weight_scale_2' instead
    (e.g., FP4 variants) should override this to return ``True``.
    """
    return False

vllm.model_executor.layers.fused_moe.fused_moe_method_base ¶

logger module-attribute ¶

FusedMoEMethodBase ¶

allow_inplace property ¶

is_monolithic property ¶

method_name property ¶

moe instance-attribute ¶

moe_quant_config instance-attribute ¶

supports_eplb property ¶

topk_indices_dtype property ¶

__init__ ¶

apply ¶

apply_monolithic ¶

create_weights abstractmethod ¶

get_fused_moe_quant_config abstractmethod ¶

maybe_make_prepare_finalize ¶

prepare_dp_allgather_tensor ¶

select_gemm_impl ¶

uses_weight_scale_2_pattern ¶

logger `module-attribute` ¶

allow_inplace `property` ¶

is_monolithic `property` ¶

method_name `property` ¶

moe `instance-attribute` ¶

moe_quant_config `instance-attribute` ¶

supports_eplb `property` ¶

topk_indices_dtype `property` ¶

__init__ ¶

create_weights `abstractmethod` ¶

get_fused_moe_quant_config `abstractmethod` ¶