vllm.v1.attention.backends.mla.flashinfer_mla ¶

FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE `module-attribute` ¶

# Size of the shared FlashInfer MLA workspace buffer, in bytes (128 MiB).
FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024**2

g_fi_workspace `module-attribute` ¶

# Module-level workspace buffer: a zero-filled uint8 tensor with
# FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE elements, placed on the "cuda" device.
# FlashInferMLAImpl.__init__ stores a reference to this same object on every
# instance rather than allocating a new buffer.
g_fi_workspace = zeros(
    FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
    dtype=uint8,
    device="cuda",
)

logger `module-attribute` ¶

# Logger for this module, obtained from init_logger with the module's __name__.
logger = init_logger(__name__)

FlashInferMLABackend ¶

Bases: MLACommonBackend

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

class FlashInferMLABackend(MLACommonBackend):
    """Backend descriptor for the FlashInfer MLA attention path.

    Declares the model dtypes, KV-cache dtypes, kernel block sizes, device
    capability and KV-cache layout this backend accepts, and names the
    implementation and metadata-builder classes that go with it.
    """

    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
        "auto",
        "bfloat16",
        "fp8",
        "fp8_e4m3",
    ]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        """Return the kernel block sizes this backend supports (32 and 64)."""
        return [32, 64]

    @staticmethod
    def get_name() -> str:
        """Return the identifier string of this backend."""
        return "FLASHINFER_MLA"

    @staticmethod
    def get_impl_cls() -> type["FlashInferMLAImpl"]:
        """Return the attention implementation class for this backend."""
        return FlashInferMLAImpl

    @staticmethod
    def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
        """Return the metadata-builder class for this backend."""
        return FlashInferMLAMetadataBuilder

    @classmethod
    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
        """Return True only when the capability's major version is 10."""
        return capability.major == 10

    @classmethod
    def supports_combination(
        cls,
        head_size: int,
        dtype: torch.dtype,
        kv_cache_dtype: CacheDType | None,
        block_size: int,
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
        device_capability: DeviceCapability,
    ) -> str | None:
        """Check whether the current model configuration fits this backend.

        Only ``qk_nope_head_dim`` of the current vLLM config's HF text config
        is inspected; none of the arguments are read by this method.

        Returns:
            ``None`` when the combination is accepted (including when no model
            config is set), otherwise a human-readable rejection reason.
        """
        from vllm.config import get_current_vllm_config

        model_config = get_current_vllm_config().model_config
        if model_config is None:
            # Nothing to validate against without a model config.
            return None

        # A config without the attribute falls back to 1 and is rejected.
        text_config = model_config.hf_text_config
        qk_nope_head_dim = getattr(text_config, "qk_nope_head_dim", 1)
        if qk_nope_head_dim == 128:
            return None

        # FlashInfer MLA kernel requires qk_nope_head_dim == 128
        return (
            f"FlashInfer MLA kernel requires qk_nope_head_dim == 128, "
            f"but got {qk_nope_head_dim}"
        )

    @classmethod
    def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
        """Return the KV-cache layout this backend requires (always "HND")."""
        return "HND"

supported_dtypes `class-attribute` ¶

supported_dtypes: list[dtype] = [float16, bfloat16]

supported_kv_cache_dtypes `class-attribute` ¶

supported_kv_cache_dtypes: list[CacheDType] = [
    "auto",
    "bfloat16",
    "fp8",
    "fp8_e4m3",
]

get_builder_cls `staticmethod` ¶

get_builder_cls() -> type[FlashInferMLAMetadataBuilder]

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@staticmethod
def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
    """Return the metadata-builder class for this backend."""
    return FlashInferMLAMetadataBuilder

get_impl_cls `staticmethod` ¶

get_impl_cls() -> type[FlashInferMLAImpl]

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@staticmethod
def get_impl_cls() -> type["FlashInferMLAImpl"]:
    """Return the attention implementation class for this backend."""
    return FlashInferMLAImpl

get_name `staticmethod` ¶

get_name() -> str

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@staticmethod
def get_name() -> str:
    """Return the identifier string of this backend."""
    return "FLASHINFER_MLA"

get_required_kv_cache_layout `classmethod` ¶

get_required_kv_cache_layout() -> KVCacheLayoutType | None

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@classmethod
def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
    """Return the KV-cache layout this backend requires (always "HND")."""
    return "HND"

get_supported_kernel_block_sizes `staticmethod` ¶

get_supported_kernel_block_sizes() -> list[
    int | MultipleOf
]

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@staticmethod
def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
    """Return the kernel block sizes this backend supports (32 and 64)."""
    return [32, 64]

supports_combination `classmethod` ¶

supports_combination(
    head_size: int,
    dtype: dtype,
    kv_cache_dtype: CacheDType | None,
    block_size: int,
    use_mla: bool,
    has_sink: bool,
    use_sparse: bool,
    device_capability: DeviceCapability,
) -> str | None

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@classmethod
def supports_combination(
    cls,
    head_size: int,
    dtype: torch.dtype,
    kv_cache_dtype: CacheDType | None,
    block_size: int,
    use_mla: bool,
    has_sink: bool,
    use_sparse: bool,
    device_capability: DeviceCapability,
) -> str | None:
    """Check whether the current model configuration fits this backend.

    Only ``qk_nope_head_dim`` of the current vLLM config's HF text config is
    inspected; none of the arguments are read by this method.

    Returns:
        ``None`` when the combination is accepted (including when no model
        config is set), otherwise a human-readable rejection reason.
    """
    from vllm.config import get_current_vllm_config

    model_config = get_current_vllm_config().model_config
    if model_config is None:
        # Nothing to validate against without a model config.
        return None

    # A config without the attribute falls back to 1 and is rejected.
    text_config = model_config.hf_text_config
    qk_nope_head_dim = getattr(text_config, "qk_nope_head_dim", 1)
    if qk_nope_head_dim == 128:
        return None

    # FlashInfer MLA kernel requires qk_nope_head_dim == 128
    return (
        f"FlashInfer MLA kernel requires qk_nope_head_dim == 128, "
        f"but got {qk_nope_head_dim}"
    )

supports_compute_capability `classmethod` ¶

supports_compute_capability(
    capability: DeviceCapability,
) -> bool

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

@classmethod
def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
    """Return True only when the capability's major version is 10."""
    return capability.major == 10

FlashInferMLAImpl ¶

Bases: MLACommonImpl[MLACommonMetadata]

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
    """MLA attention implementation backed by the FlashInfer decode kernel.

    The decode step is delegated to ``trtllm_batch_decode_with_kv_cache_mla``.
    Only decoder attention is accepted; ALiBi slopes, sliding-window attention
    and logits soft-capping are rejected at construction time.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None,
        attn_type: str,
        kv_sharing_target_layer_name: str | None,
        # MLA Specific Arguments
        **mla_args,
    ) -> None:
        """Initialise the common MLA state, then reject unsupported options.

        All arguments are forwarded unchanged to the parent constructor.

        Raises:
            NotImplementedError: If any of ``alibi_slopes``,
                ``sliding_window`` or ``logits_soft_cap`` is truthy, or if
                ``attn_type`` is not ``AttentionType.DECODER``.
        """
        super().__init__(
            num_heads,
            head_size,
            scale,
            num_kv_heads,
            alibi_slopes,
            sliding_window,
            kv_cache_dtype,
            logits_soft_cap,
            attn_type,
            kv_sharing_target_layer_name,
            **mla_args,
        )

        # Truthiness check: None, 0 and empty lists count as "not requested".
        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
        if any(unsupported_features):
            raise NotImplementedError(
                "FlashInferMLAImpl does not support one of the following: "
                "alibi_slopes, sliding_window, logits_soft_cap"
            )

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError(
                "Encoder self-attention and "
                "encoder/decoder cross-attention "
                "are not implemented for "
                "FlashInferMLAImpl"
            )

        # Every instance references the same module-level workspace tensor.
        self._workspace_buffer = g_fi_workspace
        # Kernel scales are computed on the first decode call and then reused.
        self.bmm1_scale: float | None = None
        self.bmm2_scale: float | None = None

    def _forward_decode(
        self,
        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: MLACommonMetadata,
        layer: AttentionLayer,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Run the decode attention kernel over the KV cache.

        Args:
            q: Either the full query tensor or a ``(q_nope, q_pe)`` pair,
                which is concatenated along the last dimension.
            kv_c_and_k_pe_cache: KV cache tensor; must contain elements.
            attn_metadata: Batch metadata; its ``decode`` field must be set.
            layer: Attention layer providing ``_q_scale_float``,
                ``_k_scale_float`` and ``_v_scale_float``.

        Returns:
            ``(output, None)``. The output is reshaped to three dimensions;
            the second element is a placeholder because LSE is not yet
            returned (see the TODO below).
        """
        assert kv_c_and_k_pe_cache.numel() > 0
        assert attn_metadata.decode is not None

        if isinstance(q, tuple):
            q_nope, q_pe = q
            q = torch.cat([q_nope, q_pe], dim=-1)

        # trtllm API requires extra dimension q_len_per_request for MTP
        if attn_metadata.num_decode_tokens % attn_metadata.num_decodes != 0:
            # Implicit string concatenation keeps source indentation and
            # newlines out of the emitted log message.
            logger.warning_once(
                "FlashInferMLAImpl got a query of uneven length. "
                "This usually indicates an issue in batch reordering "
                "or incorrect setup in dummy_run."
            )
            # Fallback: insert a length-1 dimension at index 1.
            q = q.unsqueeze(1)
        else:
            # Tokens divide evenly: fold them into (num_decodes, -1, ...),
            # keeping the last two dimensions of q as they are.
            q = q.view(attn_metadata.num_decodes, -1, q.shape[-2], q.shape[-1])

        # Computed once from the layer's scales and cached on the instance.
        if self.bmm1_scale is None:
            self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale
        if self.bmm2_scale is None:
            self.bmm2_scale = layer._v_scale_float

        o = trtllm_batch_decode_with_kv_cache_mla(
            query=q,
            # NOTE(review): the added dimension at index 1 is presumably what
            # the kernel expects for its cache layout; confirm against the
            # FlashInfer API.
            kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
            workspace_buffer=self._workspace_buffer,
            qk_nope_head_dim=self.qk_nope_head_dim,
            kv_lora_rank=self.kv_lora_rank,
            qk_rope_head_dim=self.qk_rope_head_dim,
            block_tables=attn_metadata.decode.block_table,
            seq_lens=attn_metadata.decode.seq_lens,
            max_seq_len=attn_metadata.max_seq_len,
            bmm1_scale=self.bmm1_scale,
            bmm2_scale=self.bmm2_scale,
        )

        # Flatten the output for consistent shape
        o = o.view(-1, o.shape[-2], o.shape[-1])

        # TODO: Return LSE pending support from Flashinfer API:
        # https://github.com/flashinfer-ai/flashinfer/pull/1566
        return o, None

_workspace_buffer `instance-attribute` ¶

_workspace_buffer = g_fi_workspace

bmm1_scale `instance-attribute` ¶

bmm1_scale: float | None = None

bmm2_scale `instance-attribute` ¶

bmm2_scale: float | None = None

__init__ ¶

__init__(
    num_heads: int,
    head_size: int,
    scale: float,
    num_kv_heads: int,
    alibi_slopes: list[float] | None,
    sliding_window: int | None,
    kv_cache_dtype: str,
    logits_soft_cap: float | None,
    attn_type: str,
    kv_sharing_target_layer_name: str | None,
    **mla_args,
) -> None

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

def __init__(
    self,
    num_heads: int,
    head_size: int,
    scale: float,
    num_kv_heads: int,
    alibi_slopes: list[float] | None,
    sliding_window: int | None,
    kv_cache_dtype: str,
    logits_soft_cap: float | None,
    attn_type: str,
    kv_sharing_target_layer_name: str | None,
    # MLA Specific Arguments
    **mla_args,
) -> None:
    """Initialise the common MLA state, then reject unsupported options.

    All arguments are forwarded unchanged to the parent constructor.

    Raises:
        NotImplementedError: If any of ``alibi_slopes``, ``sliding_window``
            or ``logits_soft_cap`` is truthy, or if ``attn_type`` is not
            ``AttentionType.DECODER``.
    """
    super().__init__(
        num_heads,
        head_size,
        scale,
        num_kv_heads,
        alibi_slopes,
        sliding_window,
        kv_cache_dtype,
        logits_soft_cap,
        attn_type,
        kv_sharing_target_layer_name,
        **mla_args,
    )

    # Truthiness check: None, 0 and empty lists count as "not requested".
    if any((alibi_slopes, sliding_window, logits_soft_cap)):
        raise NotImplementedError(
            "FlashInferMLAImpl does not support one of the following: "
            "alibi_slopes, sliding_window, logits_soft_cap"
        )

    if attn_type != AttentionType.DECODER:
        raise NotImplementedError(
            "Encoder self-attention and "
            "encoder/decoder cross-attention "
            "are not implemented for "
            "FlashInferMLAImpl"
        )

    # Kernel scales start unset and are assigned elsewhere before use.
    self.bmm1_scale: float | None = None
    self.bmm2_scale: float | None = None
    # Every instance references the same module-level workspace tensor.
    self._workspace_buffer = g_fi_workspace

_forward_decode ¶

_forward_decode(
    q: Tensor | tuple[Tensor, Tensor],
    kv_c_and_k_pe_cache: Tensor,
    attn_metadata: MLACommonMetadata,
    layer: AttentionLayer,
) -> tuple[Tensor, Tensor | None]

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

def _forward_decode(
    self,
    q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
    kv_c_and_k_pe_cache: torch.Tensor,
    attn_metadata: MLACommonMetadata,
    layer: AttentionLayer,
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """Run the decode attention kernel over the KV cache.

    Args:
        q: Either the full query tensor or a ``(q_nope, q_pe)`` pair, which
            is concatenated along the last dimension.
        kv_c_and_k_pe_cache: KV cache tensor; must contain elements.
        attn_metadata: Batch metadata; its ``decode`` field must be set.
        layer: Attention layer providing ``_q_scale_float``,
            ``_k_scale_float`` and ``_v_scale_float``.

    Returns:
        ``(output, None)``. The output is reshaped to three dimensions; the
        second element is a placeholder because LSE is not yet returned
        (see the TODO below).
    """
    assert kv_c_and_k_pe_cache.numel() > 0
    assert attn_metadata.decode is not None

    if isinstance(q, tuple):
        q_nope, q_pe = q
        q = torch.cat([q_nope, q_pe], dim=-1)

    # trtllm API requires extra dimension q_len_per_request for MTP
    if attn_metadata.num_decode_tokens % attn_metadata.num_decodes != 0:
        # Implicit string concatenation keeps source indentation and
        # newlines out of the emitted log message.
        logger.warning_once(
            "FlashInferMLAImpl got a query of uneven length. "
            "This usually indicates an issue in batch reordering "
            "or incorrect setup in dummy_run."
        )
        # Fallback: insert a length-1 dimension at index 1.
        q = q.unsqueeze(1)
    else:
        # Tokens divide evenly: fold them into (num_decodes, -1, ...),
        # keeping the last two dimensions of q as they are.
        q = q.view(attn_metadata.num_decodes, -1, q.shape[-2], q.shape[-1])

    # Computed once from the layer's scales and cached on the instance.
    if self.bmm1_scale is None:
        self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale
    if self.bmm2_scale is None:
        self.bmm2_scale = layer._v_scale_float

    o = trtllm_batch_decode_with_kv_cache_mla(
        query=q,
        # NOTE(review): the added dimension at index 1 is presumably what the
        # kernel expects for its cache layout; confirm against the FlashInfer
        # API.
        kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
        workspace_buffer=self._workspace_buffer,
        qk_nope_head_dim=self.qk_nope_head_dim,
        kv_lora_rank=self.kv_lora_rank,
        qk_rope_head_dim=self.qk_rope_head_dim,
        block_tables=attn_metadata.decode.block_table,
        seq_lens=attn_metadata.decode.seq_lens,
        max_seq_len=attn_metadata.max_seq_len,
        bmm1_scale=self.bmm1_scale,
        bmm2_scale=self.bmm2_scale,
    )

    # Flatten the output for consistent shape
    o = o.view(-1, o.shape[-2], o.shape[-1])

    # TODO: Return LSE pending support from Flashinfer API:
    # https://github.com/flashinfer-ai/flashinfer/pull/1566
    return o, None

FlashInferMLAMetadataBuilder ¶

Bases: MLACommonMetadataBuilder[MLACommonMetadata]

Source code in vllm/v1/attention/backends/mla/flashinfer_mla.py

class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
    """Metadata builder returned by ``FlashInferMLABackend.get_builder_cls``.

    Defines no methods of its own here; it only sets two class-level
    capability flags on top of ``MLACommonMetadataBuilder``.
    """

    # CUDA-graph support level declared for this builder: UNIFORM_BATCH.
    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
    # Query-length support declared for this builder: UNIFORM.
    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM

_cudagraph_support `class-attribute` ¶

_cudagraph_support: AttentionCGSupport = UNIFORM_BATCH

query_len_support `class-attribute` ¶

query_len_support: QueryLenSupport = UNIFORM

vllm.v1.attention.backends.mla.flashinfer_mla ¶

FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE module-attribute ¶

g_fi_workspace module-attribute ¶

logger module-attribute ¶

FlashInferMLABackend ¶

supported_dtypes class-attribute ¶

supported_kv_cache_dtypes class-attribute ¶

get_builder_cls staticmethod ¶

get_impl_cls staticmethod ¶

get_name staticmethod ¶

get_required_kv_cache_layout classmethod ¶

get_supported_kernel_block_sizes staticmethod ¶

supports_combination classmethod ¶

supports_compute_capability classmethod ¶

FlashInferMLAImpl ¶

_workspace_buffer instance-attribute ¶

bmm1_scale instance-attribute ¶

bmm2_scale instance-attribute ¶

__init__ ¶

_forward_decode ¶

FlashInferMLAMetadataBuilder ¶

_cudagraph_support class-attribute ¶

query_len_support class-attribute ¶

FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE `module-attribute` ¶

g_fi_workspace `module-attribute` ¶

logger `module-attribute` ¶

supported_dtypes `class-attribute` ¶

supported_kv_cache_dtypes `class-attribute` ¶

get_builder_cls `staticmethod` ¶

get_impl_cls `staticmethod` ¶

get_name `staticmethod` ¶

get_required_kv_cache_layout `classmethod` ¶

get_supported_kernel_block_sizes `staticmethod` ¶

supports_combination `classmethod` ¶

supports_compute_capability `classmethod` ¶

_workspace_buffer `instance-attribute` ¶

bmm1_scale `instance-attribute` ¶

bmm2_scale `instance-attribute` ¶

__init__ ¶

_cudagraph_support `class-attribute` ¶

query_len_support `class-attribute` ¶