feat: add triton kernels to decrease latency of large batches (#2687)

* feat: add triton kernels to decrease latency of large batches * cast to int32 * fix kernel * fix kernel * disable triton on rocm * fix speculation * add slots filtering kernel

feat: add triton kernels to decrease latency of large batches (#2687)
* feat: add triton kernels to decrease latency of large batches * cast to int32 * fix kernel * fix kernel * disable triton on rocm * fix speculation * add slots filtering kernel
6f88bd93 · OlivierDehaene · GitHub · 0f346a32 · 6f88bd93 · 6f88bd93
Unverified Commit 6f88bd93 authored Oct 25, 2024 by OlivierDehaene Committed by GitHub Oct 25, 2024
4 changed files
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
--- a/server/text_generation_server/models/metadata_kernels.py
+++ b/server/text_generation_server/models/metadata_kernels.py
+import torch
+import triton
+
+import triton.language as tl
+
+from loguru import logger
+from typing import List, Optional
+from torch.utils._triton import has_triton as has_triton_torch
+
+from text_generation_server.utils.import_utils import (
+    SYSTEM,
+)
+from text_generation_server.utils.log import log_master
+
+_HAS_TRITON: Optional[bool] = None
+
+
+def has_triton():
+    global _HAS_TRITON
+    if _HAS_TRITON is None:
+        # FIXME: it seems that has_triton_torch is bugged on RocM
+        #        For now, only accept cuda
+        _HAS_TRITON = has_triton_torch() if SYSTEM == "cuda" else False
+        if _HAS_TRITON:
+            log_master(logger.info, "Using optimized Triton indexing kernels.")
+
+    return _HAS_TRITON
+
+
+def block_tables_to_padded(
+    max_blocks: int,
+    cu_seqlen: torch.Tensor,
+    block_tables: torch.Tensor,
+    block_tables_ragged: torch.Tensor,
+):
+    def grid(meta):
+        return (
+            triton.cdiv(max_blocks, meta["BLOCK_SIZE"]),
+            len(block_tables),
+        )
+
+    triton_block_tables_to_padded[grid](
+        cu_seqlen,
+        block_tables,
+        block_tables_ragged,
+        block_tables.shape[1],
+        BLOCK_SIZE=256,
+    )
+
+
+def block_tables_to_ragged(
+    *,
+    block_tables: torch.Tensor,
+    input_lengths: List[int],
+    cache_lengths: List[int],
+    input_lengths_tensor: torch.Tensor,
+    cache_lengths_tensor: torch.Tensor,
+    max_current_length: int
+) -> torch.Tensor:
+    """Convert block table to ragged format compatible with FlashInfer."""
+    assert len(input_lengths) == len(cache_lengths)
+
+    total_len = sum(input_lengths) + sum(cache_lengths)
+    block_tables_ragged = torch.empty(
+        total_len, dtype=torch.int32, device=block_tables.device
+    )
+
+    if has_triton():
+        cu_seqlen = torch.nn.functional.pad(
+            torch.cumsum(input_lengths_tensor + cache_lengths_tensor, dim=0), (1, 0)
+        )
+
+        def grid(meta):
+            return (
+                triton.cdiv(max_current_length, meta["BLOCK_SIZE"]),
+                len(cache_lengths),
+            )
+
+        triton_block_tables_to_ragged[grid](
+            cu_seqlen,
+            block_tables,
+            block_tables_ragged,
+            block_tables.shape[1],
+            BLOCK_SIZE=256,
+        )
+    else:
+        offset = 0
+        for i, (input_length, cache_length) in enumerate(
+            zip(input_lengths, cache_lengths)
+        ):
+            seq_len = cache_length + input_length
+            block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
+            offset += seq_len
+
+    return block_tables_ragged
+
+
+def copy_next_input_ids_inplace(
+    max_next_input_ids: int,
+    all_input_ids: torch.Tensor,
+    cache_lengths: torch.Tensor,
+    input_lengths: torch.Tensor,
+    prompt_lengths: torch.Tensor,
+    next_input_ids: torch.Tensor,
+    cu_accepted_ids: torch.Tensor,
+):
+    def grid(meta):
+        return (
+            triton.cdiv(max_next_input_ids, meta["BLOCK_SIZE"]),
+            len(all_input_ids),
+        )
+
+    triton_copy_next_input_ids_inplace[grid](
+        all_input_ids,
+        cache_lengths,
+        input_lengths,
+        prompt_lengths,
+        next_input_ids,
+        cu_accepted_ids,
+        all_input_ids.shape[1],
+        BLOCK_SIZE=16,
+    )
+
+
+def prepare_position_slot_ids(
+    max_input_length: int,
+    cache_lengths: torch.Tensor,
+    cu_seqlen: torch.Tensor,
+    cu_slots: torch.Tensor,
+    position_ids: torch.Tensor,
+    slot_indices: torch.Tensor,
+):
+    def grid(meta):
+        return (
+            triton.cdiv(max_input_length, meta["BLOCK_SIZE"]),
+            len(cache_lengths),
+        )
+
+    triton_prepare_position_slot_ids[grid](
+        cache_lengths, cu_seqlen, cu_slots, position_ids, slot_indices, BLOCK_SIZE=256
+    )
+
+
+def slots_filtering(
+    max_slots: int,
+    slots: torch.Tensor,
+    filtered_slots: torch.Tensor,
+    cu_slots: torch.Tensor,
+    slots_start: torch.Tensor,
+):
+    def grid(meta):
+        return (
+            triton.cdiv(max_slots, meta["BLOCK_SIZE"]),
+            len(slots_start),
+        )
+
+    triton_slots_filtering[grid](
+        slots, filtered_slots, slots_start, cu_slots, BLOCK_SIZE=256
+    )
+
+
+@triton.jit
+def triton_slots_filtering(
+    # Inputs
+    slots_ptr,
+    filtered_slots_ptr,
+    slots_start_ptr,
+    cu_slots_ptr,
+    # Const values
+    BLOCK_SIZE: "tl.constexpr",
+):
+    # Position in block_tables_ragged.numel() / BLOCK_SIZE
+    pid = tl.program_id(axis=0)
+    # Position in batch
+    bid = tl.program_id(axis=1)
+
+    block_start = pid * BLOCK_SIZE
+    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
+
+    filter_start = tl.load(slots_start_ptr + bid)
+
+    slot_start = tl.load(cu_slots_ptr + bid)
+    slot_end = tl.load(cu_slots_ptr + bid + 1)
+
+    mask = (slot_start + block_arange) < slot_end
+
+    slots = tl.load(slots_ptr + filter_start + block_arange, mask=mask)
+    tl.store(filtered_slots_ptr + slot_start + block_arange, slots, mask=mask)
+
+
+@triton.jit
+def triton_block_tables_to_padded(
+    # Inputs
+    cu_seqlen_ptr,
+    # Outputs
+    block_tables_ptr,
+    block_tables_ragged_ptr,
+    # Stride
+    stride_block_tables,
+    # Const values
+    BLOCK_SIZE: "tl.constexpr",
+):
+    # Position in block_tables_ragged.numel() / BLOCK_SIZE
+    pid = tl.program_id(axis=0)
+    # Position in batch
+    bid = tl.program_id(axis=1)
+
+    block_start = pid * BLOCK_SIZE
+    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
+
+    seq_start = tl.load(cu_seqlen_ptr + bid)
+    seq_end = tl.load(cu_seqlen_ptr + bid + 1)
+
+    mask = (seq_start + block_arange) < seq_end
+
+    blocks = tl.load(block_tables_ragged_ptr + seq_start + block_arange, mask=mask)
+    tl.store(
+        block_tables_ptr + bid * stride_block_tables + block_arange, blocks, mask=mask
+    )
+
+
+@triton.jit
+def triton_block_tables_to_ragged(
+    # Inputs
+    cu_seqlen_ptr,
+    # Outputs
+    block_tables_ptr,
+    block_tables_ragged_ptr,
+    # Stride
+    stride_block_tables,
+    # Const values
+    BLOCK_SIZE: "tl.constexpr",
+):
+    # Position in block_tables_ragged.numel() / BLOCK_SIZE
+    pid = tl.program_id(axis=0)
+    # Position in batch
+    bid = tl.program_id(axis=1)
+
+    block_start = pid * BLOCK_SIZE
+    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
+
+    seq_start = tl.load(cu_seqlen_ptr + bid)
+    seq_end = tl.load(cu_seqlen_ptr + bid + 1)
+
+    mask = (seq_start + block_arange) < seq_end
+
+    blocks = tl.load(
+        block_tables_ptr + bid * stride_block_tables + block_arange, mask=mask
+    )
+    tl.store(block_tables_ragged_ptr + seq_start + block_arange, blocks, mask=mask)
+
+
+@triton.jit
+def triton_copy_next_input_ids_inplace(
+    # Inputs
+    all_input_ids_ptr,
+    cache_lengths_ptr,
+    input_lengths_ptr,
+    prompt_lengths_ptr,
+    next_input_ids_ptr,
+    cu_accepted_ids_ptr,
+    # Stride
+    stride_all_input_ids,
+    # Const values
+    BLOCK_SIZE: "tl.constexpr",
+):
+    # Position in max_accepted_ids / BLOCK_SIZE
+    pid = tl.program_id(axis=0)
+    # Position in batch
+    bid = tl.program_id(axis=1)
+
+    block_start = pid * BLOCK_SIZE
+    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
+
+    # Used for correctly indexing in all_input_ids
+    cache_length = tl.load(cache_lengths_ptr + bid)
+    input_length = tl.load(input_lengths_ptr + bid)
+    prompt_length = tl.load(prompt_lengths_ptr + bid)
+
+    # Start/End of next_input_ids for this request
+    next_input_ids_start = tl.load(cu_accepted_ids_ptr + bid)
+    next_input_ids_end = tl.load(cu_accepted_ids_ptr + bid + 1)
+
+    # Mask values out of range
+    mask = (next_input_ids_start + block_arange) < next_input_ids_end
+
+    # Mask values for request still prefilling
+    decode_mask = (cache_length + input_length + block_arange) >= prompt_length
+
+    mask = mask & decode_mask
+
+    # Load this request next input ids
+    next_input_ids = tl.load(
+        next_input_ids_ptr + next_input_ids_start + block_arange, mask=mask
+    )
+
+    # Store in all_input_ids, since it is a 2D tensor, apply stride * bid
+    tl.store(
+        all_input_ids_ptr
+        + stride_all_input_ids * bid
+        + cache_length
+        + input_length
+        + block_arange,
+        next_input_ids,
+        mask=mask,
+    )
+
+
+@triton.jit
+def triton_prepare_position_slot_ids(
+    # Inputs
+    cache_lengths_ptr,
+    cu_seqlen_ptr,
+    cu_slots_ptr,
+    # Outputs
+    position_ids_ptr,
+    slot_indices_ptr,
+    # Const values
+    BLOCK_SIZE: "tl.constexpr",
+):
+    # Position in max_input_length / BLOCK_SIZE
+    pid = tl.program_id(axis=0)
+    # Position in batch
+    bid = tl.program_id(axis=1)
+
+    block_start = pid * BLOCK_SIZE
+    block_arange = block_start + tl.arange(0, BLOCK_SIZE)
+
+    cache_length = tl.load(cache_lengths_ptr + bid)
+
+    seq_start = tl.load(cu_seqlen_ptr + bid)
+    seq_end = tl.load(cu_seqlen_ptr + bid + 1)
+
+    slot_start = tl.load(cu_slots_ptr + bid)
+
+    mask = (seq_start + block_arange) < seq_end
+
+    tl.store(
+        position_ids_ptr + seq_start + block_arange,
+        cache_length + block_arange,
+        mask=mask,
+    )
+    tl.store(
+        slot_indices_ptr + seq_start + block_arange,
+        slot_start + cache_length + block_arange,
+        mask=mask,
+    )
--- a/server/text_generation_server/models/mllama_causal_lm.py
+++ b/server/text_generation_server/models/mllama_causal_lm.py
@@ -14,11 +14,9 @@ from transformers import (

 from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM
 from text_generation_server.pb import generate_pb2
-from text_generation_server.models.flash_causal_lm import (
-    block_tables_to_ragged,
-)
 from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION
 from text_generation_server.layers.attention import Seqlen
+from text_generation_server.models.metadata_kernels import block_tables_to_ragged


 tracer = trace.get_tracer(__name__)
@@ -283,6 +281,9 @@ class MllamaCausalLM(VlmCausalLM):
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
+                    input_lengths_tensor=batch.input_lengths_tensor,
+                    cache_lengths_tensor=batch.cache_lengths_tensor,
+                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
@@ -338,6 +339,9 @@ class MllamaCausalLM(VlmCausalLM):
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
+                input_lengths_tensor=batch.input_lengths_tensor,
+                cache_lengths_tensor=batch.cache_lengths_tensor,
+                max_current_length=batch.max_current_length,
            )
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:

--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -11,12 +11,12 @@ from text_generation_server.pb import generate_pb2
 from text_generation_server.models.flash_causal_lm import (
    FlashCausalLMBatch,
    FlashCausalLM,
-    block_tables_to_ragged,
 )
 from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION
 from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
 from text_generation_server.layers.attention import Seqlen
+from text_generation_server.models.metadata_kernels import block_tables_to_ragged

 tracer = trace.get_tracer(__name__)

@@ -363,6 +363,9 @@ class VlmCausalLM(FlashCausalLM):
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
+                    input_lengths_tensor=batch.input_lengths_tensor,
+                    cache_lengths_tensor=batch.cache_lengths_tensor,
+                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
@@ -411,6 +414,9 @@ class VlmCausalLM(FlashCausalLM):
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
+                input_lengths_tensor=batch.input_lengths_tensor,
+                cache_lengths_tensor=batch.cache_lengths_tensor,
+                max_current_length=batch.max_current_length,
            )
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else: