OpenDAS / text-generation-inference · Commit 022f6515
Unverified commit 022f6515, authored Jul 02, 2024 by Nicolas Patry; committed by GitHub, Jul 02, 2024.
Fixing graph capture for flash decoding. (#2163)
Parent: 4327210e

Showing 1 changed file with 3 additions and 2 deletions (+3 −2):
server/text_generation_server/models/flash_causal_lm.py
server/text_generation_server/models/flash_causal_lm.py (view file @ 022f6515)

@@ -926,7 +926,7 @@ class FlashCausalLM(Model):
             "slots": slots,
             "input_lengths": input_lengths,
         }
-        input_lengths = Seqlen(input_lengths=input_lengths)
+        input_lengths_ = Seqlen(input_lengths=input_lengths)
         graph = torch.cuda.CUDAGraph()
         self.cuda_graphs[bs]["graph"] = graph
@@ -939,7 +939,7 @@ class FlashCausalLM(Model):
             kv_cache=self.kv_cache,
             block_tables=block_tables,
             slots=slots,
-            input_lengths=input_lengths,
+            input_lengths=input_lengths_,
             max_s=max_s,
             prefill_cache_indices=None,
             lm_head_indices=None,
@@ -947,6 +947,7 @@ class FlashCausalLM(Model):
         torch.cuda.synchronize()
         with torch.cuda.graph(graph, pool=MEM_POOL):
+            input_lengths = Seqlen(input_lengths=input_lengths)
             logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,
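The diff addresses a common CUDA graph capture pitfall: tensors read inside the captured region must keep referring to the static buffers that are updated in place before each replay. The warmup path therefore wraps the raw tensor under a fresh name (`input_lengths_`) instead of rebinding `input_lengths`, and the `Seqlen` view used during capture is built inside the `torch.cuda.graph` region. Below is a minimal CPU-only sketch of the same static-buffer discipline; the `Seqlen` stand-in, `capture` helper, and toy model here are illustrative assumptions, not the real TGI implementation.

```python
import torch

class Seqlen:
    """Illustrative stand-in for TGI's Seqlen wrapper: just holds the tensor."""
    def __init__(self, input_lengths):
        self.input_lengths = input_lengths

def capture(static_input, model):
    # "Capture": bind to the static buffer exactly once. Replay reuses that
    # binding, mirroring how a captured CUDA graph re-reads static tensors.
    seqlen = Seqlen(input_lengths=static_input)
    def replay():
        return model(seqlen.input_lengths)
    return replay

static_input = torch.tensor([1.0, 2.0])
replay = capture(static_input, lambda t: t * 2)

# Feed new data by copying into the static buffer in place. Rebinding the
# name (static_input = ...) would break the link to the captured storage,
# which is the bug class this commit avoids.
static_input.copy_(torch.tensor([5.0, 7.0]))
out = replay()
assert torch.equal(out, torch.tensor([10.0, 14.0]))
```

The key invariant is that `seqlen.input_lengths` and `static_input` share storage, so in-place updates are visible on replay without re-capturing.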