Unverified Commit 08b7e4a2 authored by OlivierDehaene's avatar OlivierDehaene Committed by GitHub
Browse files

fix(server): fix flash neox rotary embeddings (#150)

parent 610bb1f9
...@@ -319,12 +319,12 @@ class FlashNeoxAttention(torch.nn.Module): ...@@ -319,12 +319,12 @@ class FlashNeoxAttention(torch.nn.Module):
layer_past[...] = qkv_rot[:, 1:] layer_past[...] = qkv_rot[:, 1:]
# output # output
attn_output = torch.empty_like(qkv[:, 0]) attn_output = torch.empty_like(qkv_rot[:, 0])
# flash attention # flash attention
flash_attn_cuda.fwd( flash_attn_cuda.fwd(
qkv[:, 0], qkv_rot[:, 0],
qkv[:, 1], qkv_rot[:, 1],
qkv[:, 2], qkv_rot[:, 2],
attn_output, attn_output,
cu_seqlens, cu_seqlens,
cu_seqlens, cu_seqlens,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment