Commit 97951590 authored by Tri Dao's avatar Tri Dao
Browse files

[Rotary] Set device before launching Triton kernel to avoid error

parent 6d673cd9
...@@ -205,6 +205,9 @@ def apply_rotary(
    grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads)  # noqa
    BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 64 else 4)
# Need this, otherwise Triton tries to launch from cuda:0 and we get
# ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
with torch.cuda.device(x.device.index):
        rotary_kernel[grid](
            output,  # data ptrs
            x,
......
...@@ -148,9 +148,6 @@ def test_baichuan_parallel_forward(model_name, world_size):
    rank = parallel_state.get_tensor_model_parallel_rank()
    process_group = parallel_state.get_tensor_model_parallel_group()
# Need this, otherwise the Triton kernel seems to launched from the wrong device.
torch.cuda.set_device(device)
    pretrained_state_dict = remap_state_dict_hf_baichuan(
        state_dict_from_pretrained(model_name), config
    )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment