test

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

test
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
018b1b5f · Woosuk Kwon · e9820408 · 018b1b5f
Commit 018b1b5f authored Nov 28, 2024 by Woosuk Kwon
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 10 deletions

csrc/flash_attn/flash_api.cpp +10 -10

No files found.
--- a/csrc/flash_attn/flash_api.cpp
+++ b/csrc/flash_attn/flash_api.cpp
@@ -406,16 +406,16 @@ mha_fwd(at::Tensor &q,         // batch_size x seqlen_q x num_heads x head_size
        params, batch_size, num_heads, head_size, seqlen_k, seqlen_q,
        head_size_rounded, p_dropout, /*num_splits*/ 0, dprops, opts);

-    // NOTE(woosuk): Commented out because they are not used in inference.
-    // // number of times random will be generated per thread, to offset philox counter in thc random
-    // // state
-    // // We use a custom RNG that increases the offset by batch_size * nheads * 32.
-    // int64_t counter_offset = params.b * params.h * 32;
-    // auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
-    // auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
-    // // Forward kernel will populate memory with the seed and offset.
-    // params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
+    // number of times random will be generated per thread, to offset philox counter in thc random
+    // state
+    // We use a custom RNG that increases the offset by batch_size * nheads * 32.
+    int64_t counter_offset = params.b * params.h * 32;
+    auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+    auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
+    // Forward kernel will populate memory with the seed and offset.
+    params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());

+    // NOTE(woosuk): Commented out because they are not used in inference.
    // if (p_dropout > 0.0)  {
    //     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
    //         gen_, at::cuda::detail::getDefaultCUDAGenerator());
@@ -661,7 +661,6 @@ mha_varlen_fwd(at::Tensor &q,  // total_q x num_heads x head_size, total_q := \s
                               p_dropout, /*num_splits*/ 0, dprops, opts);
    }

-    // NOTE(woosuk): Commented out because they are not used in inference.
    // number of times random will be generated per thread, to offset philox counter in thc random
    // state
    // We use a custom RNG that increases the offset by batch_size * nheads * 32.
@@ -671,6 +670,7 @@ mha_varlen_fwd(at::Tensor &q,  // total_q x num_heads x head_size, total_q := \s
    // Forward kernel will populate memory with the seed and offset.
    params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());

+    // NOTE(woosuk): Commented out because they are not used in inference.
    // if (p_dropout > 0.0)  {
    //     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
    //         gen_, at::cuda::detail::getDefaultCUDAGenerator());