[FusedDense] Run black on fused_dense.py

bcfa7c97 · Tri Dao · 2286d7ce · bcfa7c97 · bcfa7c97 · bcfa7c97
Commit bcfa7c97 authored Aug 16, 2023 by Tri Dao
Showing with 282 additions and 129 deletions

csrc/flash_attn/src/flash_bwd_kernel.h +1 -1

flash_attn/ops/fused_dense.py +278 -128

pyproject.toml +3 -0

No files found.
--- a/csrc/flash_attn/src/flash_bwd_kernel.h
+++ b/csrc/flash_attn/src/flash_bwd_kernel.h
@@ -822,7 +822,7 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params &params, const in
            // Putting this causal masking right after acc_s is *much* slower for some reason.
            // TD [2023-08-16]: We need the 2nd condition because if seqlen_q is long and seqlen_k is short
            // (e.g., 256 and 2), the 2nd block of seqlen_q (from 128 to 255), we're not doing causal masking.
-            // But we still want to mask out elements not beyond actual_seqlen_k.
+            // But we still want to mask out elements beyond actual_seqlen_k.
            if (m_block * kBlockM < (n_block + 1) * kBlockN
                || (!Is_even_MN && (n_block + 1) * kBlockN >= binfo.actual_seqlen_k)) {
                flash::apply_mask_causal(scores, n_block * kBlockN + (tidx / 32 / AtomLayoutMS) * MMA_N_SdP * 16,

--- a/flash_attn/ops/fused_dense.py
+++ b/flash_attn/ops/fused_dense.py
--- a/pyproject.toml
+++ b/pyproject.toml
+[tool.black]
+line-length = 100
+target-version = ['py38']
\ No newline at end of file