Commit f1a73d07 authored by Tri Dao
Run isort and black on python files

parent cbb4cf5f
@@ -732,8 +732,12 @@ class ParallelMHA(nn.Module):
             self.num_heads % self.num_heads_kv == 0
         ), "num_heads must be divisible by num_heads_kv"
-        self.num_heads_per_rank = get_dim_for_local_rank(self.num_heads, self.world_size, self.local_rank)
-        self.num_heads_kv_per_rank = get_dim_for_local_rank(self.num_heads, self.world_size, self.local_rank)
+        self.num_heads_per_rank = get_dim_for_local_rank(
+            self.num_heads, self.world_size, self.local_rank
+        )
+        self.num_heads_kv_per_rank = get_dim_for_local_rank(
+            self.num_heads, self.world_size, self.local_rank
+        )
         self.head_dim = self.embed_dim // num_heads
         qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
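For context on the hunk above: get_dim_for_local_rank splits a global dimension (here the number of attention heads) across tensor-parallel ranks. Below is a minimal sketch of what such a helper typically computes, assuming an even split with the first ranks absorbing any remainder; it is an illustrative stand-in, not the flash-attn source.

# Hypothetical sketch (not the flash-attn implementation): evenly partition
# `dim` units, e.g. attention heads, across `world_size` tensor-parallel ranks,
# with the first `dim % world_size` ranks holding one extra unit.
def get_dim_for_local_rank_sketch(dim: int, world_size: int, local_rank: int) -> int:
    base, remainder = divmod(dim, world_size)
    return base + (1 if local_rank < remainder else 0)

# Example: 12 heads over 8 ranks -> ranks 0-3 hold 2 heads, ranks 4-7 hold 1.
assert [get_dim_for_local_rank_sketch(12, 8, r) for r in range(8)] == [2, 2, 2, 2, 1, 1, 1, 1]

Summed over all ranks, the per-rank values recover the global dimension, so no head is dropped or duplicated.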
@@ -5,7 +5,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 # 1/sqrt(2*pi)-> 0.3989423
 # 1/sqrt(2) -> 0.70710678
 # sqrt(2/pi) -> 0.79788456
@@ -18,17 +17,19 @@ def bias_gelu(y, bias):
     x = bias + y
     return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=y.dtype)
 # gradient of tanh approximation of gelu
 # gradient of actual gelu is:
 # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
 @torch.jit.script
 def bias_gelu_back(g, y, bias):
-    """Assume that y has shape (B, D) and bias has shape (D)
-    """
+    """Assume that y has shape (B, D) and bias has shape (D)"""
     x = bias + y
     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
+    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
+        1 + tanh_out
+    )
     grad_y = ff * g
     return grad_y.to(dtype=y.dtype), grad_y.sum(dim=(0), dtype=bias.dtype)
@@ -56,6 +57,7 @@ bias_gelu_impl = GeLUFunction.apply
 def gelu_fwd(x):
     return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=x.dtype)
 # gradient of tanh approximation of gelu
 # gradient of actual gelu is:
 # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@@ -63,7 +65,9 @@ def gelu_fwd(x):
 def gelu_bwd(g, x):
     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
+    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
+        1 + tanh_out
+    )
     return (ff * g).to(dtype=x.dtype)
@@ -76,10 +80,11 @@ class FastGeLUFunction(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
-        input, = ctx.saved_tensors
+        (input,) = ctx.saved_tensors
         tmp = gelu_bwd(grad_output, input)
         return tmp
 fast_gelu_impl = FastGeLUFunction.apply
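The hunks above only reflow the GELU code, but they are a convenient place to sanity-check the math: the tanh-approximation forward should match PyTorch's F.gelu(x, approximate="tanh"), and the hand-derived backward should agree with autograd through the forward. A standalone sketch of that check (not part of the commit; helper names are local to the sketch):

import torch
import torch.nn.functional as F

def gelu_fwd(x):
    # Tanh approximation of GELU, same formula as in the diff above.
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

def gelu_bwd(g, x):
    # Hand-derived gradient of the tanh approximation, same formula as above.
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
        1 + tanh_out
    )
    return ff * g

x = torch.randn(4, 8, dtype=torch.float64, requires_grad=True)
g = torch.randn_like(x)

# Forward matches PyTorch's tanh-approximate GELU (up to the truncated constants).
torch.testing.assert_close(gelu_fwd(x), F.gelu(x, approximate="tanh"))

# Hand-written backward matches autograd through the forward.
(auto_grad,) = torch.autograd.grad(gelu_fwd(x), x, grad_outputs=g)
torch.testing.assert_close(gelu_bwd(g, x), auto_grad)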