Unverified commit cc92a4b4 authored by Jithun Nair, committed by GitHub

Merge pull request #55 from ROCmSoftwarePlatform/IFU-master-2021-10-15

IFU-2021-10-15 (+ remove redundant defines + C10_CUDA_CHECK)
Parents: 1e0f9bc6 fec3141c

import torch

from apex.transformer.tensor_parallel import utils


def test_divide():
    # divide(8, 4) should return the even quotient, 2.
    assert utils.divide(8, 4) == 2
def test_split_tensor_along_last_dim():
    # Split a (100, 100, 100) tensor into 10 partitions along the last dim
    # and check that every partition has a last-dim size of 10.
    input_tensor = torch.randn((100, 100, 100))
    splits = utils.split_tensor_along_last_dim(input_tensor, 10)
    last_dim_shapes = torch.tensor([int(split.size()[-1]) for split in splits])
    assert torch.equal(last_dim_shapes, torch.full((10,), 10))
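
# For intuition, the helper above is expected to behave roughly like the sketch
# below (an assumption based on the assertion, not a copy of apex's code):
#
#   chunks = torch.split(input_tensor, 100 // 10, dim=-1)  # 10 views of shape (100, 100, 10)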


if __name__ == "__main__":
    test_divide()
    test_split_tensor_along_last_dim()
    print(">> passed the test :-)")

"""Test for fused softmax functions.
Ref: https://github.com/NVIDIA/Megatron-LM/blob/40becfc96c4144985458ac0e0fae45dbb111fbd2/megatron/fused_kernels/tests/test_fused_kernels.py
""" # NOQA
import itertools
import unittest
import torch
from apex.transformer import AttnMaskType
from apex.transformer.functional import FusedScaleMaskSoftmax


def attention_mask_func(attention_scores, attention_mask):
    # Positions where the mask is True are filled with a large negative value
    # so they contribute (almost) nothing after the softmax.
    return attention_scores.masked_fill(attention_mask, -10000.0)


autocast_dtypes = (
    (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
)
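
# For context: the unfused `torch_fn` built below is expected to behave roughly
# like the sketch here (an assumption drawn from the parameters under test, not
# a copy of apex's implementation):
#
#   def reference_scale_mask_softmax(scores, mask, scale=None):
#       if scale is not None:
#           scores = scores * scale
#       scores = attention_mask_func(scores, mask)   # mask fill with -10000.0
#       return torch.nn.functional.softmax(scores, dim=-1)
#
# The fused kernels are checked for numerical agreement against this eager
# path; the tests also run a backward pass through both implementations.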


class TestFusedScaleMaskSoftmax(unittest.TestCase):
    def _setup_fused_softmax(
        self,
        input_in_fp16,
        input_in_bf16,
        scale=None,
        softmax_in_fp32=False,
        attn_mask_type=AttnMaskType.padding,
    ):
        # Build a fused/unfused pair with identical settings; the tests compare
        # the two implementations against each other.
        fused_fn = FusedScaleMaskSoftmax(
            input_in_fp16=input_in_fp16,
            input_in_bf16=input_in_bf16,
            mask_func=attention_mask_func,
            scale=scale,
            softmax_in_fp32=softmax_in_fp32,
            attn_mask_type=attn_mask_type,
            scaled_masked_softmax_fusion=True,
        )
        torch_fn = FusedScaleMaskSoftmax(
            input_in_fp16=input_in_fp16,
            input_in_bf16=input_in_bf16,
            mask_func=attention_mask_func,
            scale=scale,
            softmax_in_fp32=softmax_in_fp32,
            attn_mask_type=attn_mask_type,
            scaled_masked_softmax_fusion=False,
        )
        return fused_fn, torch_fn
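
    # Pattern used by the tests below (a sketch of the usage, not additional API):
    #   fused_fn, torch_fn = self._setup_fused_softmax(input_in_fp16=True, input_in_bf16=False)
    #   out_fused = fused_fn(attention_scores, mask)  # fused-kernel path
    #   out_eager = torch_fn(attention_scores, mask)  # plain PyTorch path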

    def test_fused_scale_mask_softmax(self):
        """
        attention_scores.shape = [4, 12, 24, 24]
        mask.shape = [4, 1, 24, 24]
        """
        for (dtype, scale, softmax_in_fp32) in itertools.product(
            (torch.half, torch.bfloat16),
            (None, 2.0),
            (False, True),
        ):
            with self.subTest(f"{dtype}-{scale}-{softmax_in_fp32}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                if not (scale is None or softmax_in_fp32):
                    # Scaling without fp32 softmax is invalid; constructing the
                    # module must fail for this combination.
                    with self.assertRaises(RuntimeError):
                        self._setup_fused_softmax(
                            input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.padding
                        )
                    # Keep iterating so the remaining (valid) combinations are still tested.
                    continue
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.padding
                )
                attention_scores_0 = (
                    torch.randn((4, 12, 24, 24)).to(device="cuda", dtype=dtype).requires_grad_(True)
                )
                with torch.no_grad():
                    attention_scores_1 = attention_scores_0.clone().requires_grad_(True)
                mask = torch.randint(0, 2, (4, 1, 24, 24), device="cuda").bool()

                expected = fused_fn(attention_scores_0, mask)
                actual = torch_fn(attention_scores_1, mask)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.rand_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                expected.backward(g0)
                actual.backward(g1)

    def test_autocast_fused_scale_mask_softmax(self):
        for dtype in autocast_dtypes:
            with self.subTest(f"{dtype}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, attn_mask_type=AttnMaskType.padding
                )
                attention_scores_0 = torch.randn((4, 12, 24, 24)).cuda().requires_grad_(True)
                with torch.no_grad():
                    attention_scores_1 = attention_scores_0.clone().to(dtype).requires_grad_(True)
                mask = torch.randint(0, 2, (4, 1, 24, 24)).bool().cuda()

                expected = torch_fn(attention_scores_1, mask)
                with torch.cuda.amp.autocast(dtype=dtype):
                    actual = fused_fn(attention_scores_0, mask)
                    self.assertEqual(actual.dtype, dtype)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.rand_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                expected.backward(g0)
                actual.backward(g1)

    def test_fused_upper_triangle_mask_softmax(self):
        """
        attn_weights.shape: [4, 12, 24, 24]
        total_mask.shape: [4, 1, 24, 24]

        total_mask[0, 0] is a 24x24 boolean matrix whose strictly upper-triangular
        elements are True, while the lower-triangular and diagonal elements are
        False, i.e. every position may only attend to itself and earlier positions.
        """
        for (dtype, scale, softmax_in_fp32) in itertools.product(
            (torch.half, torch.bfloat16),
            (None, 2.0),
            (False, True),
        ):
            with self.subTest(f"{dtype}-{scale}-{softmax_in_fp32}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                if not (scale is None or softmax_in_fp32):
                    # Scaling without fp32 softmax is invalid; constructing the
                    # module must fail for this combination.
                    with self.assertRaises(RuntimeError):
                        self._setup_fused_softmax(
                            input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.causal
                        )
                    # Keep iterating so the remaining (valid) combinations are still tested.
                    continue
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.causal
                )
                attn_weights_0 = (
                    torch.randn((4, 12, 24, 24)).to(device="cuda", dtype=dtype).requires_grad_(True)
                )
                with torch.no_grad():
                    attn_weights_1 = attn_weights_0.clone().requires_grad_(True)
                total_mask = (~(
                    torch.tril(torch.randn((24, 24), device="cuda")).bool()
                ).unsqueeze(0).unsqueeze(0))
                total_mask = total_mask.repeat((4, 1, 1, 1))

                expected = fused_fn(attn_weights_0, total_mask)
                actual = torch_fn(attn_weights_1, total_mask)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.randn_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                actual.backward(g0)
                expected.backward(g1)

    def test_autocast_fused_upper_triangle_mask_softmax(self):
        for dtype in autocast_dtypes:
            with self.subTest(f"{dtype}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, attn_mask_type=AttnMaskType.causal
                )
                attn_weights_0 = torch.randn((4, 12, 24, 24)).cuda().requires_grad_(True)
                with torch.no_grad():
                    attn_weights_1 = attn_weights_0.clone().to(dtype).requires_grad_(True)
                total_mask = (~(
                    torch.tril(torch.randn((24, 24), device="cuda")).bool()
                ).unsqueeze(0).unsqueeze(0))

                with torch.cuda.amp.autocast(dtype=dtype):
                    actual = fused_fn(attn_weights_0, total_mask)
                    self.assertEqual(actual.dtype, dtype)
                expected = torch_fn(attn_weights_1, total_mask)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.randn_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                actual.backward(g0)
                expected.backward(g1)

import os
import subprocess
import sys
import unittest


def run_mpu_tests():
    python_executable_path = sys.executable
    # repository_root = os.path.join(os.path.dirname(__file__), "../../../")
    # directory = os.path.abspath(os.path.join(repository_root, "tests/mpu"))
    directory = os.path.dirname(__file__)
    # Collect every standalone `run_*` test script living next to this file.
    files = [
        os.path.join(directory, f) for f in os.listdir(directory)
        if f.startswith("run_") and os.path.isfile(os.path.join(directory, f))
    ]
    print("#######################################################")
    print(f"# Python executable path: {python_executable_path}")
    print(f"# {len(files)} tests: {files}")
    print("#######################################################")
    errors = []
    for i, test_file in enumerate(files, 1):
        test_run_cmd = f"NVIDIA_TF32_OVERRIDE=0 {python_executable_path} {test_file} --micro-batch-size 2 --num-layers 1 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 32 --encoder-seq-length 32 --use-cpu-initialization"  # NOQA
        print(f"### {i} / {len(files)}: cmd: {test_run_cmd}")
        try:
            output = subprocess.check_output(
                test_run_cmd, shell=True
            ).decode(sys.stdout.encoding).strip()
        except Exception as e:
            errors.append((test_file, str(e)))
        else:
            if '>> passed the test :-)' not in output:
                # Record the failing file together with its captured output.
                errors.append((test_file, output))
    if not errors:
        print("### PASSED")
    else:
        print("### FAILED")
        short_msg = f"{len(errors)} out of {len(files)} tests failed"
        print(short_msg)
        for (filename, log) in errors:
            print(f"File: {filename}\nLog: {log}")
        raise RuntimeError(short_msg)


class TestMPU(unittest.TestCase):
    def test_mpu(self):
        run_mpu_tests()


if __name__ == '__main__':
    unittest.main()