Unverified commit cc92a4b4 authored by Jithun Nair, committed by GitHub

Merge pull request #55 from ROCmSoftwarePlatform/IFU-master-2021-10-15

IFU-2021-10-15 (+ remove redundant defines + C10_CUDA_CHECK)
Parents: 1e0f9bc6 fec3141c

import torch

from apex.transformer.tensor_parallel import utils


def test_divide():
    # divide(8, 4) should return the even quotient, 2.
    assert utils.divide(8, 4) == 2
def test_split_tensor_along_last_dim():
    # Split a (100, 100, 100) tensor into 10 partitions along the last dim
    # and check that every partition has a last-dim size of 10.
    input_tensor = torch.randn((100, 100, 100))
    splits = utils.split_tensor_along_last_dim(input_tensor, 10)
    last_dim_shapes = torch.tensor([int(split.size()[-1]) for split in splits])
    assert torch.equal(last_dim_shapes, torch.full((10,), 10))
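
# For intuition, the helper above is expected to behave roughly like the sketch
# below (an assumption based on the assertion, not a copy of apex's code):
#
#   chunks = torch.split(input_tensor, 100 // 10, dim=-1)  # 10 views of shape (100, 100, 10)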


if __name__ == "__main__":
    test_divide()
    test_split_tensor_along_last_dim()
    print(">> passed the test :-)")

"""Test for fused softmax functions.
Ref: https://github.com/NVIDIA/Megatron-LM/blob/40becfc96c4144985458ac0e0fae45dbb111fbd2/megatron/fused_kernels/tests/test_fused_kernels.py
""" # NOQA
import itertools
import unittest
import torch
from apex.transformer import AttnMaskType
from apex.transformer.functional import FusedScaleMaskSoftmax


def attention_mask_func(attention_scores, attention_mask):
    # Positions where the mask is True are filled with a large negative value
    # so they contribute (almost) nothing after the softmax.
    return attention_scores.masked_fill(attention_mask, -10000.0)


autocast_dtypes = (
    (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
)
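
# For context: the unfused `torch_fn` built below is expected to behave roughly
# like the sketch here (an assumption drawn from the parameters under test, not
# a copy of apex's implementation):
#
#   def reference_scale_mask_softmax(scores, mask, scale=None):
#       if scale is not None:
#           scores = scores * scale
#       scores = attention_mask_func(scores, mask)   # mask fill with -10000.0
#       return torch.nn.functional.softmax(scores, dim=-1)
#
# The fused kernels are checked for numerical agreement against this eager
# path; the tests also run a backward pass through both implementations.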


class TestFusedScaleMaskSoftmax(unittest.TestCase):
    def _setup_fused_softmax(
        self,
        input_in_fp16,
        input_in_bf16,
        scale=None,
        softmax_in_fp32=False,
        attn_mask_type=AttnMaskType.padding,
    ):
        # Build a fused/unfused pair with identical settings; the tests compare
        # the two implementations against each other.
        fused_fn = FusedScaleMaskSoftmax(
            input_in_fp16=input_in_fp16,
            input_in_bf16=input_in_bf16,
            mask_func=attention_mask_func,
            scale=scale,
            softmax_in_fp32=softmax_in_fp32,
            attn_mask_type=attn_mask_type,
            scaled_masked_softmax_fusion=True,
        )
        torch_fn = FusedScaleMaskSoftmax(
            input_in_fp16=input_in_fp16,
            input_in_bf16=input_in_bf16,
            mask_func=attention_mask_func,
            scale=scale,
            softmax_in_fp32=softmax_in_fp32,
            attn_mask_type=attn_mask_type,
            scaled_masked_softmax_fusion=False,
        )
        return fused_fn, torch_fn
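
    # Pattern used by the tests below (a sketch of the usage, not additional API):
    #   fused_fn, torch_fn = self._setup_fused_softmax(input_in_fp16=True, input_in_bf16=False)
    #   out_fused = fused_fn(attention_scores, mask)  # fused-kernel path
    #   out_eager = torch_fn(attention_scores, mask)  # plain PyTorch path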

    def test_fused_scale_mask_softmax(self):
        """
        attention_scores.shape = [4, 12, 24, 24]
        mask.shape = [4, 1, 24, 24]
        """
        for (dtype, scale, softmax_in_fp32) in itertools.product(
            (torch.half, torch.bfloat16),
            (None, 2.0),
            (False, True),
        ):
            with self.subTest(f"{dtype}-{scale}-{softmax_in_fp32}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                if not (scale is None or softmax_in_fp32):
                    # Scaling without fp32 softmax is invalid; constructing the
                    # module must fail for this combination.
                    with self.assertRaises(RuntimeError):
                        self._setup_fused_softmax(
                            input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.padding
                        )
                    # Keep iterating so the remaining (valid) combinations are still tested.
                    continue
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.padding
                )
                attention_scores_0 = (
                    torch.randn((4, 12, 24, 24)).to(device="cuda", dtype=dtype).requires_grad_(True)
                )
                with torch.no_grad():
                    attention_scores_1 = attention_scores_0.clone().requires_grad_(True)
                mask = torch.randint(0, 2, (4, 1, 24, 24), device="cuda").bool()

                expected = fused_fn(attention_scores_0, mask)
                actual = torch_fn(attention_scores_1, mask)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.rand_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                expected.backward(g0)
                actual.backward(g1)

    def test_autocast_fused_scale_mask_softmax(self):
        for dtype in autocast_dtypes:
            with self.subTest(f"{dtype}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, attn_mask_type=AttnMaskType.padding
                )
                attention_scores_0 = torch.randn((4, 12, 24, 24)).cuda().requires_grad_(True)
                with torch.no_grad():
                    attention_scores_1 = attention_scores_0.clone().to(dtype).requires_grad_(True)
                mask = torch.randint(0, 2, (4, 1, 24, 24)).bool().cuda()

                expected = torch_fn(attention_scores_1, mask)
                with torch.cuda.amp.autocast(dtype=dtype):
                    actual = fused_fn(attention_scores_0, mask)
                    self.assertEqual(actual.dtype, dtype)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.rand_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                expected.backward(g0)
                actual.backward(g1)

    def test_fused_upper_triangle_mask_softmax(self):
        """
        attn_weights.shape: [4, 12, 24, 24]
        total_mask.shape: [4, 1, 24, 24]

        total_mask[0, 0] is a 24x24 boolean matrix whose strictly upper-triangular
        elements are True, while the lower-triangular and diagonal elements are
        False, i.e. every position may only attend to itself and earlier positions.
        """
        for (dtype, scale, softmax_in_fp32) in itertools.product(
            (torch.half, torch.bfloat16),
            (None, 2.0),
            (False, True),
        ):
            with self.subTest(f"{dtype}-{scale}-{softmax_in_fp32}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                if not (scale is None or softmax_in_fp32):
                    # Scaling without fp32 softmax is invalid; constructing the
                    # module must fail for this combination.
                    with self.assertRaises(RuntimeError):
                        self._setup_fused_softmax(
                            input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.causal
                        )
                    # Keep iterating so the remaining (valid) combinations are still tested.
                    continue
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, scale, softmax_in_fp32, AttnMaskType.causal
                )
                attn_weights_0 = (
                    torch.randn((4, 12, 24, 24)).to(device="cuda", dtype=dtype).requires_grad_(True)
                )
                with torch.no_grad():
                    attn_weights_1 = attn_weights_0.clone().requires_grad_(True)
                total_mask = (~(
                    torch.tril(torch.randn((24, 24), device="cuda")).bool()
                ).unsqueeze(0).unsqueeze(0))
                total_mask = total_mask.repeat((4, 1, 1, 1))

                expected = fused_fn(attn_weights_0, total_mask)
                actual = torch_fn(attn_weights_1, total_mask)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.randn_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                actual.backward(g0)
                expected.backward(g1)

    def test_autocast_fused_upper_triangle_mask_softmax(self):
        for dtype in autocast_dtypes:
            with self.subTest(f"{dtype}"):
                input_in_fp16 = dtype == torch.half
                input_in_bf16 = dtype == torch.bfloat16
                fused_fn, torch_fn = self._setup_fused_softmax(
                    input_in_fp16, input_in_bf16, attn_mask_type=AttnMaskType.causal
                )
                attn_weights_0 = torch.randn((4, 12, 24, 24)).cuda().requires_grad_(True)
                with torch.no_grad():
                    attn_weights_1 = attn_weights_0.clone().to(dtype).requires_grad_(True)
                total_mask = (~(
                    torch.tril(torch.randn((24, 24), device="cuda")).bool()
                ).unsqueeze(0).unsqueeze(0))

                with torch.cuda.amp.autocast(dtype=dtype):
                    actual = fused_fn(attn_weights_0, total_mask)
                    self.assertEqual(actual.dtype, dtype)
                expected = torch_fn(attn_weights_1, total_mask)
                torch.testing.assert_allclose(actual, expected)

                g0 = torch.randn_like(actual)
                with torch.no_grad():
                    g1 = g0.clone()
                actual.backward(g0)
                expected.backward(g1)

import os
import subprocess
import sys
import unittest


def run_mpu_tests():
    python_executable_path = sys.executable
    # repository_root = os.path.join(os.path.dirname(__file__), "../../../")
    # directory = os.path.abspath(os.path.join(repository_root, "tests/mpu"))
    directory = os.path.dirname(__file__)
    # Collect every standalone `run_*` test script living next to this file.
    files = [
        os.path.join(directory, f) for f in os.listdir(directory)
        if f.startswith("run_") and os.path.isfile(os.path.join(directory, f))
    ]
    print("#######################################################")
    print(f"# Python executable path: {python_executable_path}")
    print(f"# {len(files)} tests: {files}")
    print("#######################################################")
    errors = []
    for i, test_file in enumerate(files, 1):
        test_run_cmd = f"NVIDIA_TF32_OVERRIDE=0 {python_executable_path} {test_file} --micro-batch-size 2 --num-layers 1 --hidden-size 256 --num-attention-heads 8 --max-position-embeddings 32 --encoder-seq-length 32 --use-cpu-initialization"  # NOQA
        print(f"### {i} / {len(files)}: cmd: {test_run_cmd}")
        try:
            output = subprocess.check_output(
                test_run_cmd, shell=True
            ).decode(sys.stdout.encoding).strip()
        except Exception as e:
            errors.append((test_file, str(e)))
        else:
            if '>> passed the test :-)' not in output:
                # Record the failing file together with its captured output.
                errors.append((test_file, output))
    if not errors:
        print("### PASSED")
    else:
        print("### FAILED")
        short_msg = f"{len(errors)} out of {len(files)} tests failed"
        print(short_msg)
        for (filename, log) in errors:
            print(f"File: {filename}\nLog: {log}")
        raise RuntimeError(short_msg)


class TestMPU(unittest.TestCase):
    def test_mpu(self):
        run_mpu_tests()


if __name__ == '__main__':
    unittest.main()