Unverified commit 5baa68d3 authored by Jeff Daily, committed by GitHub

use __launch_bounds__ for multi_tensor_apply (#44)

use __launch_bounds__(1024) for multi_tensor_apply, re-enable skipped tests
parent 85b56d01
@@ -28,6 +28,9 @@ template<int n> struct TensorListMetadata
template<typename T, typename U, typename... ArgTypes>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(1024)
+#endif
__global__ void multi_tensor_apply_kernel(
int chunk_size,
volatile int* noop_flag,
......
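For context on the hunk above: __launch_bounds__(N) is a promise to the compiler that the kernel will never be launched with more than N threads per block, so it can budget registers and other per-thread resources for that block size; wrapping it in #ifdef __HIP_PLATFORM_HCC__ applies the hint only when compiling for ROCm and leaves the CUDA build unchanged. Below is a minimal, self-contained sketch of the same pattern, not code from this commit; the kernel name, the BLOCK_SIZE macro, and the scaling body are illustrative assumptions.

// Sketch only: mirrors the __HIP_PLATFORM_HCC__-guarded __launch_bounds__ pattern above.
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_runtime.h>
#endif

#define BLOCK_SIZE 1024   // largest block size this kernel will ever be launched with

template<typename T>
#ifdef __HIP_PLATFORM_HCC__
__launch_bounds__(BLOCK_SIZE)   // cap per-thread resource usage so a 1024-thread block can launch on ROCm
#endif
__global__ void scale_kernel(int n, T a, const T* x, T* y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i];
}

// Launch with a block size no larger than the declared bound, e.g.:
//   scale_kernel<<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(n, 2.0f, d_x, d_y);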
@@ -6,7 +6,6 @@ import torch.nn.functional as F
import torch.optim as optim
from apex import amp
-from apex.testing.common_utils import skipIfRocm
from utils import common_init, FLOAT
@@ -161,7 +160,6 @@ class TestCheckpointing(unittest.TestCase):
# skip tests for different opt_levels
continue
-@skipIfRocm
def test_loss_scale_decrease(self):
num_losses = 3
nb_decrease_loss_scales = [0, 1, 2]
......
@@ -13,8 +13,6 @@ from torch.nn import Parameter
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
try:
import amp_C
disabled = False
......
@@ -12,8 +12,6 @@ from math import floor
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
try:
import amp_C
from amp_C import multi_tensor_axpby
@@ -103,7 +101,6 @@ class TestMultiTensorAxpby(unittest.TestCase):
# self.assertTrue(self.overflow_buf.item())
@unittest.skipIf(disabled, "amp_C is unavailable")
-@skipIfRocm
def test_fuzz(self):
input_size_pairs = (
(7777*77, 555*555),
@@ -143,7 +140,6 @@ class TestMultiTensorAxpby(unittest.TestCase):
@unittest.skipIf(disabled, "amp_C is unavailable")
@unittest.skipIf(not try_nhwc, "torch version is 1.4 or earlier, may not support nhwc")
-@skipIfRocm
def test_fuzz_nhwc(self):
input_size_pairs = (
((7, 77, 7, 77), (5, 55, 5, 55)),
......
@@ -11,8 +11,6 @@ import torch.nn.functional as F
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
try:
import amp_C
from amp_C import multi_tensor_l2norm
......
@@ -11,8 +11,6 @@ import torch.nn.functional as F
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
try:
import amp_C
from amp_C import multi_tensor_scale
@@ -90,7 +88,6 @@ class TestMultiTensorScale(unittest.TestCase):
# self.downscale(self.fp32, self.fp16, self.fp16_ref)
@unittest.skipIf(disabled, "amp_C is unavailable")
-@skipIfRocm
def test_fuzz(self):
input_size_pairs = (
(7777*77, 555*555),
......
@@ -13,8 +13,6 @@ from torch.nn import Parameter
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
class MyModel(torch.nn.Module):
def __init__(self, unique):
super(MyModel, self).__init__()
......
@@ -40,3 +40,6 @@ class TestFusedLayerNormElemWise(TestFusedLayerNorm):
self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
+if __name__ == '__main__':
+    unittest.main()
@@ -6,8 +6,6 @@ import torch
import apex
from itertools import product
-from apex.testing.common_utils import skipIfRocm
class TestFusedOptimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
@@ -87,7 +85,6 @@ class TestFusedAdam(TestFusedOptimizer):
self.ref_optim = torch.optim.Adam
self.fused_optim = apex.optimizers.FusedAdam
-@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@@ -98,12 +95,10 @@ class TestFusedAdam(TestFusedOptimizer):
# Uses apex optimizers(controlled by apex_only flag) for both types.
# Doesn't use upstream optimizer like other tests as they seem to be
# numerically unstable for half types
-@skipIfRocm
def test_bfloat16(self):
self.max_abs_diff = 1e-2
self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
-@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
@@ -196,7 +191,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
self.ref_optim = torch.optim.Adagrad
self.fused_optim = apex.optimizers.FusedAdagrad
-@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@@ -204,7 +198,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
-@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
@@ -213,7 +206,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
-@skipIfRocm
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
......
@@ -5,7 +5,6 @@ import torch
from torch.optim import Optimizer
import apex
from apex.multi_tensor_apply import multi_tensor_applier
-from apex.testing.common_utils import skipIfRocm
from itertools import product
class RefLAMB(Optimizer):
@@ -212,7 +211,6 @@ class TestFusedLAMB(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
-@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@@ -220,7 +218,6 @@ class TestFusedLAMB(unittest.TestCase):
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
-@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
@@ -228,7 +225,6 @@ class TestFusedLAMB(unittest.TestCase):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
-@skipIfRocm
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
weight_decay = [0, 0.01]
@@ -249,7 +245,6 @@ class TestFusedLAMB(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
-@skipIfRocm
def test_lamb_option(self):
nelem = 1
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
......
import unittest
import sys
-from apex.testing.common_utils import TEST_WITH_ROCM, skipIfRocm
+from apex.testing.common_utils import TEST_WITH_ROCM
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]
......