"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "73bb97adfc3d0cb184c5fd66a1d5699c249a7fd8"
Unverified commit 5baa68d3, authored by Jeff Daily, committed by GitHub

use __launch_bounds__ for multi_tensor_apply (#44)

use __launch_bounds__(1024) for multi_tensor_apply, re-enable skipped tests
parent 85b56d01
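For readers unfamiliar with the attribute: __launch_bounds__(1024) tells the compiler that the annotated kernel is never launched with more than 1024 threads per block, so it can budget registers and occupancy for that bound. Guarding it with __HIP_PLATFORM_HCC__ applies the hint only when compiling under HIP/ROCm. A minimal sketch of the pattern (illustrative kernel and names, not the apex kernel itself):

    // Illustrative only; the diff below applies this pattern to multi_tensor_apply_kernel.
    #ifdef __HIP_PLATFORM_HCC__
    __launch_bounds__(1024)
    #endif
    __global__ void scale_kernel(float* data, float factor, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            data[i] *= factor;  // element-wise scale, launched with at most 1024 threads per block
    }

    // Example launch respecting the bound:
    //   scale_kernel<<<(n + 1023) / 1024, 1024>>>(d_data, 2.0f, n);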
@@ -28,6 +28,9 @@ template<int n> struct TensorListMetadata
 template<typename T, typename U, typename... ArgTypes>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(1024)
+#endif
 __global__ void multi_tensor_apply_kernel(
     int chunk_size,
     volatile int* noop_flag,
...
@@ -6,7 +6,6 @@ import torch.nn.functional as F
 import torch.optim as optim
 from apex import amp
-from apex.testing.common_utils import skipIfRocm
 from utils import common_init, FLOAT
@@ -44,7 +43,7 @@ class TestCheckpointing(unittest.TestCase):
                          'Parameter in state_dict not FLOAT')
     def train_step(self, model, optimizer, data, loss_ids):
         optimizer.zero_grad()
         output = model(data)
@@ -102,12 +101,12 @@ class TestCheckpointing(unittest.TestCase):
             if opt_level == res_opt_level:
                 # train for nb_epochs and restore after nb_epochs_restore
                 for epoch in range(nb_epochs):
                     x = torch.randn(16, 3, 24, 24, device='cuda')
                     output = self.train_step(
                         model, optimizer, x, range(num_losses))
                     # Initialize model one step before comparing.
                     # Otherwise the batchnorm layers will be updated
                     # additionally in restore_model
                     if epoch == (nb_epochs_restore - 1):
                         # Load model and optimizer
@@ -161,7 +160,6 @@ class TestCheckpointing(unittest.TestCase):
                 # skip tests for different opt_levels
                 continue
-    @skipIfRocm
     def test_loss_scale_decrease(self):
         num_losses = 3
         nb_decrease_loss_scales = [0, 1, 2]
@@ -171,10 +169,10 @@ class TestCheckpointing(unittest.TestCase):
             nb_decrease_loss_scales_tmp = list(nb_decrease_loss_scales)
             model = MyModel().to('cuda')
             optimizer = optim.SGD(model.parameters(),
                                   lr=self.initial_lr)
             model, optimizer = amp.initialize(
                 model, optimizer, opt_level=opt_level, num_losses=num_losses,
                 verbosity=0)
@@ -182,26 +180,26 @@ class TestCheckpointing(unittest.TestCase):
             if amp._amp_state.opt_properties.loss_scale != 'dynamic':
                 #print('Static loss scale set. Skipping opt_level.')
                 continue
             # force to skip some updates to decrease the loss_scale
             initial_loss_scales = []
             for idx in range(num_losses):
                 initial_loss_scales.append(
                     amp._amp_state.loss_scalers[idx].loss_scale())
             for _ in range(len(nb_decrease_loss_scales)):
                 x = torch.randn(16, 3, 24, 24, device='cuda')
                 for idx in range(num_losses):
                     while nb_decrease_loss_scales_tmp[idx] > 0:
                         optimizer.zero_grad()
                         output = model(x * 2**17)
                         loss = output.mean()
                         with amp.scale_loss(loss, optimizer, loss_id=idx) as scaled_loss:
                             scaled_loss.backward(retain_graph=True)
                         optimizer.step()
                         nb_decrease_loss_scales_tmp[idx] -= 1
             # Check loss scales afterwards
             updated_loss_scales = []
             for idx in range(num_losses):
@@ -243,7 +241,7 @@ class TestCheckpointing(unittest.TestCase):
         # Create dummy data
         data = torch.randn(10, 3, 4, 4, device='cuda')
         target = torch.randn(10, 6, 4, 4, device='cuda')
         # Get initnial loss
         optimizer.zero_grad()
         output = model(data)
@@ -266,4 +264,4 @@ class TestCheckpointing(unittest.TestCase):
 if __name__=='__main__':
     unittest.main()
@@ -13,8 +13,6 @@ from torch.nn import Parameter
 from utils import common_init, HALF, FLOAT,\
     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
 try:
     import amp_C
     disabled = False
...
@@ -12,8 +12,6 @@ from math import floor
 from utils import common_init, HALF, FLOAT,\
     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
 try:
     import amp_C
     from amp_C import multi_tensor_axpby
@@ -103,7 +101,6 @@ class TestMultiTensorAxpby(unittest.TestCase):
         # self.assertTrue(self.overflow_buf.item())
     @unittest.skipIf(disabled, "amp_C is unavailable")
-    @skipIfRocm
     def test_fuzz(self):
         input_size_pairs = (
             (7777*77, 555*555),
@@ -143,7 +140,6 @@ class TestMultiTensorAxpby(unittest.TestCase):
     @unittest.skipIf(disabled, "amp_C is unavailable")
     @unittest.skipIf(not try_nhwc, "torch version is 1.4 or earlier, may not support nhwc")
-    @skipIfRocm
     def test_fuzz_nhwc(self):
         input_size_pairs = (
             ((7, 77, 7, 77), (5, 55, 5, 55)),
...
@@ -11,8 +11,6 @@ import torch.nn.functional as F
 from utils import common_init, HALF, FLOAT,\
     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
 try:
     import amp_C
     from amp_C import multi_tensor_l2norm
@@ -69,7 +67,7 @@ class TestMultiTensorL2Norm(unittest.TestCase):
             (33333, 555),
             (555, 33333))
         appliers = (
             MultiTensorApply(2048*32),
             MultiTensorApply(333),
             MultiTensorApply(33333))
         repeat_tensors = (
...
@@ -11,11 +11,9 @@ import torch.nn.functional as F
 from utils import common_init, HALF, FLOAT,\
     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
 try:
     import amp_C
     from amp_C import multi_tensor_scale
     from apex.multi_tensor_apply import MultiTensorApply
     disabled = False
 except ImportError as err:
@@ -56,7 +54,7 @@ class TestMultiTensorScale(unittest.TestCase):
         out_list = [out.float() for out in out_list]
         self.assertTrue(all([torch.allclose(out, self.ref.to(out.dtype)) for out in out_list]))
         self.assertTrue(self.overflow_buf.item() == 0)
     def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False):
         self.overflow_buf.zero_()
         a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
@@ -84,13 +82,12 @@ class TestMultiTensorScale(unittest.TestCase):
     # @unittest.skipIf(disabled, "amp_C is unavailable")
     # def test_fp16_to_fp16(self):
     #     self.downscale(self.fp16, self.fp16, self.fp16_ref)
     #
     # @unittest.skipIf(disabled, "amp_C is unavailable")
     # def test_fp32_to_fp16(self):
     #     self.downscale(self.fp32, self.fp16, self.fp16_ref)
     @unittest.skipIf(disabled, "amp_C is unavailable")
-    @skipIfRocm
     def test_fuzz(self):
         input_size_pairs = (
             (7777*77, 555*555),
@@ -102,7 +99,7 @@ class TestMultiTensorScale(unittest.TestCase):
             (33333, 555),
             (555, 33333))
         appliers = (
             MultiTensorApply(2048*32),
             MultiTensorApply(333),
             MultiTensorApply(33333))
         repeat_tensors = (
...
@@ -13,8 +13,6 @@ from torch.nn import Parameter
 from utils import common_init, HALF, FLOAT,\
     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
-from apex.testing.common_utils import skipIfRocm
 class MyModel(torch.nn.Module):
     def __init__(self, unique):
         super(MyModel, self).__init__()
@@ -43,7 +41,7 @@ class TestMultipleModelsOptimizersLosses(unittest.TestCase):
     def tearDown(self):
         pass
     def test_2models2losses1optimizer(self):
         model0 = MyModel(1)
         model1 = MyModel(2)
...
@@ -6,7 +6,7 @@ import torch
 import apex
 from torch.autograd import Variable
 class TestFusedLayerNorm(unittest.TestCase):
     def setUp(self):
         # bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
@@ -33,10 +33,13 @@ class TestFusedLayerNorm(unittest.TestCase):
     def test_large_batch(self):
         self._test_same_output(65536)
 class TestFusedLayerNormElemWise(TestFusedLayerNorm):
     def setUp(self):
         self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
         self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
+if __name__ == '__main__':
+    unittest.main()
@@ -6,8 +6,6 @@ import torch
 import apex
 from itertools import product
-from apex.testing.common_utils import skipIfRocm
 class TestFusedOptimizer(unittest.TestCase):
     def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
         self.max_abs_diff = max_abs_diff
@@ -87,7 +85,6 @@ class TestFusedAdam(TestFusedOptimizer):
         self.ref_optim = torch.optim.Adam
         self.fused_optim = apex.optimizers.FusedAdam
-    @skipIfRocm
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)
@@ -98,12 +95,10 @@ class TestFusedAdam(TestFusedOptimizer):
     # Uses apex optimizers(controlled by apex_only flag) for both types.
     # Doesn't use upstream optimizer like other tests as they seem to be
     # numerically unstable for half types
-    @skipIfRocm
     def test_bfloat16(self):
         self.max_abs_diff = 1e-2
         self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
-    @skipIfRocm
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
@@ -196,7 +191,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
         self.ref_optim = torch.optim.Adagrad
         self.fused_optim = apex.optimizers.FusedAdagrad
-    @skipIfRocm
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)
@@ -204,7 +198,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
-    @skipIfRocm
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
@@ -213,7 +206,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
             self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
-    @skipIfRocm
     def test_multi_params(self):
         sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
         adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
...
@@ -5,7 +5,6 @@ import torch
 from torch.optim import Optimizer
 import apex
 from apex.multi_tensor_apply import multi_tensor_applier
-from apex.testing.common_utils import skipIfRocm
 from itertools import product
 class RefLAMB(Optimizer):
@@ -212,7 +211,6 @@ class TestFusedLAMB(unittest.TestCase):
         self.assertLessEqual(max_abs_diff, self.max_abs_diff)
         self.assertLessEqual(max_rel_diff, self.max_rel_diff)
-    @skipIfRocm
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)
@@ -220,7 +218,6 @@ class TestFusedLAMB(unittest.TestCase):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
-    @skipIfRocm
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
@@ -228,7 +225,6 @@ class TestFusedLAMB(unittest.TestCase):
         with torch.cuda.device(current_dev):
             self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
-    @skipIfRocm
     def test_multi_params(self):
         sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
         weight_decay = [0, 0.01]
@@ -249,7 +245,6 @@ class TestFusedLAMB(unittest.TestCase):
         self.assertLessEqual(max_abs_diff, self.max_abs_diff)
         self.assertLessEqual(max_rel_diff, self.max_rel_diff)
-    @skipIfRocm
     def test_lamb_option(self):
         nelem = 1
         tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
...
 import unittest
 import sys
-from apex.testing.common_utils import TEST_WITH_ROCM, skipIfRocm
+from apex.testing.common_utils import TEST_WITH_ROCM
 test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]
...
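For context on the decorator removed throughout the test files above: skipIfRocm marks a test to be skipped when PyTorch is built against ROCm. With the launch-bounds fix in place these tests run on ROCm, so the decorator and its imports are dropped. A minimal sketch of how such a helper is typically written (illustrative only; apex's actual apex.testing.common_utils may differ):

    import unittest
    import torch

    # True when this PyTorch build targets ROCm/HIP; the detection shown here is an assumption.
    TEST_WITH_ROCM = getattr(torch.version, "hip", None) is not None

    def skipIfRocm(fn):
        # Skip the decorated test on ROCm builds; a no-op on CUDA builds.
        return unittest.skipIf(TEST_WITH_ROCM, "test skipped on ROCm")(fn)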