Commit 79a2d204 authored by hubertlu-tw

Merge remote-tracking branch 'origin/master' into IFU-master-2021-10-15

parents 39a65c92 1e0f9bc6
@@ -160,7 +160,7 @@ class SelfMultiheadAttn(nn.Module):
             outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, self.scaling, lyr_nrm_results,
                                      input_weights, self.out_proj_weight,
                                      input_bias, self.out_proj_bias,
-                                     mask, self.dropout)
+                                     mask, self.mask_additive, self.dropout)
         if is_training:
             outputs = jit_dropout_add(outputs, query, self.dropout, is_training)
         else:
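For context (not part of the diff itself): the new self.mask_additive argument presumably tells the fused attention function whether mask should be added directly to the attention logits or treated as a boolean key-padding mask. A minimal PyTorch sketch of that distinction, with illustrative names only:

import torch

def apply_attn_mask(scores, mask, mask_additive):
    # scores: [batch * heads, seq_q, seq_k] raw attention logits
    if mask is None:
        return scores
    if mask_additive:
        # Additive mask already holds 0 for "keep" and a large negative value for "drop".
        return scores + mask
    # Boolean/byte mask: True marks key positions that must not be attended to.
    return scores.masked_fill(mask.to(torch.bool), float("-inf"))

scores = torch.randn(2, 4, 4)
pad = torch.zeros(2, 1, 4, dtype=torch.bool)  # toy example: nothing is padded
probs = torch.softmax(apply_attn_mask(scores, pad, mask_additive=False), dim=-1)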
......
import unittest
import sys

test_dirs = ["groupbn", "layer_norm", "multihead_attn", "."]  # "." for test_label_smoothing.py
ROCM_BLACKLIST = [
    "groupbn",
    "layer_norm"
]

runner = unittest.TextTestRunner(verbosity=2)
errcode = 0
for test_dir in test_dirs:
    if test_dir in ROCM_BLACKLIST:
        continue
    suite = unittest.TestLoader().discover(test_dir)
    print("\nExecuting tests from " + test_dir)
    result = runner.run(suite)
    if not result.wasSuccessful():
        errcode = 1
sys.exit(errcode)
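Usage note: the same discovery call can be pointed at a single suite directory when only one group of tests is of interest. A short sketch, equivalent to one iteration of the loop above:

import unittest

suite = unittest.TestLoader().discover("multihead_attn")
unittest.TextTestRunner(verbosity=2).run(suite)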
@@ -134,7 +134,9 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
     version_ge_1_5 = ['-DVERSION_GE_1_5']
 version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
 
-if "--distributed_adam" in sys.argv:
+if "--distributed_adam" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--distributed_adam" in sys.argv:
     sys.argv.remove("--distributed_adam")
     from torch.utils.cpp_extension import BuildExtension
@@ -154,7 +156,9 @@ if "--distributed_adam" in sys.argv:
        extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
                            'nvcc':nvcc_args_adam if not IS_ROCM_PYTORCH else hipcc_args_adam}))
 
-if "--distributed_lamb" in sys.argv:
+if "--distributed_lamb" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--distributed_lamb" in sys.argv:
     sys.argv.remove("--distributed_lamb")
     from torch.utils.cpp_extension import BuildExtension
@@ -175,7 +179,7 @@ if "--distributed_lamb" in sys.argv:
                            'nvcc': nvcc_args_distributed_lamb if not IS_ROCM_PYTORCH else hipcc_args_distributed_lamb}))
 
 if "--cuda_ext" in sys.argv:
-    sys.argv.remove("--cuda_ext")
+    from torch.utils.cpp_extension import CUDAExtension
     if torch.utils.cpp_extension.CUDA_HOME is None and not IS_ROCM_PYTORCH:
         raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
@@ -264,7 +268,9 @@ if "--cuda_ext" in sys.argv:
                            '--expt-extended-lambda'] + version_dependent_macros}))
 """
 
-if "--bnp" in sys.argv:
+if "--bnp" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--bnp" in sys.argv:
     sys.argv.remove("--bnp")
     from torch.utils.cpp_extension import BuildExtension
@@ -287,7 +293,9 @@ if "--bnp" in sys.argv:
                            '-D__CUDA_NO_HALF_CONVERSIONS__',
                            '-D__CUDA_NO_HALF2_OPERATORS__'] + version_dependent_macros}))
 
-if "--xentropy" in sys.argv:
+if "--xentropy" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--xentropy" in sys.argv:
     sys.argv.remove("--xentropy")
     from torch.utils.cpp_extension import BuildExtension
@@ -307,7 +315,9 @@ if "--xentropy" in sys.argv:
                            'nvcc':['-O3'] + version_dependent_macros}))
 
-if "--deprecated_fused_adam" in sys.argv:
+if "--deprecated_fused_adam" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--deprecated_fused_adam" in sys.argv:
     sys.argv.remove("--deprecated_fused_adam")
     from torch.utils.cpp_extension import BuildExtension
@@ -328,7 +338,9 @@ if "--deprecated_fused_adam" in sys.argv:
        extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                            'nvcc' : nvcc_args_fused_adam if not IS_ROCM_PYTORCH else hipcc_args_fused_adam}))
 
-if "--deprecated_fused_lamb" in sys.argv:
+if "--deprecated_fused_lamb" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--deprecated_fused_lamb" in sys.argv:
     sys.argv.remove("--deprecated_fused_lamb")
     from torch.utils.cpp_extension import BuildExtension
@@ -421,7 +433,9 @@ if "--fmha" in sys.argv:
        include_dirs=[os.path.join(this_dir, "apex/contrib/csrc"), os.path.join(this_dir, "apex/contrib/csrc/fmha/src")]))
 
-if "--fast_multihead_attn" in sys.argv:
+if "--fast_multihead_attn" in sys.argv or "--cuda_ext" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    if "--fast_multihead_attn" in sys.argv:
     sys.argv.remove("--fast_multihead_attn")
     from torch.utils.cpp_extension import BuildExtension
@@ -554,6 +568,8 @@ if "--fast_bottleneck" in sys.argv:
        include_dirs=[os.path.join(this_dir, 'apex/contrib/csrc/cudnn-frontend/include')],
        extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag}))
 
+if "--cuda_ext" in sys.argv:
+    sys.argv.remove("--cuda_ext")
 
 setup(
     name='apex',
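Taken together, the setup.py hunks above apply one change to every contrib extension block: each extension still builds when its own flag is passed, and --cuda_ext now implies all of them, with --cuda_ext itself consumed only once right before setup(). A condensed sketch of that pattern (the wants_ext helper is illustrative, not a function in setup.py):

import sys

def wants_ext(flag):
    requested = flag in sys.argv or "--cuda_ext" in sys.argv
    if flag in sys.argv:
        # Each per-extension flag is consumed here; --cuda_ext is only
        # removed once, just before setup() is called.
        sys.argv.remove(flag)
    return requested

# if wants_ext("--xentropy"):
#     from torch.utils.cpp_extension import CUDAExtension
#     ext_modules.append(CUDAExtension(...))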
......
 #!/bin/bash
-APEX_TEST_WITH_ROCM=1 python3.6 run_test.py
+APEX_TEST_WITH_ROCM=1 python run_test.py
@@ -6,8 +6,8 @@ export WORLD_SIZE=2
 # Test with opt_level="O2"
 echo "running opt_level O2"
-python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
-python3.6 amp_master_params/compare.py
+python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
+python amp_master_params/compare.py
 
 # delete the model files
 echo -e "O2 test completed. Deleting model files\n"
@@ -19,9 +19,9 @@ rm rank1master.pth
 # Test with opt_level="O5"
 #echo "running opt_level O5"
-#python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
-#python3.6 amp_master_params/compare.py
 #
+#python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
+#python amp_master_params/compare.py
 ## delete the model files
 #echo "O5 test completed. Deleting model files"
 #rm rank0model.pth
@@ -31,7 +31,16 @@ rm rank1master.pth
 ## Run the Sync BN Tests.
 echo "Running syncbn tests"
-python3.6 -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
+python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py
+python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py --fp16
+python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
 echo "Running syncbn python only tests"
-python3.6 synced_batchnorm/python_single_gpu_unit_test.py
+python synced_batchnorm/python_single_gpu_unit_test.py
+echo "Running syncbn batchnorm1d tests"
+python synced_batchnorm/test_batchnorm1d.py
+#beware, you need a system with at least 4 gpus to test group_size<world_size (currently fail both on upstream and rocm fork)
+#python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
 
 ## Run the DDP Tests
 echo "running DDP tests"
 HIP_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 DDP/ddp_race_condition_test.py
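For reference, the two-GPU syncbn commands above start one process per GPU. A minimal sketch of the kind of script they drive, assuming apex is installed and the file is launched via python -m torch.distributed.launch --nproc_per_node=2 (the real unit tests additionally compare the outputs against a single-process reference):

import argparse

import torch
import torch.distributed as dist
from apex.parallel import convert_syncbn_model

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--fp16", action="store_true")
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)
dist.init_process_group(backend="nccl", init_method="env://")

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8)).cuda()
model = convert_syncbn_model(model)  # swap BatchNorm2d for apex SyncBatchNorm
x = torch.randn(4, 3, 16, 16, device="cuda")
if args.fp16:
    model, x = model.half(), x.half()
out = model(x)  # batch-norm statistics are reduced across both ranks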