#!/bin/bash
# run_rocm_distributed.sh
#
# ROCm distributed test driver: runs the AMP master-params check at
# opt_level O2, the SyncBatchNorm test suite, and a DDP race-condition
# test, each across 2 GPUs via torch.distributed.launch.
#
# -e: abort on any failing test; -u: error on unset vars;
# pipefail: a failing stage in a pipeline fails the script.
set -euo pipefail

# To run the test on 2 gpus
export WORLD_SIZE=2

# Test with opt_level="O2"
echo "running opt_level O2"
python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
python amp_master_params/compare.py

# Delete the model files written by the O2 run. Intentionally NOT 'rm -f':
# a missing file means the test failed to produce its outputs, which
# should abort the script under 'set -e'.
printf 'O2 test completed. Deleting model files\n\n'
rm -- rank0model.pth rank1model.pth rank0master.pth rank1master.pth


# Test with opt_level="O5" (currently disabled)
#echo "running opt_level O5"
#python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
#python amp_master_params/compare.py

## delete the model files
#echo "O5 test completed. Deleting model files"
#rm rank0model.pth
#rm rank1model.pth
#rm rank0master.pth
#rm rank1master.pth

## Run the Sync BN Tests.
echo "Running syncbn tests"
python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py
python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py --fp16
python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex

echo "Running syncbn python only tests"
python synced_batchnorm/python_single_gpu_unit_test.py
echo "Running syncbn batchnorm1d tests"
python synced_batchnorm/test_batchnorm1d.py

## Run the DDP Tests.
# Pin to the first two GPUs (HIP_VISIBLE_DEVICES is the ROCm analogue
# of CUDA_VISIBLE_DEVICES).
echo "running DDP tests"
HIP_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 DDP/ddp_race_condition_test.py