Unverified Commit ad1bbc52 authored by shenggan, committed by GitHub

fix dap init in benchmark (#24)

parent 259b6c87
@@ -35,19 +35,6 @@ def main():
     args = parser.parse_args()
 
-    args.distributed = False
-    if 'WORLD_SIZE' in os.environ:
-        args.distributed = int(os.environ['WORLD_SIZE']) > 1
-
-    args.local_rank = int(os.environ['LOCAL_RANK'])
-
-    torch.cuda.set_device(args.local_rank)
-    torch.distributed.init_process_group(backend='nccl', init_method='env://')
-    args.world_size = torch.distributed.get_world_size()
-    args.global_rank = torch.distributed.get_rank()
-    print(
-        'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
-        % (args.global_rank, args.world_size))
 
     init_dap(args.dap_size)
 
     precision = torch.bfloat16
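With the manual torch.distributed bootstrap removed, the benchmark hands all process-group setup to init_dap. A minimal sketch of what the simplified main() reduces to, assuming a hypothetical --dap_size argparse flag and that init_dap is importable from fastfold.distributed (as the error message in the next file suggests); the rest of the benchmark body is not shown in this hunk:

import argparse

import torch
from fastfold.distributed import init_dap


def main():
    parser = argparse.ArgumentParser()
    # hypothetical flag name for illustration; the real benchmark defines its own arguments
    parser.add_argument('--dap_size', type=int, default=1)
    args = parser.parse_args()

    # init_dap now handles both single-GPU and torchrun launches, so the
    # script no longer calls torch.distributed.init_process_group itself.
    init_dap(args.dap_size)

    precision = torch.bfloat16
    # ... rest of the benchmark continues unchanged ...


if __name__ == '__main__':
    main()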
@@ -8,8 +8,10 @@ def ensure_divisibility(numerator, denominator):
     """Ensure that numerator is divisible by the denominator."""
     assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator)
 
 
-def set_distributed_environ(key, value):
-    os.environ[str(key)] = str(value)
+def set_missing_distributed_environ(key, value):
+    if key not in os.environ:
+        os.environ[str(key)] = str(value)
 
 
 def init_dap(tensor_model_parallel_size_=None):
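The renamed helper only fills in values that the launcher has not already exported, which is the heart of the fix: under torchrun the real RANK/WORLD_SIZE survive, while a plain python launch gets single-process defaults. A standalone illustration (the helper body is copied from the hunk above; the surrounding test lines are only for demonstration):

import os


def set_missing_distributed_environ(key, value):
    # Only set the variable when the launcher (e.g. torchrun) has not exported it already.
    if key not in os.environ:
        os.environ[str(key)] = str(value)


# Plain single-process launch: nothing exported yet, so the default sticks.
os.environ.pop('WORLD_SIZE', None)
set_missing_distributed_environ('WORLD_SIZE', 1)
assert os.environ['WORLD_SIZE'] == '1'

# torchrun-style launch: the launcher's value is left untouched.
os.environ['RANK'] = '3'
set_missing_distributed_environ('RANK', 0)
assert os.environ['RANK'] == '3'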
@@ -21,19 +23,18 @@ def init_dap(tensor_model_parallel_size_=None):
     else:
         tensor_model_parallel_size_ = 1
 
-    if torch.torch.distributed.is_initialized():
+    if torch.distributed.is_initialized():
         _logger = colossalai.logging.get_dist_logger()
         _logger.error(
             "use fastfold.distributed.init_dap instead of torch.distributed.init_process_group!")
         exit(-1)
 
     # set distributed environ for single device launch
-    if 'RANK' not in os.environ:
-        set_distributed_environ('WORLD_SIZE', 1)
-        set_distributed_environ('RANK', 0)
-        set_distributed_environ('LOCAL_RANK', 0)
-        set_distributed_environ('MASTER_ADDR', "localhost")
-        set_distributed_environ('MASTER_PORT', 10045)
+    set_missing_distributed_environ('WORLD_SIZE', 1)
+    set_missing_distributed_environ('RANK', 0)
+    set_missing_distributed_environ('LOCAL_RANK', 0)
+    set_missing_distributed_environ('MASTER_ADDR', "localhost")
+    set_missing_distributed_environ('MASTER_PORT', -1)
 
     colossalai.launch_from_torch(
         config={"parallel": dict(tensor=dict(size=tensor_model_parallel_size_))})