"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "05bbca5b07ef9149e57e6048b489aa313113132e"
Unverified Commit ad1bbc52 authored by shenggan's avatar shenggan Committed by GitHub
Browse files

fix dap init in benchmark (#24)

parent 259b6c87
...@@ -35,19 +35,6 @@ def main(): ...@@ -35,19 +35,6 @@ def main():
args = parser.parse_args() args = parser.parse_args()
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
args.local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.world_size = torch.distributed.get_world_size()
args.global_rank = torch.distributed.get_rank()
print(
'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
% (args.global_rank, args.world_size))
init_dap(args.dap_size) init_dap(args.dap_size)
precision = torch.bfloat16 precision = torch.bfloat16
......
...@@ -8,7 +8,9 @@ def ensure_divisibility(numerator, denominator): ...@@ -8,7 +8,9 @@ def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator.""" """Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator) assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator)
def set_distributed_environ(key, value):
def set_missing_distributed_environ(key, value):
if key not in os.environ:
os.environ[str(key)] = str(value) os.environ[str(key)] = str(value)
...@@ -21,19 +23,18 @@ def init_dap(tensor_model_parallel_size_=None): ...@@ -21,19 +23,18 @@ def init_dap(tensor_model_parallel_size_=None):
else: else:
tensor_model_parallel_size_ = 1 tensor_model_parallel_size_ = 1
if torch.torch.distributed.is_initialized(): if torch.distributed.is_initialized():
_logger = colossalai.logging.get_dist_logger() _logger = colossalai.logging.get_dist_logger()
_logger.error( _logger.error(
"use fastfold.distributed.init_dap instead of torch.distributed.init_process_group!") "use fastfold.distributed.init_dap instead of torch.distributed.init_process_group!")
exit(-1) exit(-1)
# set distributed environ for single device launch # set distributed environ for single device launch
if 'RANK' not in os.environ: set_missing_distributed_environ('WORLD_SIZE', 1)
set_distributed_environ('WORLD_SIZE', 1) set_missing_distributed_environ('RANK', 0)
set_distributed_environ('RANK', 0) set_missing_distributed_environ('LOCAL_RANK', 0)
set_distributed_environ('LOCAL_RANK', 0) set_missing_distributed_environ('MASTER_ADDR', "localhost")
set_distributed_environ('MASTER_ADDR', "localhost") set_missing_distributed_environ('MASTER_PORT', -1)
set_distributed_environ('MASTER_PORT', 10045)
colossalai.launch_from_torch( colossalai.launch_from_torch(
config={"parallel": dict(tensor=dict(size=tensor_model_parallel_size_))}) config={"parallel": dict(tensor=dict(size=tensor_model_parallel_size_))})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment