"git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "115bcc0b416cd050326cbd6e42b1cc00a8053a82"
Unverified Commit d182b0bd authored by YuliangLiu0306's avatar YuliangLiu0306 Committed by GitHub
Browse files

[hotfix] fix some bugs caused by size mismatch. (#1011)

* [CLI] add CLI launcher

* Revert "[CLI] add CLI launcher"

This reverts commit df7e6506d4500af6a9220ef7fe4d3c7b1daebd4c.

* [hotfix] fix some bugs caused by size mismatch.

* add warning logs

* polish
parent 9833d814
...@@ -23,6 +23,8 @@ def run_benchmark(args: Config) -> None: ...@@ -23,6 +23,8 @@ def run_benchmark(args: Config) -> None:
if args.gpus is None: if args.gpus is None:
click.echo("Error: --num_gpus is not given") click.echo("Error: --num_gpus is not given")
exit() exit()
if args.gpus <= 1:
click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
click.echo("=== Benchmarking Parameters ===") click.echo("=== Benchmarking Parameters ===")
for k, v in args.items(): for k, v in args.items():
...@@ -63,6 +65,13 @@ def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_ ...@@ -63,6 +65,13 @@ def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
timer = MultiTimer() timer = MultiTimer()
# 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
click.echo(
"1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
)
continue
if hyperparams.model == 'mlp': if hyperparams.model == 'mlp':
model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers) model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
else: else:
......
...@@ -48,9 +48,15 @@ def find_all_configs(device_cnt: int) -> List[Dict]: ...@@ -48,9 +48,15 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
""" """
def _is_square(num): def _is_square(num):
# 2D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(math.sqrt(num))**2 == num return math.floor(math.sqrt(num))**2 == num
def _is_cube(num): def _is_cube(num):
# 3D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(num**(1. / 3.))**3 == num return math.floor(num**(1. / 3.))**3 == num
config_list = [] config_list = []
...@@ -63,7 +69,7 @@ def find_all_configs(device_cnt: int) -> List[Dict]: ...@@ -63,7 +69,7 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d'))) config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
config_list.append(config) config_list.append(config)
# add 1D config only if device_cnt is a square # add 2D config only if device_cnt is a square
if _is_square(device_cnt): if _is_square(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d'))) config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
config_list.append(config) config_list.append(config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment