"git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "115bcc0b416cd050326cbd6e42b1cc00a8053a82"
Unverified Commit d182b0bd authored by YuliangLiu0306's avatar YuliangLiu0306 Committed by GitHub
Browse files

[hotfix] fix some bugs caused by size mismatch. (#1011)

* [CLI] add CLI launcher

* Revert "[CLI] add CLI launcher"

This reverts commit df7e6506d4500af6a9220ef7fe4d3c7b1daebd4c.

* [hotfix] fix some bugs caused by size mismatch.

* add warning logs

* polish
parent 9833d814
...@@ -23,6 +23,8 @@ def run_benchmark(args: Config) -> None: ...@@ -23,6 +23,8 @@ def run_benchmark(args: Config) -> None:
if args.gpus is None: if args.gpus is None:
click.echo("Error: --num_gpus is not given") click.echo("Error: --num_gpus is not given")
exit() exit()
if args.gpus <= 1:
click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
click.echo("=== Benchmarking Parameters ===") click.echo("=== Benchmarking Parameters ===")
for k, v in args.items(): for k, v in args.items():
...@@ -63,6 +65,13 @@ def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_ ...@@ -63,6 +65,13 @@ def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
timer = MultiTimer() timer = MultiTimer()
# 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
click.echo(
"1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
)
continue
if hyperparams.model == 'mlp': if hyperparams.model == 'mlp':
model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers) model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
else: else:
......
...@@ -48,9 +48,15 @@ def find_all_configs(device_cnt: int) -> List[Dict]: ...@@ -48,9 +48,15 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
""" """
def _is_square(num): def _is_square(num):
# 2D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(math.sqrt(num))**2 == num return math.floor(math.sqrt(num))**2 == num
def _is_cube(num): def _is_cube(num):
# 3D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(num**(1. / 3.))**3 == num return math.floor(num**(1. / 3.))**3 == num
config_list = [] config_list = []
...@@ -63,7 +69,7 @@ def find_all_configs(device_cnt: int) -> List[Dict]: ...@@ -63,7 +69,7 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d'))) config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
config_list.append(config) config_list.append(config)
# add 1D config only if device_cnt is a square # add 2D config only if device_cnt is a square
if _is_square(device_cnt): if _is_square(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d'))) config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
config_list.append(config) config_list.append(config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment