Unverified commit 6e4ac081, authored by HELSON, committed by GitHub
Browse files

[hotfix] fix chunk size can not be divided (#2867)

* [hotfix] fix chunk size can not be divided

* [hotfix] use numpy for python3.8
parent a4fc125c
......@@ -72,6 +72,9 @@ class ChunkManager:
if tensor.numel() > chunk_size:
chunk_size = tensor.numel()
dp_size = tensor.process_group.dp_world_size()
chunk_size = chunk_size + (-chunk_size % dp_size)
chunk = Chunk(
chunk_size=chunk_size,
process_group=tensor.process_group,
......
......@@ -119,6 +119,7 @@ def search_chunk_configuration(
assert search_range_byte >= 0
params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
size_lcm = np.lcm.reduce(list(params_dict.keys()))
config_dict: Dict[int, Dict] = dict()
total_param_size = 0
......@@ -154,6 +155,8 @@ def search_chunk_configuration(
min_chunk_waste = temp_waste
best_chunk_size = chunk_size
# the chunk size must be divisible by every group's size
best_chunk_size = best_chunk_size + (-best_chunk_size % size_lcm)
for dp_degree in params_dict:
if dp_degree in config_dict:
continue
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment