"git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "5545114fd84c8aa39b18aa0ad8816ddbc6dab360"
Unverified Commit 0f02b8c6 authored by Ziyue Jiang's avatar Ziyue Jiang Committed by GitHub
Browse files

add avg partition (#2483)


Co-authored-by: default avatarZiyue Jiang <ziyue.jiang@gmail.com>
parent 99d9713b
...@@ -9,6 +9,40 @@ def pipe_split(): ...@@ -9,6 +9,40 @@ def pipe_split():
pass pass
def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    """
    Split ``gm`` into ``pp_size`` pipeline stages balanced by forward FLOPs.

    Every node is expected to carry a ``fwd_flop`` attribute (populated by the
    meta-info propagation pass). If the first node is missing ``tensor_meta``
    in its meta dict, meta info was never propagated, so we fall back to the
    plain balanced split pass.
    """
    graph = gm.graph
    nodes = list(graph.nodes)

    # No meta info available -> fall back to the node-count balanced pass.
    if 'tensor_meta' not in nodes[0].meta:
        return balanced_split_pass(gm, pp_size)

    # FLOPs not yet assigned to a closed stage; rebalanced after each split.
    remaining_flop = sum(node.fwd_flop for node in nodes)
    target_flop = remaining_flop // pp_size
    stage_flop = 0
    for node in graph.nodes:
        if pp_size <= 1:
            # Only one stage left: everything remaining belongs to it.
            break
        if 'pipe_split' in node.name:
            # Skip split markers that are already present in the graph.
            continue
        stage_flop += node.fwd_flop
        if stage_flop >= target_flop:
            # Close the current stage and rebalance the per-stage target
            # over the stages that still have to be formed.
            remaining_flop -= stage_flop
            stage_flop = 0
            pp_size -= 1
            target_flop = remaining_flop // pp_size
            with graph.inserting_after(node):
                graph.create_node('call_function', pipe_split)
    gm.recompile()
    return gm
def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int): def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
""" """
    In avgnode_split_pass, simply split graph by node number.     In avgnode_split_pass, simply split graph by node number.
...@@ -104,8 +138,10 @@ def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int): ...@@ -104,8 +138,10 @@ def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
continue continue
accumulate_node_size += node.node_size accumulate_node_size += node.node_size
if accumulate_node_size >= partition_size: if accumulate_node_size >= partition_size:
total_element_size = total_element_size - accumulate_node_size
accumulate_node_size = 0 accumulate_node_size = 0
pp_size -= 1 pp_size -= 1
partition_size = total_element_size // pp_size
with mod_graph.inserting_after(node): with mod_graph.inserting_after(node):
split_node = mod_graph.create_node('call_function', pipe_split) split_node = mod_graph.create_node('call_function', pipe_split)
gm.recompile() gm.recompile()
......
...@@ -112,7 +112,8 @@ class MetaInfoProp(torch.fx.Interpreter): ...@@ -112,7 +112,8 @@ class MetaInfoProp(torch.fx.Interpreter):
n.meta['tensor_meta'] = tensor_meta n.meta['tensor_meta'] = tensor_meta
n.meta = {**n.meta, **asdict(meta_info)} # extend MetaInfo to `n.meta` n.meta = {**n.meta, **asdict(meta_info)} # extend MetaInfo to `n.meta`
# TODO: the attribute node_size should be removed in the future # TODO: the attribute node_size should be removed in the future
setattr(n, 'node_size', activation_size(n.meta.get('fwd_in', 0)) + activation_size(n.meta.get('fwd_tmp', 0))) setattr(n, 'node_size', activation_size(n.meta.get('fwd_out', 0)) + activation_size(n.meta.get('fwd_tmp', 0)))
setattr(n, 'fwd_flop', n.meta.get('fwd_flop', 0))
n.meta['type'] = type(result) n.meta['type'] = type(result)
# retain the autograd graph # retain the autograd graph
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment