Unverified Commit edc9e419 authored by Kirigaya Kazuto, committed by GitHub


[pipeline/chimera] reconstruct PipelineBase and Worker to support more feasible custom schedule | finish Chimera (#1595)

* [pipeline/tuning] improve dispatch performance both time and space cost

* [pipeline/converge] add interface for testing convergence

* [NFC] polish colossalai/utils/multi_tensor_apply/multi_tensor_apply.py code style

* Update PipelineBase.py

* [pipeline/chimera] reconstruct PipelineBase and Worker to support more feasible custom schedule | finish Chimera
parent c9e8ce67
from typing import List, Callable, Dict

import torch.nn as nn
from torch.futures import Future
from torch._C._distributed_rpc import PyRRef

from colossalai.pipeline.rpc._pipeline_base import PipelineEngineBase, WorkerBase, UniqueKey, Phase

# Implementations of different pipeline schedules.
# <strategy>Worker defines the worker behaviour for each stage.
# <strategy>PipelineEngine is the class for users to instantiate.
class FillDrainWorker(WorkerBase):

    def _get_work_item_key(self) -> UniqueKey:
        # fill-drain (GPipe-style) schedule: run all forwards first, then all backwards
        num_microbatches = self.num_microbatches

        if self.forward_times < num_microbatches:
            target_phase = Phase.FORWARD
            target_microbatch_id = self.forward_times
        else:
            target_phase = Phase.BACKWARD
            target_microbatch_id = self.backward_times

        target_key = UniqueKey(target_microbatch_id, target_phase)

        with self.work_list_condition_lock:
            self.work_list_condition_lock.wait_for(lambda: target_key in self.work_list)

        return target_key
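
# A minimal sketch (illustration only, not used by the engine) of the order in
# which FillDrainWorker hands out work items, mirroring the branch above:
def _fill_drain_order(num_microbatches: int):
    """Return the (phase, microbatch_id) sequence a fill-drain stage executes."""
    forwards = [(Phase.FORWARD, i) for i in range(num_microbatches)]
    backwards = [(Phase.BACKWARD, i) for i in range(num_microbatches)]
    return forwards + backwards

# e.g. _fill_drain_order(2) == [(Phase.FORWARD, 0), (Phase.FORWARD, 1),
#                               (Phase.BACKWARD, 0), (Phase.BACKWARD, 1)]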
class FillDrainPipelineEngine(PipelineEngineBase):

    def __init__(self,
                 module_partitions: List[nn.Module],
                 stage_num: int,
                 num_microbatches: int,
                 device: str,
                 chunk: int = 1,
                 criterion: Callable = None,
                 metric: Callable = None,
                 checkpoint: bool = False) -> None:

        if chunk > 1:
            assert num_microbatches % stage_num == 0, \
                "if you use the interleaving strategy, make sure 'num_microbatches' is a multiple of stage_num!"
        use_1F1B = False

        super().__init__(FillDrainWorker, module_partitions, stage_num, num_microbatches, device, use_1F1B, chunk,
                         criterion, metric, checkpoint)
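
# A minimal usage sketch (parameter values are illustrative; the constructor
# signature is taken from above and forward_backward from the tests further below):
#
#   engine = FillDrainPipelineEngine(module_partitions=partitions,
#                                    stage_num=4,
#                                    num_microbatches=4,
#                                    device='cuda',
#                                    criterion=nn.CrossEntropyLoss())
#   forward_result = engine.forward_backward(batch, labels=labels, forward_only=False)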
class OneFOneBWorker(WorkerBase):

    def _get_work_item_key(self) -> UniqueKey:
        # execute a backward first once the outstanding-forward window is full
        pp_rank = self.pp_rank
        actual_stage_num = self.actual_stage_num
        num_microbatches = self.num_microbatches
        is_last_stage = pp_rank == actual_stage_num - 1

        if self.outstanding <= self.outstanding_range[0]:
            target_phase = Phase.FORWARD
            target_microbatch_id = self.forward_times
        elif self.outstanding >= self.outstanding_range[1]:
            target_phase = Phase.BACKWARD
            target_microbatch_id = self.backward_times
        else:
            raise ValueError("outstanding_range[1] - outstanding_range[0] must be in [0, 1]")

        target_key = UniqueKey(target_microbatch_id, target_phase)

        # change outstanding_range when:
        # 1. forward times reach actual_stage_num, i.e. the end of the continuous (warm-up) forwards
        # 2. forward times reach num_microbatches, i.e. the end of 1F1B mode
        if not is_last_stage and \
                target_key.phase == Phase.FORWARD:
            if target_key.microbatch_id == actual_stage_num - 1:
                outstanding_min = actual_stage_num - pp_rank - 1
                outstanding_max = actual_stage_num - pp_rank
                self.outstanding_range = (outstanding_min, outstanding_max)
            elif target_key.microbatch_id == num_microbatches - 1:
                self.outstanding_range = (0, 0)

        with self.work_list_condition_lock:
            self.work_list_condition_lock.wait_for(lambda: target_key in self.work_list)

        return target_key
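
    # Illustration (assuming actual_stage_num = 4 and num_microbatches = 8; the
    # initial outstanding_range is set by WorkerBase and not shown in this file):
    # after rank 0 schedules its forward for microbatch 3, its window becomes
    # (3, 4), so it launches a backward once 4 forwards are outstanding and a
    # new forward once only 3 remain, i.e. strict one-forward-one-backward
    # alternation; after the last forward (microbatch 7) the window becomes
    # (0, 0) and only the remaining backwards are drained.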
class OneFOneBPipelineEngine(PipelineEngineBase):

    def __init__(self,
                 module_partitions: List[nn.Module],
                 stage_num: int,
                 num_microbatches: int,
                 device: str,
                 chunk: int = 1,
                 criterion: Callable = None,
                 metric: Callable = None,
                 checkpoint: bool = False) -> None:

        if chunk > 1:
            assert num_microbatches % stage_num == 0, \
                "if you use the interleaving strategy, make sure 'num_microbatches' is a multiple of stage_num!"
        use_1F1B = True

        super().__init__(OneFOneBWorker, module_partitions, stage_num, num_microbatches, device, use_1F1B, chunk,
                         criterion, metric, checkpoint)
class ChimeraWorker(WorkerBase):

    def _get_producer_consumer(self) -> None:
        rank = self.pp_rank
        min_pp_rank = (rank // self.actual_stage_num) * self.actual_stage_num
        max_pp_rank = min_pp_rank + self.actual_stage_num - 1

        assert self.producer_stage_ids is None, f"all the producers of rank {rank} have already been subscribed"
        assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} have already been subscribed"

        # should be arranged in order, i.e. the order of the inputs of the current forward
        self.producer_stage_ids = []
        self.consumer_stage_ids = []

        # Just for demo
        prev_rank = rank - 1
        next_rank = rank + 1
        if prev_rank >= min_pp_rank:
            self.producer_stage_ids.append(prev_rank)
        if next_rank <= max_pp_rank:
            self.consumer_stage_ids.append(next_rank)

    def _get_work_item_key(self) -> UniqueKey:
        pp_rank = self.pp_rank
        stage_num = self.actual_stage_num
        real_microbatch_num = self.num_microbatches // 2

        if self.forward_times < real_microbatch_num:
            if (pp_rank + 1) % stage_num == 0:    # last rank of its pipeline
                forward_blocks = self.forward_times // (self.num_microbatches // stage_num)
                if forward_blocks > self.backward_times:
                    target_phase = Phase.BACKWARD
                    target_microbatch_id = self.backward_times
                else:
                    target_phase = Phase.FORWARD
                    target_microbatch_id = self.forward_times
            else:    # other ranks
                target_phase = Phase.FORWARD
                target_microbatch_id = self.forward_times
        else:
            target_phase = Phase.BACKWARD
            target_microbatch_id = self.backward_times

        # in the up pipeline, the microbatch ids to consume are 0, 2, 4, ... (2n)
        # in the down pipeline, the microbatch ids to consume are 1, 3, 5, ... (2n + 1)
        real_target_microbatch_id = target_microbatch_id * 2
        if pp_rank >= stage_num:    # down pipeline
            real_target_microbatch_id += 1

        target_key = UniqueKey(real_target_microbatch_id, target_phase)

        with self.work_list_condition_lock:
            self.work_list_condition_lock.wait_for(lambda: target_key in self.work_list)

        return target_key

    def is_first_stage(self):
        return (self.pp_rank % self.actual_stage_num) == 0

    def is_last_stage(self):
        return (self.pp_rank % self.actual_stage_num) == self.actual_stage_num - 1
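
    # Illustration (num_microbatches = 4, stage_num = 4): each pipeline sees
    # real_microbatch_num = 2 local microbatches; the up pipeline maps its local
    # ids 0, 1 to global microbatches 0, 2 while the down pipeline maps them to
    # 1, 3. On the last rank of a pipeline, forward_blocks interleaves one block
    # of forwards with one backward, so the tail stage starts draining early.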
class ChimeraPipelineEngine(PipelineEngineBase):

    def __init__(self,
                 module_partitions: List[nn.Module],
                 stage_num: int,
                 num_microbatches: int,
                 device: str,
                 criterion: Callable = None,
                 metric: Callable = None,
                 checkpoint: bool = False) -> None:

        assert num_microbatches % stage_num == 0, \
            "In Chimera, num_microbatches must be a multiple of stage_num!"
        use_1F1B = False
        chunk = 1

        super().__init__(ChimeraWorker, module_partitions, stage_num, num_microbatches, device, use_1F1B, chunk,
                         criterion, metric, checkpoint)
    def _consume_constraint(self, microbatch_id: int, forward_only: bool, ret_future: Dict[PyRRef, List[Future]],
                            input_worker_rrefs: List[PyRRef], output_worker_rrefs: List[PyRRef]):
        pass

    def _create_pp_rank_to_rpc_worker_id(self) -> None:
        stage_num = self.stage_num
        self.pp_rank_to_rpc_worker_id = [0] * (stage_num * 2)
        for pp_rank in range(stage_num):
            self.pp_rank_to_rpc_worker_id[pp_rank] = pp_rank
            self.pp_rank_to_rpc_worker_id[pp_rank + stage_num] = stage_num - pp_rank - 1

    def _create_pp_rank_to_module_partition_id(self) -> None:
        stage_num = self.stage_num
        self.pp_rank_to_module_partition_id = [0] * (stage_num * 2)
        for pp_rank in range(stage_num):
            self.pp_rank_to_module_partition_id[pp_rank] = pp_rank
            self.pp_rank_to_module_partition_id[pp_rank + stage_num] = pp_rank
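
    # Worked example (stage_num = 4):
    #   pp_rank_to_rpc_worker_id       -> [0, 1, 2, 3, 3, 2, 1, 0]
    #   pp_rank_to_module_partition_id -> [0, 1, 2, 3, 0, 1, 2, 3]
    # The up pipeline places stage i on device i and the down pipeline places
    # its stage i on device stage_num - i - 1, so each device hosts one up stage
    # and one down stage (Chimera's bidirectional placement), and both pipelines
    # index the same module partitions.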
    def _create_ret_future(self, output_pp_ranks: List[int]) -> Dict[int, List[Future]]:
        num_microbatches = self.num_microbatches
        stage_num = self.stage_num
        up_ret_future = {pp_rank: [None] * num_microbatches for pp_rank in output_pp_ranks}
        down_ret_future = {pp_rank + stage_num: [None] * num_microbatches for pp_rank in output_pp_ranks}
        # merge up and down
        return {**up_ret_future, **down_ret_future}
    def _set_input(self, input_pp_ranks: List[int], microbatch_id: int, microbatch, forward_only: bool):
        # offset is 0 for all the ranks in the up pipeline
        # offset is stage_num for all the ranks in the down pipeline
        offset = (microbatch_id % 2) * self.stage_num
        for pp_rank in input_pp_ranks:
            worker_rref = self.pp_rank_to_worker_rref[pp_rank + offset]
            worker_rref.remote().set_input(microbatch_id, microbatch, forward_only)

    def _set_labels(self, output_pp_ranks: List[int], microbatch_id: int, microlabels):
        # offset is 0 for all the ranks in the up pipeline
        # offset is stage_num for all the ranks in the down pipeline
        offset = (microbatch_id % 2) * self.stage_num
        for pp_rank in output_pp_ranks:
            worker_rref = self.pp_rank_to_worker_rref[pp_rank + offset]
            worker_rref.remote().set_labels(microbatch_id, microlabels)
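
    # Example (stage_num = 4): microbatch 0 enters the up pipeline's first stage
    # (pp_rank 0), while microbatch 1 enters the down pipeline's first stage
    # (pp_rank 4, mapped by _create_pp_rank_to_rpc_worker_id to the last rpc
    # worker), so even and odd microbatches flow through the model from opposite
    # ends of the device list.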
    def _subscribe_forward(self, microbatch_id: int, output_pp_ranks: List[int], ret_future: Dict[int, List[Future]]):
        key = UniqueKey(microbatch_id, Phase.FORWARD)
        offset = (microbatch_id % 2) * self.stage_num
        for pp_rank in output_pp_ranks:
            worker_rref = self.pp_rank_to_worker_rref[pp_rank + offset]
            ret_future[pp_rank + offset][microbatch_id] = worker_rref.rpc_async().get_output_by_key(key)
    def _ensure_backward(self, forward_only: bool, input_pp_ranks: List[int]):
        stage_num = self.stage_num
        num_microbatches = self.num_microbatches

        if not forward_only:
            for pp_rank in input_pp_ranks:
                up_last_microbatch_id = num_microbatches - 2
                down_last_microbatch_id = num_microbatches - 1

                up_worker_rref = self.pp_rank_to_worker_rref[pp_rank]
                down_worker_rref = self.pp_rank_to_worker_rref[pp_rank + stage_num]

                up_key = UniqueKey(up_last_microbatch_id, Phase.BACKWARD)
                down_key = UniqueKey(down_last_microbatch_id, Phase.BACKWARD)
                up_worker_rref.rpc_sync().get_output_by_key(up_key)
                down_worker_rref.rpc_sync().get_output_by_key(down_key)
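
    # With num_microbatches = 4, the up pipeline's last (even) microbatch is 2
    # and the down pipeline's last (odd) one is 3; blocking on their BACKWARD
    # keys ensures every backward pass has finished before the step returns.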
    def _collect_forward_result(self, output_pp_ranks: List[int], ret_future: Dict[PyRRef, List[Future]]):
        """Collect forward results in Chimera.

        Currently, only models with a single input and a single output are supported.
        """
        stage_num = self.stage_num
        forward_result = []

        for pp_rank in output_pp_ranks:
            worker_forward_result = [None] * self.num_microbatches
            for microbatch_id in range(self.num_microbatches):
                # even microbatches come from the up pipeline, odd ones from the down pipeline
                offset = (microbatch_id % 2) * stage_num
                ret = ret_future[pp_rank + offset][microbatch_id].wait()
                worker_forward_result[microbatch_id] = ret

            worker_forward_result = list(zip(*worker_forward_result))
            forward_result.extend(worker_forward_result)

        return forward_result
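
    # Example: with two microbatches whose outputs are (out_0,) and (out_1,),
    # zip(*worker_forward_result) regroups them as [(out_0, out_1)], i.e. one
    # tuple per model output across all microbatches, matching the single-output
    # assumption stated in the docstring.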
import torch
from torch import nn

from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine, ChimeraPipelineEngine
from rpc_test_utils import rpc_run, parse_args, RpcTestModel


def run_master(args):
    torch.manual_seed(100)

    epoch = args.epoch
    device = args.device
    stage_num = 4
    chunk = 1
    num_microbatches = 4
    actual_stage_num = 4
    use_checkpoint = False

    sample_num = 1024
    feat_num = 10
    h = 10
    batch_size = 1024

    assert sample_num % batch_size == 0

    module_partitions = [RpcTestModel(pp_rank, actual_stage_num, feat_num, h) for pp_rank in range(actual_stage_num)]

    engine = ChimeraPipelineEngine(module_partitions=module_partitions,
                                   stage_num=stage_num,
                                   num_microbatches=num_microbatches,
                                   device=device,
                                   checkpoint=use_checkpoint)

    input_sample = torch.randn((sample_num, feat_num), device=device)

    for _ in range(epoch):
        _ = engine.forward_backward(input_sample, forward_only=False)


if __name__ == "__main__":
    args = parse_args()
    args.world_size = 4
    args.num_microbatches = 4
    rpc_run(args, run_master)
@@ -3,7 +3,7 @@ from torch import nn
 from torch import autograd
 from torch.optim import SGD, Adam, RMSprop, Optimizer
-from colossalai.pipeline.rpc.PipelineBase import FillDrainPipelineEngine, OneFOneBPipelineEngine
+from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
 from colossalai.testing import assert_close
 from rpc_test_utils import rpc_run, parse_args, RpcTestModel
......
import os
from typing import Callable, List, Optional, Type, Union
import time

import pytest
import torch
import torch.nn as nn
from titans.dataloader.cifar10 import build_cifar
from torchvision.models import resnet50
from torchvision.models.resnet import BasicBlock, Bottleneck, conv1x1
from tqdm import tqdm
from rpc_test_utils import rpc_run, parse_args

import colossalai
import colossalai.nn as col_nn
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.context import ParallelMode
from colossalai.pipeline.pipelinable import PipelinableContext, PipelinableModel
from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine


def flatten(x):
    return torch.flatten(x, 1)


class Flatten(nn.Module):

    def forward(self, x):
        return torch.flatten(x, start_dim=1)


def run_master(args):
    batch_size = args.batch_size
    chunk = args.chunk
    device = args.device
    world_size = args.world_size
    stage_num = world_size
    num_microbatches = args.num_microbatches

    assert chunk == 1

    pipelinable = PipelinableContext()

    # build model partitions
    with pipelinable:
        # input: [B, 3, 32, 32]
        model = resnet50()

    exec_seq = [
        'conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool', (flatten, "behind"),
        'fc'
    ]
    pipelinable.to_layer_list(exec_seq)
    module_partitions: List[PipelinableModel] = [
        pipelinable.partition(chunk, stage_num, pp_rank) for pp_rank in range(world_size)
    ]

    # build dataloader
    root = os.environ.get('DATA', './data')
    train_dataloader, test_dataloader = build_cifar(batch_size, root, padding=4, crop=32, resize=32)
    criterion = nn.CrossEntropyLoss()

    partition_1 = module_partitions[0]
    partition_2 = []
    for model in module_partitions[1]._module_list:
        partition_2.append(model)
    partition_2.insert(len(partition_2) - 1, Flatten())
    partition_2 = nn.Sequential(*partition_2)
    module_partitions = [partition_1, partition_2]

    pp_engine = OneFOneBPipelineEngine(module_partitions=module_partitions,
                                       stage_num=stage_num,
                                       num_microbatches=num_microbatches,
                                       device=device,
                                       chunk=chunk,
                                       criterion=criterion,
                                       checkpoint=False)

    pp_engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)

    s = time.time()
    for bx, by in tqdm(train_dataloader):
        pp_engine.forward_backward(bx, labels=by, forward_only=False)
    cost_time = time.time() - s

    print("total cost time:", cost_time)
    print("cost time per batch:", cost_time / len(train_dataloader))
@pytest.mark.skip("Test for performance, no need for CI")
def main():
    args = parse_args()
    # world_size is fixed to 2 due to a limitation of the partition function
    args.world_size = 2
    args.chunk = 1
    rpc_run(args, run_master)


if __name__ == '__main__':
    main()
 import torch
 from torch import nn
-from colossalai.pipeline.rpc.PipelineBase import FillDrainPipelineEngine, OneFOneBPipelineEngine
+from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
 from rpc_test_utils import rpc_run, parse_args, RpcTestModel
......
@@ -2,7 +2,7 @@ import torch
 from torch import nn
 from torch import autograd
-from colossalai.pipeline.rpc.PipelineBase import FillDrainPipelineEngine, OneFOneBPipelineEngine
+from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine
 from colossalai.testing import assert_close
 from rpc_test_utils import rpc_run, parse_args, RpcTestModel
@@ -36,7 +36,7 @@ def run_master(args):
                               chunk=chunk,
                               checkpoint=use_checkpoint)
-    forward_result = engine.forward_backward(input_sample)
+    forward_result = engine.forward_backward(input_sample)[0]
     cuda_rpc_result = []
     single_result = []
......