ddp.py 3.38 KB
Newer Older
ver217's avatar
ver217 committed
1
2
3
4
5
6
7
import os
import random

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
Fazzie-Maqianli's avatar
Fazzie-Maqianli committed
8
from chatgpt.models.base import Actor
BlueRum's avatar
BlueRum committed
9
from chatgpt.models.lora import LoraLinear
ver217's avatar
ver217 committed
10
11
from chatgpt.replay_buffer import ReplayBuffer
from torch.nn.parallel import DistributedDataParallel as DDP
12
from torch.optim import Optimizer
13
from torch.utils.data import DataLoader
ver217's avatar
ver217 committed
14

15
from .base import Strategy
ver217's avatar
ver217 committed
16
from .naive import NaiveStrategy
17
from .sampler import DistributedSampler
ver217's avatar
ver217 committed
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53


class DDPStrategy(NaiveStrategy):
    """
        Strategy for distributed training using torch.distributed.
    """

    def __init__(self, seed: int = 42) -> None:
        """Create the strategy and remember the random seed.

        Args:
            seed: Seed that ``setup_distributed`` later hands to ``set_seed``.
        """
        # The seed is stored before the base initializer runs; presumably the
        # base class triggers ``setup_distributed`` (which reads ``self.seed``)
        # during its own ``__init__`` -- TODO confirm against the base class.
        self.seed = seed
        super().__init__()

    def setup_distributed(self) -> None:
        """Join the NCCL process group described by the environment.

        Reads ``RANK``, ``LOCAL_RANK``, ``WORLD_SIZE``, ``MASTER_ADDR`` and
        ``MASTER_PORT`` from ``os.environ``, initialises the ``nccl`` process
        group over TCP, seeds the random number generators with ``self.seed``
        and selects the CUDA device numbered ``LOCAL_RANK``.

        Raises:
            RuntimeError: If any of the required environment variables is
                missing. The original ``KeyError`` is attached as the cause.
        """
        try:
            rank = int(os.environ['RANK'])
            local_rank = int(os.environ['LOCAL_RANK'])
            world_size = int(os.environ['WORLD_SIZE'])
            host = os.environ['MASTER_ADDR']
            port = int(os.environ['MASTER_PORT'])
        except KeyError as e:
            # Chain explicitly so the traceback reports the missing variable
            # as the direct cause rather than as an unrelated second error.
            raise RuntimeError(
                f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
            ) from e
        dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
        self.set_seed(self.seed)
        torch.cuda.set_device(local_rank)

    def set_seed(self, seed: int) -> None:
        """Seed Python's ``random``, NumPy and PyTorch with the same value.

        Args:
            seed: Integer handed unchanged to each generator's seeding function.
        """
        # Same three generators, same order: stdlib, then NumPy, then torch.
        for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
            seed_fn(seed)

    def setup_model(self, model: nn.Module) -> nn.Module:
        """Wrap ``model`` in ``DistributedDataParallel``.

        Args:
            model: Module to wrap.

        Returns:
            The DDP wrapper, with ``device_ids`` set to the single CUDA device
            that is current for this process.
        """
        return DDP(model, device_ids=[torch.cuda.current_device()])

    def setup_dataloader(self, replay_buffer: ReplayBuffer, pin_memory: bool = False) -> DataLoader:
        """Build the ``DataLoader`` that iterates over ``replay_buffer``.

        No distributed sampler is attached: in DDP-only mode the replay buffers
        on each rank are different, so the loader just shuffles the buffer it
        is given.

        Args:
            replay_buffer: Buffer to draw batches from. Its
                ``sample_batch_size`` is used as the batch size and its
                ``collate_fn`` assembles each batch.
            pin_memory: Forwarded to ``DataLoader``.

        Returns:
            A shuffling ``DataLoader`` that drops the last incomplete batch.
        """
        return DataLoader(
            replay_buffer,
            batch_size=replay_buffer.sample_batch_size,
            shuffle=True,
            drop_last=True,
            pin_memory=pin_memory,
            collate_fn=replay_buffer.collate_fn,
        )
69
70
71
72
73
74
75

    @staticmethod
    def _unwrap_actor(actor: Actor) -> nn.Module:
        """Return the module held inside the actor's DDP wrapper.

        The base ``Strategy._unwrap_actor`` result is treated as a DDP
        container, and its ``.module`` attribute is what gets returned.
        """
        return Strategy._unwrap_actor(actor).module

    def save_model(self, model: nn.Module, path: str, only_rank0: bool = False) -> None:
        """Write the state dict of the wrapped network to ``path``.

        Args:
            model: Object whose ``.model.module`` is the network to save
                (presumably an actor whose ``model`` is DDP-wrapped -- TODO
                confirm against callers).
            path: Destination passed to ``torch.save``.
            only_rank0: When true, every rank other than 0 returns without
                writing anything.
        """
        # This runs on every rank, before the rank check below, so all ranks
        # apply the same change to their LoRA layers. Presumably eval mode with
        # ``merge_weights`` set folds the LoRA update into the base weights --
        # verify against LoraLinear.
        for module in model.modules():
            if isinstance(module, LoraLinear):
                module.merge_weights = True
                module.eval()

        if only_rank0 and dist.get_rank() != 0:
            return

        # Save the inner module so the checkpoint is taken from the unwrapped
        # network rather than from the DDP container.
        unwrapped = model.model.module
        torch.save(unwrapped.state_dict(), path)
        
87
88
89
90
    def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False) -> None:
        """Save the optimizer through the parent strategy, optionally on rank 0 only.

        Args:
            optimizer: Optimizer forwarded to the parent implementation.
            path: Destination forwarded to the parent implementation.
            only_rank0: When true, ranks other than 0 do nothing.
        """
        # The rank is only queried when only_rank0 is set (short-circuit).
        skip_this_rank = only_rank0 and dist.get_rank() != 0
        if not skip_this_rank:
            super().save_optimizer(optimizer, path, only_rank0)
91
92
93

    def setup_sampler(self, dataset) -> DistributedSampler:
        """Create a ``DistributedSampler`` over ``dataset`` for this process.

        The current world size and this process's rank, both read from
        ``torch.distributed``, are passed positionally after the dataset.
        """
        world_size = dist.get_world_size()
        this_rank = dist.get_rank()
        return DistributedSampler(dataset, world_size, this_rank)