"tests/vscode:/vscode.git/clone" did not exist on "88016c372a5962eb98f4dfc71243ccd64433710e"
benchmark.py 5.16 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import argparse

import torch
from benchmark_utils import benchmark
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AlbertConfig,
    AlbertForSequenceClassification,
    BertConfig,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam

# ==============================
# Prepare Hyperparameters
# ==============================
NUM_EPOCHS = 3
BATCH_SIZE = 32
LEARNING_RATE = 2.4e-5
WEIGHT_DECAY = 0.01
WARMUP_FRACTION = 0.1
SEQ_LEN = 512
VOCAB_SIZE = 1000
NUM_LABELS = 10
DATASET_LEN = 1000


class RandintDataset(Dataset):
    def __init__(self, dataset_length: int, sequence_length: int, vocab_size: int, n_class: int):
        self._sequence_length = sequence_length
        self._vocab_size = vocab_size
        self._n_class = n_class
        self._dataset_length = dataset_length
        self._datas = torch.randint(
            low=0,
            high=self._vocab_size,
43
44
45
46
            size=(
                self._dataset_length,
                self._sequence_length,
            ),
47
48
            dtype=torch.long,
        )
49
        self._labels = torch.randint(low=0, high=self._n_class, size=(self._dataset_length, 1), dtype=torch.long)
50
51
52
53
54
55
56
57
58
59
60
61
62

    def __len__(self):
        return self._dataset_length

    def __getitem__(self, idx):
        return self._datas[idx], self._labels[idx]


def main():
    # ==============================
    # Parse Arguments
    # ==============================
    parser = argparse.ArgumentParser()
63
64
65
66
67
68
69
70
71
    parser.add_argument("-t", "--task", default="mrpc", help="GLUE task to run")
    parser.add_argument(
        "-p",
        "--plugin",
        type=str,
        default="torch_ddp",
        choices=["torch_ddp", "torch_ddp_fp16", "gemini", "low_level_zero"],
        help="plugin to use",
    )
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
    parser.add_argument(
        "--model_type",
        type=str,
        default="bert",
        help="bert or albert",
    )

    args = parser.parse_args()

    # ==============================
    # Launch Distributed Environment
    # ==============================
    colossalai.launch_from_torch(config={}, seed=42)
    coordinator = DistCoordinator()

    # local_batch_size = BATCH_SIZE // coordinator.world_size
    lr = LEARNING_RATE * coordinator.world_size

    # ==============================
    # Instantiate Plugin and Booster
    # ==============================
    booster_kwargs = {}
94
95
96
    if args.plugin == "torch_ddp_fp16":
        booster_kwargs["mixed_precision"] = "fp16"
    if args.plugin.startswith("torch_ddp"):
97
        plugin = TorchDDPPlugin()
98
99
100
    elif args.plugin == "gemini":
        plugin = GeminiPlugin(placement_policy="cuda", strict_ddp_mode=True, initial_scale=2**5)
    elif args.plugin == "low_level_zero":
101
102
103
104
105
106
107
108
        plugin = LowLevelZeroPlugin(initial_scale=2**5)

    booster = Booster(plugin=plugin, **booster_kwargs)

    # ==============================
    # Prepare Dataloader
    # ==============================

109
110
111
    train_dataset = RandintDataset(
        dataset_length=DATASET_LEN, sequence_length=SEQ_LEN, vocab_size=VOCAB_SIZE, n_class=NUM_LABELS
    )
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

    # ====================================
    # Prepare model, optimizer
    # ====================================
    # bert pretrained model

    if args.model_type == "bert":
        cfg = BertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
        model = BertForSequenceClassification(cfg)
    elif args.model_type == "albert":
        cfg = AlbertConfig(vocab_size=VOCAB_SIZE, num_labels=NUM_LABELS)
        model = AlbertForSequenceClassification(cfg)
    else:
        raise RuntimeError

    # optimizer
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": WEIGHT_DECAY,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8)

    # lr scheduler
    total_steps = len(train_dataloader) * NUM_EPOCHS
    num_warmup_steps = int(WARMUP_FRACTION * total_steps)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps,
    )

    # criterion
    criterion = lambda inputs: inputs[0]

    # ==============================
    # Boost with ColossalAI
    # ==============================
    model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler)

    # ==============================
    # Benchmark model
    # ==============================

164
165
166
    results = benchmark(
        model, booster, optimizer, lr_scheduler, train_dataloader, criterion=criterion, epoch_num=NUM_EPOCHS
    )
167
168
169
170

    coordinator.print_on_master(results)


171
if __name__ == "__main__":
172
    main()