# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Megatron initialization."""
import logging
import random
import os
import time

import numpy as np
import torch
from datetime import timedelta

from megatron.legacy import fused_kernels
from megatron.training import get_adlr_autoresume
from megatron.training import get_args
from megatron.training import get_tensorboard_writer
from megatron.core import mpu, tensor_parallel
from megatron.training.arguments import parse_args, validate_args
from megatron.training.yaml_arguments import validate_yaml
from megatron.training.checkpointing import load_args_from_checkpoint
from megatron.training.global_vars import set_global_variables
from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
from megatron.core.fusions.fused_bias_gelu import bias_gelu
from megatron.core.fusions.fused_bias_swiglu import bias_swiglu

logger = logging.getLogger(__name__)


def initialize_megatron(
    extra_args_provider=None,
    args_defaults={},
    ignore_unknown_args=False,
    allow_no_cuda=False,
    skip_mpu_initialization=False,
    get_embedding_ranks=None,
    get_position_embedding_ranks=None
):
    """Set global variables, initialize distributed, and
Raul Puri's avatar
Raul Puri committed
39
    set autoresume and random seeds.
liangjing's avatar
v1  
liangjing committed
40
41
    `allow_no_cuda` should not be set unless using megatron for cpu only
    data processing. In general this arg should not be set unless you know
42
    what you are doing.
liangjing's avatar
v1  
liangjing committed
43
    Returns a function to finalize distributed env initialization
Boris Fomitchev's avatar
Boris Fomitchev committed
44
    (optionally, only when args.lazy_mpu_init == True)
45
    """
    if not allow_no_cuda:
        # Make sure cuda is available.
        assert torch.cuda.is_available(), "Megatron requires CUDA."

    # Parse arguments
    args = parse_args(extra_args_provider, ignore_unknown_args)

    # Prep for checkpoint conversion.
    if args.ckpt_convert_format is not None:
        assert args.ckpt_convert_save is not None
        assert args.load is not None
        args.exit_on_missing_checkpoint = True

    if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
        assert args.load is not None, "--use-checkpoint-args requires --load argument"
        load_args_from_checkpoint(args)

    if args.yaml_cfg is not None:
        args = validate_yaml(args, args_defaults)
    else:
        validate_args(args, args_defaults)

    # set global args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(args)

    # set logging level
    setup_logging()

    # torch.distributed initialization
    def finish_mpu_init():
        args = get_args()
        # Pytorch distributed.
        _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks)

        # Random seeds for reproducibility.
        if args.rank == 0:
            print("> setting random seeds to {} ...".format(args.seed))
        _set_random_seed(args.seed, args.data_parallel_random_init)

    if skip_mpu_initialization:
        return None

    args = get_args()
    if args.lazy_mpu_init:
        # TODO is this still a necessary option?
        args.use_cpu_initialization = True
        # delayed initialization of DDP-related stuff
        # We only set basic DDP globals
        mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size)
        # and return function for external DDP manager
        # to call when it has DDP initialized
        mpu.set_tensor_model_parallel_rank(args.rank)
        return finish_mpu_init
    else:
        # Megatron's MPU is the master. Complete initialization right away.
        finish_mpu_init()

        # Autoresume.
        _init_autoresume()

        # Compile dependencies.
        _compile_dependencies()

        if args.tp_comm_overlap:
            _initialize_tp_communicators()

        # No continuation function
        return None


def _compile_dependencies():

    args = get_args()

    # =========================
    # Compile dataset C++ code.
    # =========================
    # TODO: move this to ninja
    if torch.distributed.get_rank() == 0:
        start_time = time.time()
        print("> compiling dataset index builder ...")
        from megatron.core.datasets.utils import compile_helpers

        compile_helpers()
        print(
            ">>> done with dataset index builder. Compilation time: {:.3f} "
            "seconds".format(time.time() - start_time),
            flush=True,
        )

    # ==================
    # Load fused kernels
    # ==================

    # Custom kernel constraints check.
    seq_len = args.seq_length
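    # attn_batch_size below is the number of attention heads per tensor-parallel
    # rank multiplied by the micro batch size.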
    attn_batch_size = (
        args.num_attention_heads / args.tensor_model_parallel_size
    ) * args.micro_batch_size
    # Constraints on sequence length and attn_batch_size to enable warp based
    # optimization and upper triangular optimization (for causal mask)
    custom_kernel_constraint = (
        seq_len > 16
        and seq_len <= 16384
        and seq_len % 4 == 0
        and attn_batch_size % 4 == 0
    )
    # Print a warning.
    if not (
        (args.fp16 or args.bf16)
        and custom_kernel_constraint
        and args.masked_softmax_fusion
    ):
        if args.rank == 0:
            print(
                "WARNING: constraints for invoking optimized"
                " fused softmax kernel are not met. We default"
                " back to unfused kernel invocations.",
                flush=True,
            )

    # Always build on rank zero first.
    if torch.distributed.get_rank() == 0:
        start_time = time.time()
        print("> compiling and loading fused kernels ...", flush=True)
        # fused_kernels.load(args)
        torch.distributed.barrier()
    else:
        torch.distributed.barrier()
        # fused_kernels.load(args)
    # Simple barrier to make sure all ranks have passed the
    # compilation phase successfully before moving on to the
    # rest of the program. We think this might ensure that
    # the lock is released.
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(
            ">>> done with compiling and loading fused kernels. "
            "Compilation time: {:.3f} seconds".format(time.time() - start_time),
            flush=True,
        )

def _initialize_tp_communicators():
    """ initializing the communicators with user buffers for high-performance tensor-model-parallel
        communication overlap """

    try:
        import yaml

        import transformer_engine
        from transformer_engine.pytorch import module as te_module

    except ImportError:
        raise RuntimeError(
            "Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and "
            "'transformer_engine' packages"
        )

    args = get_args()

    if args.tp_comm_overlap_cfg is not None:
        with open(args.tp_comm_overlap_cfg, "r") as stream:
            ub_cfgs = yaml.safe_load(stream)
    else:
        ub_cfgs = {}

    # Per-microbatch activation shape handled by each context-parallel rank:
    # [tokens, hidden size].
    input_shape = [
        (args.seq_length * args.micro_batch_size) // args.context_parallel_size,
        args.hidden_size,
    ]

    # We create an MPI process group, which is needed to bootstrap the pipelined
    # tensor-model-parallel communication overlap.
    torch.distributed.new_group(backend='mpi')

    te_module.base.initialize_ub(
        shape=input_shape,
        tp_size=args.tensor_model_parallel_size,
        use_fp8=(args.fp8 is not None),
        ub_cfgs=ub_cfgs,
    )

def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
    """Initialize torch.distributed and core model parallel."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():

        if args.rank == 0:
            print(
                "torch distributed is already initialized, "
                "skipping initialization ...",
                flush=True,
            )
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()

    else:

        if args.rank == 0:
            print("> initializing torch distributed ...", flush=True)
        # Manually set the device ids.
        if device_count > 0:
            device_id = args.rank % device_count
            if args.local_rank is not None:
                assert (
                    args.local_rank == device_id
                ), "expected local-rank to be the same as rank % device-count."
            else:
                args.local_rank = device_id
            torch.cuda.set_device(device_id)
        else:
            device_id = None

        # Initialize the torch.distributed process group.
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size,
            rank=args.rank,
            init_method=args.dist_url,
            timeout=timedelta(minutes=args.distributed_timeout_minutes),
        )

    # Set the tensor model-parallel, pipeline model-parallel, and
    # data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print("model parallel is already initialized")
        else:
            mpu.initialize_model_parallel(
                args.tensor_model_parallel_size,
                args.pipeline_model_parallel_size,
                args.virtual_pipeline_model_parallel_size,
                args.pipeline_model_parallel_split_rank,
                context_parallel_size=args.context_parallel_size,
                expert_model_parallel_size=args.expert_model_parallel_size,
                distributed_timeout_minutes=args.distributed_timeout_minutes,
                nccl_communicator_config_path=args.nccl_communicator_config_path,
                order='tp-cp-ep-dp-pp' if not args.use_tp_pp_dp_mapping else 'tp-pp-dp',
                encoder_tensor_model_parallel_size=args.encoder_tensor_model_parallel_size,
                encoder_pipeline_model_parallel_size=args.encoder_pipeline_model_parallel_size,
                get_embedding_ranks=get_embedding_ranks,
                get_position_embedding_ranks=get_position_embedding_ranks,
            )
            if args.rank == 0:
                print(
                    f"> initialized tensor model parallel with size "
                    f"{mpu.get_tensor_model_parallel_world_size()}"
                )
                print(
                    f"> initialized pipeline model parallel with size "
                    f"{mpu.get_pipeline_model_parallel_world_size()}"
                )


def _init_autoresume():
    """Set autoresume start time."""
    autoresume = get_adlr_autoresume()
    if autoresume:
        torch.distributed.barrier()
        autoresume.init()
        torch.distributed.barrier()


def _set_random_seed(seed_, data_parallel_random_init=False):
    """Set random seed for reproducability."""
    if seed_ is not None and seed_ > 0:
        # Ensure that different pipeline MP stages get different seeds.
        seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
        # Ensure different data parallel ranks get different seeds
        if data_parallel_random_init:
            seed = seed + (10 * mpu.get_data_parallel_rank())
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.device_count() > 0:
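            # Seed the model-parallel CUDA RNG states used inside tensor-parallel regions.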
            tensor_parallel.model_parallel_cuda_manual_seed(seed)
    else:
        raise ValueError("Seed ({}) should be a positive integer.".format(seed))


def write_args_to_tensorboard():
    """Write arguments to tensorboard."""
    args = get_args()
    writer = get_tensorboard_writer()
    if writer:
        for arg in vars(args):
            writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration)


def set_jit_fusion_options():
    """Set PyTorch JIT layer fusion options."""
    # flags required to enable jit fusion kernels
    TORCH_MAJOR = int(torch.__version__.split(".")[0])
    TORCH_MINOR = int(torch.__version__.split(".")[1])
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
        # nvfuser
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(True)
        torch._C._jit_override_can_fuse_on_cpu(False)
        torch._C._jit_override_can_fuse_on_gpu(False)
        torch._C._jit_set_texpr_fuser_enabled(False)
        torch._C._jit_set_nvfuser_enabled(False)  # nvfuser disabled here (was True)
        torch._C._debug_set_autodiff_subgraph_inlining(False)
    else:
        # legacy pytorch fuser
        torch._C._jit_set_profiling_mode(False)
        torch._C._jit_set_profiling_executor(False)
        torch._C._jit_override_can_fuse_on_cpu(True)
        torch._C._jit_override_can_fuse_on_gpu(True)

    _warmup_jit_function()


def _warmup_jit_function():
    """Compilie JIT functions before the main training steps"""
    args = get_args()
    if args.bf16:
        dtype = torch.bfloat16
    elif args.fp16:
        dtype = torch.float16
    else:
        dtype = torch.float32

    # Warmup fused bias+gelu
    bias = torch.rand(
        args.ffn_hidden_size // args.tensor_model_parallel_size,
        dtype=dtype,
        device="cuda",
    )
    input = torch.rand(
        (
            args.seq_length // args.context_parallel_size,
            args.micro_batch_size,
            args.ffn_hidden_size // args.tensor_model_parallel_size,
        ),
        dtype=dtype,
        device="cuda",
    )
    # Warmup JIT fusions with the input grad_enable state of both forward
    # prop and recomputation
    for bias_grad, input_grad in zip([True, True], [False, True]):
        bias.requires_grad, input.requires_grad = bias_grad, input_grad
        for _ in range(5):
            if args.swiglu:
                output = bias_swiglu(input, bias)
            else:
                output = bias_gelu(bias, input)
    del bias, input, output

    # Warmup fused bias+dropout+add
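    # With sequence parallelism the sequence dimension is already split across
    # tensor-parallel ranks, so size the warmup tensors accordingly.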
    if args.sequence_parallel:
        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
    else:
        seq_length = args.seq_length
    input = torch.rand(
        (seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size),
        dtype=dtype,
        device="cuda",
    )
    residual = torch.rand(
        (seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size),
        dtype=dtype,
        device="cuda",
    )
    bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as(
        residual
    )
    dropout_rate = 0.1
    # Warmup JIT fusions with the input grad_enable state of both forward
    # prop and recomputation
    for input_grad, bias_grad, residual_grad in zip(
        [False, True], [True, True], [True, True]
    ):
        input.requires_grad = input_grad
        bias.requires_grad = bias_grad
        residual.requires_grad = residual_grad
        for _ in range(5):
            output = bias_dropout_add_fused_train([input, bias], residual, dropout_rate)
    del bias, input, residual, output
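    # Release cached GPU memory used by the warmup tensors.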
    torch.cuda.empty_cache()


def setup_logging() -> None:
    """ Sets the default logging level based on cmdline args and env vars.

    Precedence:
    1. Command line argument `--logging-level`
    2. Env var `MEGATRON_LOGGING_LEVEL`
    3. Default logging level (INFO)

    Returns: None
    """
    args = get_args()
    logging_level = None
    env_logging_level = os.getenv('MEGATRON_LOGGING_LEVEL', None)
    if env_logging_level is not None:
        logging_level = int(env_logging_level)
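    # The command-line argument takes precedence over the environment variable.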
    if args.logging_level is not None:
        logging_level = args.logging_level

    if logging_level is not None:
        logger.info(f'Setting logging level to {logging_level}')
        logging.getLogger().setLevel(logging_level)