OpenDAS / Megatron-LM · Commit a4ef7c40

update jit warmup code to handle sequence parallelism

Authored May 16, 2022 by Vijay Korthikanti
Parent: 7f9a48ba

Showing 1 changed file with 9 additions and 3 deletions.

megatron/initialize.py (+9, -3)
@@ -266,7 +266,13 @@ def _warmup_jit_function():
     # Warmup fused bias+gelu
     bias = torch.rand(args.ffn_hidden_size // args.tensor_model_parallel_size,
                       dtype=dtype, device='cuda')
-    input = torch.rand((args.seq_length, args.micro_batch_size,
+    if args.sequence_parallel:
+        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
+    else:
+        seq_length = args.seq_length
+    input = torch.rand((seq_length, args.micro_batch_size,
                         args.ffn_hidden_size // args.tensor_model_parallel_size),
                        dtype=dtype, device='cuda')
     # Warmup JIT fusions with the input grad_enable state of both forward
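For context on the new branch: with sequence parallelism enabled, activations are split along the sequence dimension across the tensor-model-parallel group, so each rank's warmup tensors must use the per-rank slice of args.seq_length. The following is a minimal self-contained sketch of that arithmetic; the helper name per_rank_seq_length is hypothetical and not part of Megatron-LM.

def per_rank_seq_length(seq_length: int,
                        tensor_model_parallel_world_size: int,
                        sequence_parallel: bool) -> int:
    # Mirrors the if/else added above: under sequence parallelism each
    # tensor-parallel rank holds seq_length // world_size rows.
    if sequence_parallel:
        return seq_length // tensor_model_parallel_world_size
    return seq_length

# Example: a global sequence length of 2048 with 8-way tensor parallelism
# and sequence parallelism on gives 2048 // 8 = 256 rows per rank.
assert per_rank_seq_length(2048, 8, True) == 256
assert per_rank_seq_length(2048, 8, False) == 2048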
@@ -278,9 +284,9 @@ def _warmup_jit_function():
     del bias, input, output
     # Warmup fused bias+dropout+add
-    input = torch.rand((args.seq_length, args.micro_batch_size, args.hidden_size),
+    input = torch.rand((seq_length, args.micro_batch_size, args.hidden_size),
                        dtype=dtype, device='cuda')
-    residual = torch.rand((args.seq_length, args.micro_batch_size, args.hidden_size),
+    residual = torch.rand((seq_length, args.micro_batch_size, args.hidden_size),
                           dtype=dtype, device='cuda')
     bias = torch.rand((args.hidden_size), dtype=dtype,
                       device='cuda').expand_as(residual)
     dropout_rate = 0.1
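The diff only resizes the warmup tensors; for readers unfamiliar with the surrounding function, below is a rough standalone sketch of the bias+dropout+add warmup with the corrected shapes. It uses plain PyTorch ops in place of Megatron-LM's fused JIT kernel, and the function name, iteration count, and default arguments are illustrative assumptions rather than code from the repository.

import torch
import torch.nn.functional as F

def warmup_bias_dropout_add_sketch(seq_length, micro_batch_size, hidden_size,
                                   dtype=torch.float16, device='cuda',
                                   dropout_rate=0.1, iters=5):
    # Shapes mirror the diff: [per-rank seq_length, micro batch, hidden size].
    input_ = torch.rand((seq_length, micro_batch_size, hidden_size),
                        dtype=dtype, device=device)
    residual = torch.rand((seq_length, micro_batch_size, hidden_size),
                          dtype=dtype, device=device)
    bias = torch.rand(hidden_size, dtype=dtype, device=device).expand_as(residual)
    # Run both the no-grad and grad-enabled paths a few times so any JIT
    # compilation or kernel autotuning happens before the timed training steps.
    for requires_grad in (False, True):
        input_.requires_grad_(requires_grad)
        for _ in range(iters):
            out = F.dropout(input_ + bias, p=dropout_rate, training=True) + residual
    del input_, residual, bias, out

With the per-rank sequence length from the earlier sketch, a call might look like warmup_bias_dropout_add_sketch(256, 4, 4096); if the warmup used the global sequence length instead, the warmup shapes would not match the ones seen during sequence-parallel training, defeating the purpose of pre-compiling the fused kernels.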