Unverified Commit b1c75f6f authored by Masaki Kozuki, committed by GitHub

small changes in test and logger format (#1278)

* cosmetic refactor in test

* log with PID

* log more info: rank, pid, filename, lineNo
parent a960fe8c
@@ -34,6 +34,6 @@ class RankInfoFormatter(logging.Formatter):
 _library_root_logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
-handler.setFormatter(RankInfoFormatter("%(asctime)s - %(name)s - %(levelname)s - %(rank_info)s - %(message)s"))
+handler.setFormatter(RankInfoFormatter("%(asctime)s - PID:%(process)d - rank:%(rank_info)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s", "%y-%m-%d %H:%M:%S"))
 _library_root_logger.addHandler(handler)
 _library_root_logger.propagate = False
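For illustration only, a minimal self-contained sketch (not the apex implementation; the body of RankInfoFormatter is not shown in this hunk) of a formatter that injects a rank_info field so the new format string can reference it. In apex, the rank tuple would presumably come from parallel_state.get_rank_info(); here it is a hard-coded placeholder.

import logging

class DemoRankInfoFormatter(logging.Formatter):
    """Toy stand-in for RankInfoFormatter: attaches a rank_info field to each record."""
    def format(self, record: logging.LogRecord) -> str:
        # Placeholder tuple; apex would derive this from parallel_state.get_rank_info().
        record.rank_info = (0, 0, 0, 0)
        return super().format(record)

handler = logging.StreamHandler()
handler.setFormatter(DemoRankInfoFormatter(
    "%(asctime)s - PID:%(process)d - rank:%(rank_info)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s",
    "%y-%m-%d %H:%M:%S",
))
logger = logging.getLogger("demo")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info("hello")
# e.g. 22-01-07 10:30:00 - PID:12345 - rank:(0, 0, 0, 0) - demo.py:18 - INFO - hello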
-from typing import Optional
+from typing import Optional, Sequence

 import torch


+def _get_autocast_dtypes() -> Sequence[torch.dtype]:
+    if torch.cuda.is_bf16_supported():
+        return [torch.half, torch.bfloat16]
+    return [torch.half]
+
+
 def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype:
     if not torch.is_autocast_enabled():
         return torch.float or dtype
...
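A brief usage sketch of the new helper, assuming a CUDA build of PyTorch; the updated test below prepends torch.float32 to also cover the non-autocast path.

import torch
from apex._autocast_utils import _get_autocast_dtypes

# [torch.float16, torch.bfloat16] on bf16-capable GPUs (e.g. Ampere), else [torch.float16].
print(_get_autocast_dtypes())

# Full dtype sweep used by the updated test: fp32 plus every autocast-capable dtype.
dtypes = [torch.float32] + _get_autocast_dtypes()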
@@ -175,15 +175,15 @@ def initialize_model_parallel(
     _EMBEDDING_GLOBAL_RANKS = embedding_ranks


 def get_rank_info() -> Tuple[int, int, int]:
-    """Returns a tuple of (tensor, pipeline, data)-parallel-rank for logger."""
+    """Returns a tuple of (data, tensor, pipeline, virtual pipeline)-parallel-rank for logger."""
     if model_parallel_is_initialized():
         return (
+            get_data_parallel_rank(),
             get_tensor_model_parallel_rank(),
             get_pipeline_model_parallel_rank(),
-            # get_virtual_pipeline_model_parallel_rank(),
-            get_data_parallel_rank(),
+            get_virtual_pipeline_model_parallel_rank(),
         )
-    return (0, 0, 0)
+    return (0, 0, 0, 0)


 def model_parallel_is_initialized():
...
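A short sketch of how the reordered tuple reads after this change; the variable names are illustrative and not part of the commit.

from apex.transformer import parallel_state

# After initialize_model_parallel(...) the tuple is ordered
# (data, tensor, pipeline, virtual pipeline); before initialization it is (0, 0, 0, 0).
data_rank, tp_rank, pp_rank, vpp_rank = parallel_state.get_rank_info()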
@@ -4,6 +4,7 @@ import warnings
 import torch
 from torch.cuda.amp import GradScaler

+from apex._autocast_utils import _get_autocast_dtypes
 from apex.transformer import parallel_state
 from apex.transformer.pipeline_parallel import get_forward_backward_func
 from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
@@ -36,7 +37,6 @@ fwd_bwd_functions = {
 }


-# TODO (mkozuki): Add a case with `autocast` and `GradScaler`.
 # Run forward & backward for one minibatch.
 def forward_backward_func_template(
     args,
@@ -47,10 +47,10 @@ def forward_backward_func_template(
     dtype: torch.dtype,
     grad_scaler: Optional[GradScaler],
 ) -> None:
-    print_separator(f"name: {name}, pipeline model parallel size: {pipeline_model_parallel_size}")
+    print_separator(f"name: {name}, dtype: {dtype}, use grad_scaler: {grad_scaler is not None}, pipeline model parallel size: {pipeline_model_parallel_size}")
     virtual_pipeline_model_parallel_size = 2 if name == "interleaving" else None
     if name == "no_pipelining":
-        # note (mkozuki): `forward_backward_no_pipelining` is **NOTE** compatible with
+        # note (mkozuki): `forward_backward_no_pipelining` is **NOT** compatible with
         # pipeline_model_parallel_size>1. So use pipeline_model_parallel_size as
         # tensor_model_parallel_size and set pipeline_model_parallel_size to 1.
         parallel_state.initialize_model_parallel(1, 1, None)
@@ -120,9 +120,7 @@ if __name__ == "__main__":
     batch_size = args.global_batch_size
     micro_batch_size = args.micro_batch_size
-    autocast_dtypes = (
-        [torch.half, torch.bfloat16] if torch.cuda.is_bf16_supported() else [torch.half]
-    ) + [torch.float32]
+    dtypes = [torch.float32] + _get_autocast_dtypes()
     for forward_only in (True, False):
         for name, forward_backward_func in fwd_bwd_functions.items():
             if name == "interleaving" and torch.cuda.device_count() <= 2:
@@ -131,7 +129,7 @@ if __name__ == "__main__":
                     "while interleaved scheduled pipeline parallel requires >2 gpus."
                 )
                 continue
-            for dtype in autocast_dtypes:
+            for dtype in dtypes:
                 if torch.distributed.get_rank() == 0:
                     _logger.info(f"forward_only: {forward_only}, name: {name}, dtype: {dtype}")
                 grad_scaler = torch.cuda.amp.GradScaler(init_scale=4.0) if dtype == torch.half else None
...
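For context, a hypothetical helper (not part of this commit) expressing the GradScaler choice above: fp16 relies on dynamic loss scaling, while bf16 and fp32 have enough dynamic range to run unscaled.

import torch
from typing import Optional

def make_grad_scaler(dtype: torch.dtype) -> Optional[torch.cuda.amp.GradScaler]:
    """Return a loss scaler only for fp16; bf16/fp32 run without one."""
    if dtype == torch.half:
        return torch.cuda.amp.GradScaler(init_scale=4.0)
    return None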