Commit 89fe0277 authored by Hongxin Liu's avatar Hongxin Liu
Browse files

[legacy] move trainer to legacy (#4545)

* [legacy] move trainer to legacy

* [doc] update docs related to trainer

* [test] ignore legacy test
parent 807e01a4
from typing import Union, List, Any
from typing import Any, List, Union
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from colossalai.engine import Engine
from colossalai.legacy.trainer.hooks import BaseHook
from colossalai.logging import DistributedLogger
from colossalai.utils import MultiTimer
from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
from colossalai.trainer.hooks import BaseHook
from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
class Trainer:
......
from ._base_hook import BaseHook
from ._checkpoint_hook import SaveCheckpointHook
from ._log_hook import (LogMemoryByEpochHook, LogMetricByEpochHook, LogMetricByStepHook, LogTimingByEpochHook,
TensorboardHook)
from ._log_hook import (
LogMemoryByEpochHook,
LogMetricByEpochHook,
LogMetricByStepHook,
LogTimingByEpochHook,
TensorboardHook,
)
from ._lr_scheduler_hook import LRSchedulerHook
from ._metric_hook import AccuracyHook, LossHook, MetricHook, ThroughputHook
......
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
from colossalai.logging import get_dist_logger
from colossalai.legacy.trainer.hooks import BaseHook
from colossalai.logging import get_dist_logger
from colossalai.registry import HOOKS
from colossalai.trainer.hooks import BaseHook
from colossalai.utils.checkpointing import save_checkpoint
from ._lr_scheduler_hook import LRSchedulerHook
......
......@@ -3,17 +3,17 @@
import os
import os.path as osp
from typing import List
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.registry import HOOKS
from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric
from colossalai.logging import DistributedLogger
from colossalai.utils import report_memory_usage, is_dp_rank_0, \
is_tp_rank_0, is_no_pp_or_last_stage, MultiTimer
from colossalai.registry import HOOKS
from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage
from ._base_hook import BaseHook
from ._commons_ import _format_number
from colossalai.trainer.hooks._metric_hook import ThroughputMetric
class LogByEpochHook(BaseHook):
......
from colossalai.registry import HOOKS
from torch import Tensor
from colossalai.registry import HOOKS
from ._metric_hook import LearningRateMetric, MetricHook
......
......@@ -6,6 +6,7 @@ from typing import Callable
import torch
import torch.distributed as dist
from colossalai.communication import all_reduce
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
......
......@@ -43,7 +43,7 @@ from colossalai.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils.timer import MultiTimer
from model_zoo.gpt import GPTLMLoss
from torch.nn import functional as F
......@@ -268,3 +268,4 @@ def train():
return_output_label=False,
)
```
<!-- doc-test-command: echo -->
......@@ -38,7 +38,7 @@ from colossalai.builder import build_pipeline_model
from colossalai.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from timm.models import vision_transformer as vit
from torchvision import transforms
......@@ -245,3 +245,4 @@ def train():
hooks=hook_list,
display_progress=True)
```
<!-- doc-test-command: echo -->
......@@ -79,7 +79,7 @@ from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
```
- Other modules
......@@ -644,3 +644,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
# If your torch >= 1.9.0
# python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
```
<!-- doc-test-command: echo -->
......@@ -64,7 +64,7 @@ Trainer is a more high-level wrapper for the user to execute training with fewer
```python
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
# build components and initialize with colossalai.initialize
...
......@@ -107,7 +107,7 @@ If you want to customize your own hook class, you can inherit `hooks.BaseHook` a
```python
from colossalai.logging import get_dist_logger
from colossalai.trainer import hooks
from colossalai.legacy.trainer import hooks
class LogMessageHook(hooks.BaseHook):
......@@ -345,7 +345,7 @@ If you wish to train with a trainer object, you can follow the code snippet belo
```python
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
# create a trainer object
......@@ -387,3 +387,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
# with trainer
python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
```
<!-- doc-test-command: echo -->
......@@ -41,7 +41,7 @@ for epoch in range(num_epochs):
#### Save when using trainer
```python
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
model = ...
engine, _, _, _ = colossalai.initialize(model=model, ...)
trainer = Trainer(engine, ...)
......@@ -61,3 +61,4 @@ model = ...
load_checkpoint('xxx.pt', model)
... # train or test
```
<!-- doc-test-command: echo -->
......@@ -267,7 +267,7 @@ from pathlib import Path
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.nn.lr_scheduler import LinearWarmupLR
from timm.models import vit_base_patch16_224
from torchvision import datasets, transforms
......
......@@ -79,7 +79,7 @@ import colossalai.nn as col_nn
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.context import ParallelMode
from colossalai.pipeline.pipelinable import PipelinableContext
......@@ -157,3 +157,4 @@ trainer.fit(train_dataloader=train_dataloader,
```
We use `2` pipeline stages and the batch will be split into `4` micro batches.
<!-- doc-test-command: echo -->
......@@ -43,7 +43,7 @@ from colossalai.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils.timer import MultiTimer
from model_zoo.gpt import GPTLMLoss
from torch.nn import functional as F
......@@ -273,3 +273,4 @@ def train():
return_output_label=False,
)
```
<!-- doc-test-command: echo -->
......@@ -36,7 +36,7 @@ from colossalai.builder import build_pipeline_model
from colossalai.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from timm.models import vision_transformer as vit
from torchvision import transforms
......@@ -244,3 +244,4 @@ def train():
hooks=hook_list,
display_progress=True)
```
<!-- doc-test-command: echo -->
......@@ -74,7 +74,7 @@ from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy
from colossalai.trainer import Trainer, hooks
from colossalai.legacy.trainer import Trainer, hooks
```
- 其他模块
......@@ -589,3 +589,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
# If your torch >= 1.9.0
# python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
```
<!-- doc-test-command: echo -->
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment