Unverified commit b5f9e37c authored by Hongxin Liu, committed by GitHub

[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
parent 32e7f994
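
All of the hunks below follow one migration pattern: deprecated modules move under the `colossalai.legacy` namespace, and call sites update their imports accordingly. As a hedged sketch of that pattern, using only paths that appear in the diffs below (not an exhaustive mapping):

```python
# Before this commit (top-level paths, now deprecated):
# from colossalai.core import global_context as gpc
# from colossalai.context.parallel_mode import ParallelMode
# from colossalai.tensor import ProcessGroup

# After this commit, the same symbols live under colossalai.legacy:
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.tensor import ProcessGroup
```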
@@ -3,7 +3,8 @@ from typing import Any, Dict, Iterator, Optional, Tuple, Union
 import torch
 from torch import nn
-from colossalai.tensor import ColoParameter, ColoTensor, ProcessGroup
+from colossalai.legacy.tensor import ProcessGroup
+from colossalai.tensor import ColoParameter, ColoTensor
 from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
 # find named_params includes replica
......
@@ -3,9 +3,8 @@ from .memory_stats import MemStats  # isort:skip
 from .memory_monitor import AsyncMemoryMonitor, SyncCudaMemoryMonitor  # isort:skip
 from .memstats_collector import MemStatsCollector  # isort:skip
 from .chunk_memstats_collector import ChunkMemStatsCollector  # isort:skip
-from .static_memstats_collector import StaticMemStatsCollector  # isort:skip
 __all__ = [
-    'AsyncMemoryMonitor', 'SyncCudaMemoryMonitor', 'MemStatsCollector', 'ChunkMemStatsCollector',
-    'StaticMemStatsCollector', 'MemStats', 'OrderedParamGenerator'
+    'AsyncMemoryMonitor', 'SyncCudaMemoryMonitor', 'MemStatsCollector', 'ChunkMemStatsCollector', 'MemStats',
+    'OrderedParamGenerator'
 ]
 from typing import Optional
 from colossalai.utils import get_current_device
-from colossalai.utils.memory import colo_device_memory_capacity
 from colossalai.zero.gemini.chunk import ChunkManager
 from .memory_stats import MemStats
@@ -33,4 +32,5 @@ class ChunkMemStatsCollector(MemStatsCollector):
     @property
     def cuda_margin_mem(self) -> float:
+        from colossalai.legacy.utils.memory import colo_device_memory_capacity
         return colo_device_memory_capacity(get_current_device()) - self._memstats.max_overall_cuda
@@ -5,7 +5,7 @@ from time import sleep, time
 import torch
-from colossalai.utils import colo_device_memory_used, get_current_device
+from colossalai.utils import get_current_device
 class MemoryMonitor:
@@ -110,6 +110,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
         return max_usage
     def _measure_usage(self):
+        from colossalai.legacy.utils import colo_device_memory_used
         max_usage = 0
         while self.keep_measuring:
            max_usage = max(
......
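The `_measure_usage` hunk above also shows how the hard dependency is broken: the legacy helper is no longer imported at module level but deferred into the method that needs it. A minimal, hypothetical standalone sketch of that deferred-import pattern (the function name is illustrative; the two helpers are the ones named in the diff):

```python
def measure_cuda_usage() -> int:
    # Importing here instead of at module top keeps colossalai.legacy out of
    # this module's import-time dependencies; it is only loaded on first call.
    from colossalai.legacy.utils import colo_device_memory_used
    from colossalai.utils import get_current_device

    # Returns the number of bytes currently in use on the active CUDA device.
    return colo_device_memory_used(get_current_device())
```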
@@ -70,7 +70,7 @@ class MemStatsCollector:
         Sampling model data statistics.
         """
         if self._start_flag and not self.use_outside_memstats:
-            from colossalai.zero.legacy.gemini import StatefulTensor
+            from colossalai.legacy.zero.gemini import StatefulTensor
             # The following code works for ZeroInitContext, which is deprecated in v0.1.12
             cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
......
 import torch.nn
-from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float
-from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import (
+from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
     GradMemStats,
     GradMemTracerHook,
     ParamMemTracerHook,
 )
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import _cast_float
 from .memory_stats import MemStats
......
@@ -6,8 +6,8 @@ from typing import Dict, List, Optional, Tuple, Type
 import torch
+from colossalai.legacy.utils.memory import colo_device_memory_capacity
 from colossalai.utils import get_current_device
-from colossalai.utils.memory import colo_device_memory_capacity
-from colossalai.zero.gemini.chunk import Chunk
+from .chunk import Chunk, ChunkManager
......
@@ -7,9 +7,6 @@ from torch import Tensor, inf
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from torch.distributed import ProcessGroup
-from colossalai.tensor import ColoParameter
-from colossalai.utils import is_model_parallel_parameter
 def flatten(input_):
     return _flatten_dense_tensors(input_)
......
@@ -108,5 +108,5 @@ We support `autodoc` to extract the docstring and transform it into a Web element
 You just need to add `{{ autodoc:<mod-name> }}` in your markdown as a single line. An example is given below and you can see the outcome in [this PR](https://github.com/hpcaitech/ColossalAI-Documentation/pull/175).
 ```markdown
-{{ autodoc:colossalai.amp.apex_amp.convert_to_apex_amp }}
+{{ autodoc:colossalai.legacy.amp.apex_amp.convert_to_apex_amp }}
 ```
@@ -31,7 +31,7 @@ global context for users to easily manage their process groups. If you wish to add new process groups, you can easily
 define a new class and set it in your configuration file. To define your own way of creating process groups, you can
 follow the steps below to create a new distributed initialization.
-1. Add your parallel mode in `colossalai.context.parallel_mode.ParallelMode`.
+1. Add your parallel mode in `colossalai.legacy.context.parallel_mode.ParallelMode`.
    ```python
    class ParallelMode(Enum):
        GLOBAL = 'global'
......
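A hedged sketch of what step 1 might look like once a custom mode is added (`NEW_MODE` is a hypothetical placeholder, not a real Colossal-AI member):

```python
from enum import Enum

class ParallelMode(Enum):
    GLOBAL = 'global'
    DATA = 'data'
    # hypothetical user-defined mode for a custom process-group layout
    NEW_MODE = 'new_mode'
```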
@@ -37,7 +37,7 @@ import torch.nn as nn
 from colossalai import nn as col_nn
 from colossalai.amp import AMP_TYPE
 from colossalai.legacy.builder.pipeline import partition_uniform
-from colossalai.context.parallel_mode import ParallelMode
+from colossalai.legacy.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                                PipelineSchedule)
......
@@ -30,24 +30,4 @@ This command will give you information regarding the version compatibility and
 To launch distributed jobs on single or multiple nodes, the command `colossalai run` can be used for process launching.
 You may refer to [Launch Colossal-AI](./launch_colossalai.md) for more details.
-## Tensor Parallel Micro-Benchmarking
-As Colossal-AI provides an array of tensor parallelism methods, it is not intuitive to choose one for your hardware and
-model. Therefore, we provide a simple benchmark to evaluate the performance of the various tensor parallelisms on your system.
-The benchmark runs a simple MLP model where the input data is of the shape `(batch_size, seq_length, hidden_size)`.
-Based on the number of GPUs, the CLI will look for all possible tensor parallel configurations and display the benchmarking results.
-You can customize the benchmarking configurations by checking out `colossalai benchmark --help`.
-```shell
-# run on 4 GPUs
-colossalai benchmark --gpus 4
-# run on 8 GPUs
-colossalai benchmark --gpus 8
-```
-:::caution
-Only single-node benchmarking is currently supported.
-:::
 <!-- doc-test-command: echo -->
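With the benchmark subcommand removed, `colossalai run` is the entry point this doc still covers. A brief hedged usage sketch (the script name `train.py` is a placeholder):

```shell
# launch 4 processes on the local node; train.py is a placeholder script
colossalai run --nproc_per_node 4 train.py
```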
@@ -24,7 +24,7 @@
 Parallelism is usually managed by process groups: processes that take part in the same parallel algorithm are placed in the same process group, and different parallel algorithms require their own process groups.
 Colossal-AI provides a global context that lets users manage their process groups easily. If you want to add a new process group, you can easily define a new class and set it in your configuration file. To define your own way of creating process groups, follow the steps below to create a new distributed initialization.
-1. Add your own parallel mode in `colossalai.context.parallel_mode.ParallelMode`.
+1. Add your own parallel mode in `colossalai.legacy.context.parallel_mode.ParallelMode`.
    ```python
    class ParallelMode(Enum):
        GLOBAL = 'global'
......
@@ -37,7 +37,7 @@ import torch.nn as nn
 from colossalai import nn as col_nn
 from colossalai.amp import AMP_TYPE
 from colossalai.legacy.builder.pipeline import partition_uniform
-from colossalai.context.parallel_mode import ParallelMode
+from colossalai.legacy.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                                PipelineSchedule)
......
@@ -26,22 +26,4 @@ Colossal-AI provides users with a command-line tool, which can currently be used to
 For distributed training, `colossalai run` can be used to launch single-node or multi-node multi-process jobs. See [Launch Colossal-AI](./launch_colossalai.md) for details.
-## Tensor Parallel Benchmarking
-Colossal-AI offers a variety of tensor parallelism methods. Understanding them well takes some study, so it is hard for newcomers to pick a parallelism scheme by experience alone.
-We therefore provide a simple benchmark that lets users test the performance of the different tensor parallelisms on their own machines. The benchmark runs a parallel MLP model
-whose input data has the shape `(batch_size, seq_length, hidden_size)`. Given the number of GPUs, Colossal-AI searches all feasible parallel configurations. Users can customize the test parameters via `colossalai benchmark --help`.
-```shell
-# run on 4 GPUs
-colossalai benchmark --gpus 4
-# run on 8 GPUs
-colossalai benchmark --gpus 8
-```
-:::caution
-Only single-node benchmarking is currently supported.
-:::
 <!-- doc-test-command: echo -->
@@ -16,7 +16,7 @@ from transformers import (
     get_linear_schedule_with_warmup,
 )
-from colossalai.core import global_context as gpc
+from colossalai.legacy.core import global_context as gpc
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from colossalai.nn.optimizer import FusedAdam, HybridAdam
......
@@ -17,7 +17,7 @@ from utils.logger import Logger
 import colossalai
 from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
+from colossalai.legacy.core import global_context as gpc
 from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import get_current_device
......
@@ -5,7 +5,7 @@ import shutil
 import psutil
 import torch
-from colossalai.core import global_context as gpc
+from colossalai.legacy.core import global_context as gpc
 def logging(s, log_path, print_=True, log_=True):
......
 #!/bin/bash
 set -xe
-pip install -r requirements.txt
+echo "this test is slow"
-HF_DATASETS_OFFLINE=1
-TRANSFORMERS_OFFLINE=1
-DIFFUSERS_OFFLINE=1
+# pip install -r requirements.txt
-# "torch_ddp" "torch_ddp_fp16" "low_level_zero"
-for plugin in "gemini"; do
-  torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
-    --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \
-    --instance_data_dir="/data/dreambooth/Teyvat/data" \
-    --output_dir="./weight_output" \
-    --instance_prompt="a picture of a dog" \
-    --resolution=512 \
-    --plugin=$plugin \
-    --train_batch_size=1 \
-    --learning_rate=5e-6 \
-    --lr_scheduler="constant" \
-    --lr_warmup_steps=0 \
-    --test_run=True \
-    --num_class_images=200
-done
+# HF_DATASETS_OFFLINE=1
+# TRANSFORMERS_OFFLINE=1
+# DIFFUSERS_OFFLINE=1
+# # "torch_ddp" "torch_ddp_fp16" "low_level_zero"
+# for plugin in "gemini"; do
+#   torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
+#     --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \
+#     --instance_data_dir="/data/dreambooth/Teyvat/data" \
+#     --output_dir="./weight_output" \
+#     --instance_prompt="a picture of a dog" \
+#     --resolution=512 \
+#     --plugin=$plugin \
+#     --train_batch_size=1 \
+#     --learning_rate=5e-6 \
+#     --lr_scheduler="constant" \
+#     --lr_warmup_steps=0 \
+#     --test_run=True \
+#     --num_class_images=200
+# done
@@ -7,6 +7,7 @@ from pathlib import Path
 from typing import Optional
 import torch
+import torch.distributed as dist
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
@@ -21,13 +22,9 @@ from transformers import AutoTokenizer, PretrainedConfig
 import colossalai
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext
-from colossalai.zero.gemini import get_static_torch_model
 disable_existing_loggers()
 logger = get_dist_logger()
@@ -366,8 +363,8 @@ def main(args):
     else:
         colossalai.launch_from_torch(config={}, seed=args.seed)
-    local_rank = gpc.get_local_rank(ParallelMode.DATA)
-    world_size = gpc.get_world_size(ParallelMode.DATA)
+    local_rank = dist.get_rank()
+    world_size = dist.get_world_size()
     if args.with_prior_preservation:
         class_images_dir = Path(args.class_data_dir)
......
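The final hunk replaces gpc-based rank queries with plain `torch.distributed` calls. A minimal hedged sketch of the new idiom (meant to be launched via torchrun; the print is illustrative):

```python
import torch.distributed as dist

import colossalai

# launch_from_torch reads torchrun's environment variables and initializes the
# default process group, after which plain torch.distributed queries work.
colossalai.launch_from_torch(config={})

local_rank = dist.get_rank()
world_size = dist.get_world_size()
print(f"rank {local_rank} of {world_size}")
```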