Unverified Commit 554aa959 authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
......@@ -73,7 +73,7 @@ from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.metric import Accuracy
from colossalai.legacy.nn.metric import Accuracy
from colossalai.legacy.trainer import Trainer, hooks
```
......
......@@ -340,7 +340,7 @@ for epoch in range(gpc.config.NUM_EPOCHS):
```python
from colossalai.nn.metric import Accuracy
from colossalai.legacy.nn.metric import Accuracy
from colossalai.legacy.trainer import Trainer, hooks
......
......@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
from colossalai.legacy.nn.layer.utils import divide
from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
from colossalai.nn.layer.utils import divide
from colossalai.utils import get_current_device
......
......@@ -11,9 +11,9 @@ from colossalai import kernel
from colossalai import nn as col_nn
from colossalai.core import global_context as gpc
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.nn.layer.utils import ACT2FN, divide
from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
from colossalai.legacy.nn.layer.utils import ACT2FN, divide
from colossalai.utils import checkpoint
from colossalai.utils.activation_checkpoint import checkpoint
......
......@@ -9,8 +9,8 @@ from colossalai import kernel
from colossalai import nn as col_nn
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.logging import get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.pipeline.utils import partition_uniform
from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D
......
#!/bin/bash
set -euxo pipefail
pip install -r requirements.txt
colossalai run --nproc_per_node 4 train.py --config config.py
echo "legacy example"
# pip install -r requirements.txt
# colossalai run --nproc_per_node 4 train.py --config config.py
......@@ -7,8 +7,8 @@ from tqdm import tqdm
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.nn import CrossEntropyLoss
from colossalai.logging import get_dist_logger
from colossalai.nn import CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.pipeline.pipelinable import PipelinableContext
from colossalai.utils import is_using_pp
......
from colossalai.context.parallel_mode import ParallelMode
import inspect
import torch
import torch.nn as nn
import inspect
from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding
from .layers.init_method import init_normal, output_init_normal
from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.kernel import LayerNorm
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.logging import get_dist_logger
from colossalai.pipeline.utils import partition_uniform
from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding
from .layers.init_method import init_normal, output_init_normal
class BertForPretrain(nn.Module):
def __init__(self,
vocab_size,
hidden_size,
max_sequence_length,
num_attention_heads,
num_layers,
add_binary_head,
is_naive_fp16,
num_tokentypes=2,
dropout_prob=0.1,
mlp_ratio=4,
init_std=0.02,
convert_fp16_to_fp32_in_softmax=False,
):
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
num_attention_heads,
num_layers,
add_binary_head,
is_naive_fp16,
num_tokentypes=2,
dropout_prob=0.1,
mlp_ratio=4,
init_std=0.02,
convert_fp16_to_fp32_in_softmax=False,
):
super().__init__()
self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE)
assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size'
......@@ -47,19 +51,19 @@ class BertForPretrain(nn.Module):
self.bert_layers = nn.ModuleList()
for i in range(num_layers):
bert_layer = BertLayer(layer_number=i+1,
bert_layer = BertLayer(layer_number=i + 1,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
attention_dropout=dropout_prob,
mlp_ratio=mlp_ratio,
hidden_dropout=dropout_prob,
convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
is_naive_fp16=is_naive_fp16
)
is_naive_fp16=is_naive_fp16)
self.bert_layers.append(bert_layer)
self.layer_norm = LayerNorm(hidden_size)
self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0),
self.head = BertDualHead(hidden_size,
self.embedding.word_embedding_weight.size(0),
add_binary_head=add_binary_head)
self.reset_parameters()
......@@ -166,22 +170,20 @@ class PipelineBertForPretrain(nn.Module):
end_idx = num_layers
for i in range(start_idx, end_idx):
bert_layer = BertLayer(layer_number=i+1,
bert_layer = BertLayer(layer_number=i + 1,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
attention_dropout=dropout_prob,
mlp_ratio=mlp_ratio,
hidden_dropout=dropout_prob,
convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
is_naive_fp16=is_naive_fp16
)
is_naive_fp16=is_naive_fp16)
self.bert_layers.append(bert_layer)
if self.last_stage:
self.word_embeddings = VocabEmbedding(vocab_size, hidden_size)
self.layer_norm = LayerNorm(hidden_size)
self.head = BertDualHead(hidden_size, vocab_size,
add_binary_head=add_binary_head)
self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head)
self.reset_parameters()
def _init_normal(self, tensor):
......
import torch
import torch.nn as nn
from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing
from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
from colossalai.kernel.cuda_native import LayerNorm
from .mlp import TransformerMLP
from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train
from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing
from .dropout import get_bias_dropout_add
from .mlp import TransformerMLP
def attention_mask_func(attention_scores, attention_mask):
......@@ -48,8 +50,7 @@ class BertLayer(nn.Module):
layer_number=layer_number,
apply_query_key_layer_scaling=True,
convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
fp16=is_naive_fp16
)
fp16=is_naive_fp16)
self.hidden_dropout = hidden_dropout
self.bias_dropout_fusion = bias_dropout_fusion
......@@ -89,11 +90,8 @@ class BertLayer(nn.Module):
# re-enable torch grad to enable fused optimization.
with torch.enable_grad():
layernorm_input = bias_dropout_add_func(
attention_output,
attention_bias.expand_as(residual),
residual,
self.hidden_dropout)
layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual,
self.hidden_dropout)
# Layer norm post the self attention.
layernorm_output = self.post_attention_layernorm(layernorm_input)
......@@ -109,10 +107,6 @@ class BertLayer(nn.Module):
# re-enable torch grad to enable fused optimization.
with torch.enable_grad():
output = bias_dropout_add_func(
mlp_output,
mlp_bias.expand_as(residual),
residual,
self.hidden_dropout)
output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout)
return output
......@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from colossalai.nn import CheckpointModule
from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator
......
......@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from colossalai.nn import CheckpointModule
from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator
......
......@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from colossalai.nn import CheckpointModule
from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils import DummyDataGenerator
......
......@@ -3,7 +3,7 @@
import torch
import torch.nn as nn
from colossalai.nn import CheckpointModule
from colossalai.legacy.nn import CheckpointModule
from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator
......
import torch
import torch.nn as nn
from colossalai.nn import CheckpointModule
from colossalai.legacy.nn import CheckpointModule
from colossalai.utils.cuda import get_current_device
from .registry import non_distributed_component_funcs
......
import pytest
import torch
from colossalai.communication.p2p_v2 import _recv_object, _send_object
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.legacy.communication.p2p_v2 import _recv_object, _send_object
from colossalai.logging import disable_existing_loggers
from colossalai.testing import rerun_if_address_is_in_use, spawn
......
......@@ -2,10 +2,10 @@ import pytest
import torch
import torch.distributed as dist
from colossalai.communication import all_gather, all_reduce, reduce_scatter
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.legacy.communication import all_gather, all_reduce, reduce_scatter
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
......
import pytest
import torch
from colossalai.communication.p2p import (
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.legacy.communication.p2p import (
recv_backward,
recv_forward,
send_backward,
......@@ -9,9 +12,6 @@ from colossalai.communication.p2p import (
send_forward,
send_forward_recv_backward,
)
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.testing import rerun_if_address_is_in_use, spawn
CONFIG = dict(parallel=dict(pipeline=2))
......
import pytest
import torch
from colossalai.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.legacy.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
from colossalai.logging import disable_existing_loggers
from colossalai.testing import rerun_if_address_is_in_use, spawn
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment