OpenDAS / ColossalAI · commit 7904baf6 (unverified)

fix layers/schedule for hybrid parallelization (#111) (#112)

Authored Jan 04, 2022 by ver217; committed via GitHub on Jan 04, 2022.
Parent commit: f03bcb35

Showing 6 changed files with 44 additions and 18 deletions (+44 −18):
colossalai/engine/schedule/_pipeline_schedule.py   +6  −2
colossalai/nn/layer/parallel_1d/layers.py          +6  −2
colossalai/nn/layer/parallel_2d/_operation.py      +10 −9
colossalai/nn/layer/wrapper/pipeline_wrapper.py    +6  −0
colossalai/trainer/hooks/_metric_hook.py           +15 −4
model_zoo/gpt/gpt.py                               +1  −1
colossalai/engine/schedule/_pipeline_schedule.py

@@ -48,9 +48,13 @@ class PipelineSchedule(BaseSchedule):
         # Pipeline schedule just puts data in memory
         self.batch_data, self.batch_label = super().load_batch(data_iter, to_gpu=False)
         self.microbatch_offset = 0
-        assert self.batch_size % self.num_microbatches == 0, \
+        if isinstance(self.batch_data, torch.Tensor):
+            batch_size = self.batch_data.size(0)
+        else:
+            batch_size = next(iter(self.batch_data.values())).size(0)
+        assert batch_size % self.num_microbatches == 0, \
             "Batch size should divided by the number of microbatches"
-        self.microbatch_size = self.batch_size // self.num_microbatches
+        self.microbatch_size = batch_size // self.num_microbatches

     def _get_data_slice(self, data, offset):
         if isinstance(data, torch.Tensor):
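The fix reads the batch size off the loaded data itself instead of a stored self.batch_size, so pipeline scheduling also works when the loader yields a dict of tensors rather than a single tensor. A minimal sketch of the same logic, outside the class (the helper name infer_batch_size is illustrative, not from the repo):

import torch

def infer_batch_size(batch_data):
    # Mirrors the fix: works for a plain tensor or a dict of tensors
    # (e.g. {'input_ids': ..., 'attention_mask': ...}); every tensor in
    # the dict shares the same leading batch dimension.
    if isinstance(batch_data, torch.Tensor):
        return batch_data.size(0)
    return next(iter(batch_data.values())).size(0)

# Tensor batch and dict batch give the same answer:
assert infer_batch_size(torch.zeros(8, 16)) == 8
assert infer_batch_size({'input_ids': torch.zeros(8, 32, dtype=torch.long)}) == 8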
colossalai/nn/layer/parallel_1d/layers.py

@@ -71,6 +71,7 @@ class Linear1D(torch.nn.Module):
 @LAYERS.register_module
 class Classifier1D(ParallelLayer):
     """RowLinear with given weight"""

     def __init__(self,
                  in_features: int,
                  num_classes: int,

@@ -127,7 +128,7 @@ class Classifier1D(ParallelLayer):
         output_parallel = F.linear(input_, self.weight)
         output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
         if self.bias is not None:
             output = output + self.bias
         return output

@@ -152,6 +153,7 @@ class Linear1D_Col(ParallelLayer):
         which is :math:`Y_i = XA_i`, defaults to False
     :type gather_output: bool, optional
     """

     def __init__(self,
                  in_features: int,
                  out_features: int,

@@ -233,6 +235,7 @@ class Linear1D_Row(ParallelLayer):
     :param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False
     :type parallel_input: bool, optional
     """

     def __init__(self,
                  in_features: int,
                  out_features: int,

@@ -302,6 +305,7 @@ class Linear1D_Row(ParallelLayer):
 class MixedFusedLayerNorm1D(torch.nn.Module):
     """ Experimental
     """

     def __init__(self, normalized_shape, eps=1e-5):
         super(MixedFusedLayerNorm1D, self).__init__()
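The Classifier1D forward shown above follows the row-parallel pattern: each rank holds a column shard of the input and the matching shard of the weight, partial outputs are summed across ranks (reduce_input performs an all-reduce), and the bias is added once after the reduction. A minimal single-process sketch of the same math, simulating two ranks with tensor chunks instead of torch.distributed:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(4, 8)        # full input, batch of 4
w = torch.randn(5, 8)        # full weight: (num_classes, in_features)
b = torch.randn(5)

x0, x1 = x.chunk(2, dim=-1)  # each "rank" sees half the features
w0, w1 = w.chunk(2, dim=-1)  # ... and the matching weight shard

partial0 = F.linear(x0, w0)  # per-rank partial result
partial1 = F.linear(x1, w1)
out = (partial0 + partial1) + b   # reduce, then add bias exactly once

assert torch.allclose(out, F.linear(x, w, b), atol=1e-5)

Adding the bias after the reduction matters: adding it inside each rank's partial result would count it once per rank after the all-reduce.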
colossalai/nn/layer/parallel_2d/_operation.py

@@ -121,7 +121,8 @@ class classifier_2d(torch.autograd.Function):
         B_grad = torch.matmul(output_grad.reshape(-1, output_grad.shape[-1]).transpose(0, 1), A)
         B_grad = reduce_scatter(B_grad, -1, ctx.col_parallel_mode)
         B_grad = B_grad.reshape(ctx.B_shape)
         bias_grad = None
         if ctx.use_bias:
             bias_grad = torch.sum(output_grad, dim=tuple(range(output_grad.ndim - 1)))
             bias_grad = all_reduce(bias_grad, ctx.col_parallel_mode)
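Initializing bias_grad = None before the conditional is the relevant pattern here: a torch.autograd.Function backward must return one gradient per forward input, and None for inputs that were absent or need no gradient; without the default, the no-bias path would reference an unbound variable. A toy, non-distributed Function illustrating the same structure (LinearWithOptionalBias is illustrative, not the repo's class):

import torch

class LinearWithOptionalBias(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, weight, bias):
        ctx.save_for_backward(x, weight)
        ctx.use_bias = bias is not None
        out = x @ weight.t()
        return out + bias if ctx.use_bias else out

    @staticmethod
    def backward(ctx, grad_out):
        x, weight = ctx.saved_tensors
        x_grad = grad_out @ weight
        w_grad = grad_out.transpose(0, 1) @ x
        bias_grad = None                      # the fix: default before the branch
        if ctx.use_bias:
            # sum over every dim except the last, as in the 2D hunk above
            bias_grad = torch.sum(grad_out, dim=tuple(range(grad_out.ndim - 1)))
        return x_grad, w_grad, bias_grad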
colossalai/nn/layer/wrapper/pipeline_wrapper.py

@@ -38,3 +38,9 @@ class PipelineSharedModuleWrapper:
         for p in module.parameters():
             setattr(p, 'pipeline_shared_module_pg', self.group)
             dist.broadcast(p, src, group=self.group)
+
+    def register_parameter(self, param: nn.Parameter):
+        assert self.ranks_in_group is not None, \
+            f'Rank {gpc.get_local_rank(ParallelMode.PIPELINE)} is not in pipeline_ranks {self.pipeline_ranks}'
+        src = self.ranks_in_group[self.pipeline_ranks[0]]
+        setattr(param, 'pipeline_shared_module_pg', self.group)
+        dist.broadcast(param, src, group=self.group)
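The new register_parameter method extends the wrapper to a single bare nn.Parameter rather than a whole module, the classic case being an embedding weight tied to the output head on a different pipeline stage. A hypothetical usage sketch, assuming an initialized ColossalAI distributed context; the constructor argument is an assumption, not confirmed API:

import torch.nn as nn
from colossalai.nn.layer.wrapper.pipeline_wrapper import PipelineSharedModuleWrapper

# Suppose pipeline stages 0 and 3 hold the tied embedding / head weights.
wrapper = PipelineSharedModuleWrapper([0, 3])   # assumed: list of pipeline ranks

embed = nn.Embedding(50257, 768)
# Tags the parameter with the shared process group and broadcasts it from
# the first shared rank, so all stages start from identical values:
wrapper.register_parameter(embed.weight)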
colossalai/trainer/hooks/_metric_hook.py

@@ -25,6 +25,7 @@ class Metric(ABC):
     :param epoch_only: Whether the metric only read for the full epoch
     :type epoch_only: bool
     """

     def __init__(self, epoch_only: bool):
         # is the metric only read for the full epoch
         self._epoch_only = epoch_only

@@ -82,6 +83,7 @@ class LossMetric(Metric):
     :param epoch_only: Whether the metric only read for the full epoch
     :type epoch_only: bool
     """

     def __init__(self, epoch_only):
         super().__init__(epoch_only=epoch_only)
         self.last_step_loss = torch.zeros(1, device=get_current_device())

@@ -132,6 +134,7 @@ class LearningRateMetric(Metric):
     :param epoch_only: Whether the metric only read for the full epoch
     :type epoch_only: bool
     """

     def __init__(self, epoch_only: bool, initial_lr: float = 0.):
         super().__init__(epoch_only=epoch_only)
         self.lr = initial_lr

@@ -159,6 +162,7 @@ class AccuracyMetric(Metric):
     :param epoch_only: Whether the metric only read for the full epoch
     :type epoch_only: bool
     """

     def __init__(self, epoch_only: bool, accuracy_func: Callable):
         super().__init__(epoch_only=epoch_only)
         self.acc = accuracy_func

@@ -217,6 +221,7 @@ class MetricHook(BaseHook):
     :type trainer: Trainer
     :type priority: int
     """

     def __init__(self,
                  priority: int,

@@ -238,6 +243,7 @@ class LossHook(MetricHook):
     :type trainer: Trainer
     :type priority: int, optional
     """

     def __init__(self, priority: int = 0):
         super().__init__(priority)

@@ -278,6 +284,7 @@ class AccuracyHook(MetricHook):
     :type trainer: Trainer
     :type priority: int
     """

     def __init__(self, accuracy_func: Callable, priority: int = 0):
         super().__init__(priority)
         self.accuracy_func = accuracy_func

@@ -351,13 +358,17 @@ class ThroughputHook(MetricHook):
         trainer.states['metrics']['test']['Throughput'] = self.metric

     def before_train_epoch(self, trainer):
         if self._is_stage_to_compute:
             self.metric.reset()

     def after_train_iter(self, trainer, *args):
         if self._is_stage_to_compute:
             self.metric.update(trainer.schedule.batch_size, trainer._timer.get_timer('Train-step').get_elapsed_time())

     def before_test(self, trainer):
         if self._is_stage_to_compute:
             self.metric.reset()

     def after_test_iter(self, trainer, *args):
         if self._is_stage_to_compute:
             self.metric.update(trainer.schedule.batch_size, trainer._timer.get_timer('Test-step').get_elapsed_time())
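In the ThroughputHook hunk, each step feeds the schedule's batch size and the step timer's elapsed time into the metric, which accumulates both and reports samples per second. A minimal sketch of that accumulation; ThroughputTracker is a hypothetical stand-in for the metric the hook updates:

class ThroughputTracker:

    def __init__(self):
        self.reset()

    def reset(self):
        self.num_samples = 0
        self.elapsed = 0.0

    def update(self, batch_size, step_time):
        # mirrors metric.update(trainer.schedule.batch_size, <timer elapsed>)
        self.num_samples += batch_size
        self.elapsed += step_time

    def value(self):
        return self.num_samples / self.elapsed if self.elapsed > 0 else 0.0

tracker = ThroughputTracker()
tracker.update(batch_size=32, step_time=0.25)
tracker.update(batch_size=32, step_time=0.23)
print(f'{tracker.value():.1f} samples/s')   # 64 samples / 0.48 s ≈ 133.3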
model_zoo/gpt/gpt.py

@@ -133,7 +133,7 @@ class GPTBlock(CheckpointModule):
                  dtype: dtype = None,
                  bias: bool = True,
                  checkpoint: bool = False):
-        super().__init__()
+        super().__init__(checkpoint=checkpoint)
         self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=1e-6, dtype=dtype)
         self.attn = GPTSelfAttention(dim=dim, num_heads=num_heads,
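This one-line change matters because GPTBlock subclasses CheckpointModule: calling super().__init__() with no arguments silently dropped the checkpoint flag, so activation checkpointing never engaged even when requested. A minimal sketch of what the forwarded flag controls, using torch.utils.checkpoint directly; ToyBlock is illustrative, not the repo's class:

import torch
from torch.utils.checkpoint import checkpoint

class ToyBlock(torch.nn.Module):
    # When use_checkpoint is True, activations inside the block are
    # recomputed during backward instead of stored, trading compute for memory.

    def __init__(self, dim, use_checkpoint: bool = False):
        super().__init__()
        self.use_checkpoint = use_checkpoint   # the flag the GPT fix forwards
        self.ff = torch.nn.Sequential(torch.nn.Linear(dim, dim), torch.nn.GELU())

    def forward(self, x):
        if self.use_checkpoint:
            return checkpoint(self.ff, x)
        return self.ff(x)

block = ToyBlock(16, use_checkpoint=True)
y = block(torch.randn(2, 16, requires_grad=True))
y.sum().backward()   # activations inside self.ff are recomputed here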