OpenDAS / ColossalAI · Commits · 177c3744
"vscode:/vscode.git/clone" did not exist on "1a29e8fc297b8ea557dde1909a4b65f91e53b824"
Unverified commit 177c3744, authored Jun 23, 2022 by Jiarui Fang; committed by GitHub on Jun 23, 2022.
remove gather out in parallel action (#1163)
Parent: 51f1ec96
Showing 8 changed files with 43 additions and 32 deletions (+43 -32):
colossalai/nn/_ops/addmm.py             +4  -13
colossalai/nn/_ops/embedding.py         +1  -3
colossalai/nn/_ops/embedding_bag.py     +2  -3
colossalai/nn/_ops/linear.py            +1  -4
colossalai/tensor/colo_tensor.py        +29 -3
colossalai/tensor/spec.py               +2  -3
tests/test_tensor/test_linear_tp.py     +1  -0
tests/test_tensor/test_model.py         +3  -3
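Taken together, the diffs below make one API change: ParallelAction no longer carries a gather_out flag, and the 1D tensor-parallel ops unconditionally return output.to_replicate(), so callers gather explicitly when they need a replicated result. A minimal sketch of the resulting calling convention; the names pg, world_size, weight, x and bias are assumptions standing in for values the tests below obtain from an initialized ColossalAI process group, and the import paths are likewise assumed, so this fragment is not runnable standalone:

    import torch.nn.functional as F
    from colossalai.tensor import TensorSpec, distspec
    from colossalai.tensor.spec import ComputePattern, ParallelAction

    # Shard the weight across ranks; the compute pattern is all that
    # ParallelAction records after this commit.
    spec = TensorSpec(
        distspec.shard(pg, [0], [world_size]),
        ParallelAction(ComputePattern.TP1D))    # gather_out=... would now be a TypeError
    weight.set_spec(spec)

    out = F.linear(x, weight, bias)    # dispatched to colo_linear_1Dcol
    out = out.to_replicate()           # explicit gather, as the updated test below does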
colossalai/nn/_ops/addmm.py
@@ -37,10 +37,10 @@ def colo_addmm_1Dcol(input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTenso
     output_spec = TensorSpec(
         distspec.shard(mat2.spec.get_process_group(), [-1], [mat2.spec.get_process_group_size()]),
         ParallelAction(ComputePattern.TP1D))
     output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec)
-    if parallel_action.gather_out:
-        # All-Gather(Output)
-        output = output.convert_to_dist_spec(distspec.replicate(mat2.spec.get_process_group()))
-    return output
+    # TODO(jiaruifang) addam is special case
+    # since gpt call view after the Op.
+    return output.to_replicate()
 
 
 def colo_addmm_1d(mode: str, input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number,
@@ -62,11 +62,6 @@ def colo_addmm(input_tensor: GeneralTensor,
     """
     input_tensor, mat1, mat2 = tuple(map(convert_to_colo_tensor, (input_tensor, mat1, mat2)))
 
-    # building the computing graph, inputs -> op
-    # if GraphGlobalEnv().graph_building:
-    #     cur_op_node = GraphOpNode('linear', [weight, bias])
-    #     cur_op_node.add_prev_tensor(input_tensor)
-
     # Add communication logic before and after linear call.
     ret_tensor = None
     if not mat2.has_spec():    # No Model Parallel Applied
@@ -84,8 +79,4 @@
     else:
         raise NotImplementedError
-
-    # building the computing graph, op -> output
-    # if GraphGlobalEnv().graph_building:
-    #     cur_op_node.add_post_tensor(ret_tensor)
     return ret_tensor
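The TODO retained in the first hunk explains why addmm must hand back a replicated tensor: GPT calls view right after the op, and view needs the full logical shape, which a rank's local shard cannot satisfy. A runnable single-process illustration with made-up sizes:

    import torch

    full = torch.randn(4, 8)              # the logical (replicated) op output
    local_shard = full[:, :4].clone()     # one rank's slice under a [-1]-dim shard, 2 ranks

    full.view(2, 2, 8)                    # fine: 32 elements reshaped to 32
    try:
        local_shard.view(2, 2, 8)         # fails: the local shard holds only 16 elements
    except RuntimeError as err:
        print(err)                        # shape '[2, 2, 8]' is invalid for input of size 16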
colossalai/nn/_ops/embedding.py
@@ -30,9 +30,7 @@ def colo_embedding_1Dcol(input_tensor: ColoTensor,
         distspec.shard(weight.spec.get_process_group(), [-1], [weight.spec.get_process_group_size()]),
         ParallelAction(ComputePattern.TP1D))
     output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec)
-    if weight.spec.parallel_action.gather_out:
-        output = output.convert_to_dist_spec(distspec.replicate(weight.spec.get_process_group()))
-    return output
+    return output.to_replicate()
 
 
 def colo_embedding_1Drow(input_tensor: ColoTensor,
colossalai/nn/_ops/embedding_bag.py
@@ -36,9 +36,8 @@ def colo_embedding_bag_1Dcol(input_tensor: ColoTensor,
         distspec.shard(weight.spec.get_process_group(), [-1], [weight.spec.get_process_group_size()]),
         ParallelAction(ComputePattern.TP1D))
     output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec)
-    if weight.spec.parallel_action.gather_out:
-        output = output.convert_to_dist_spec(distspec.replicate(weight.spec.get_process_group()))
-    return output
+    return output.to_replicate()
 
 
 def colo_embedding_bag_1d(tp_mode: str,
colossalai/nn/_ops/linear.py
@@ -42,10 +42,7 @@ def colo_linear_1Dcol(input_tensor: ColoTensor, weight: ColoTensor, bias: Option
         distspec.shard(weight.spec.get_process_group(), [-1], [weight.spec.get_process_group_size()]),
         ParallelAction(ComputePattern.TP1D)))
-    if parallel_action.gather_out:
-        # All-Gather(Output)
-        output = output.convert_to_dist_spec(distspec.replicate(weight.spec.get_process_group()))
-    return output
+    return output.to_replicate()
 
 
 def colo_linear_1d(mode: str, input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor':
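For intuition about what the removed branch and the new return compute, here is a runnable single-process simulation (made-up shapes, no real process group) of 1D column-parallel linear: each weight shard yields a slice of the output, and concatenating the slices along the last dim, the effect of the all-gather behind to_replicate(), matches the unsharded F.linear:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    x = torch.randn(2, 4)    # input, replicated on every rank
    w = torch.randn(6, 4)    # full weight (out_features=6, in_features=4)
    b = torch.randn(6)

    shards = [(w[0:3], b[0:3]), (w[3:6], b[3:6])]           # 2-way column shard
    partials = [F.linear(x, wi, bi) for wi, bi in shards]   # each rank's local output
    gathered = torch.cat(partials, dim=-1)                  # what the all-gather produces

    assert torch.allclose(gathered, F.linear(x, w, b))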
colossalai/tensor/colo_tensor.py
@@ -92,10 +92,13 @@ class ColoTensor(torch.Tensor):
     def __repr__(self):
         return f'ColoTensor: {super().__repr__()}'
 
     def is_model_data(self) -> bool:
         return self._type == TensorType.MODEL
 
     def _convert_to_dist_spec(self, dist_spec: _DistSpec) -> None:
+        """_convert_to_dist_spec
+        Note the function will not handle the logic of backward propagation!
+        It is used during model tensor initializations as an internal function.
+
+        Args:
+            dist_spec (_DistSpec): the target dist. spec.
+        """
         with DistSpecManager.no_grad():
             self.data = DistSpecManager.handle_trans_spec(self, self.spec.dist_spec, dist_spec)
         self._tensor_spec.dist_spec = dist_spec
@@ -106,6 +109,19 @@ class ColoTensor(torch.Tensor):
         ret = DistSpecManager.handle_trans_spec(self, self.spec.dist_spec, dist_spec)
         return ColoTensor.from_torch_tensor(ret, tensor_spec)
 
+    def to_replicate_(self):
+        """to_replicate_
+        an inline member function, converting dist spec of the tensor to REPLICATE
+        """
+        self.data = DistSpecManager.handle_trans_spec(self, self.spec.dist_spec, distspec.replicate())
+        self._tensor_spec.dist_spec = distspec.replicate()
+
+    def to_replicate(self) -> 'ColoTensor':
+        """to_replicate
+        converting dist spec of the tensor to REPLICATE
+        """
+        return self.convert_to_dist_spec(distspec.replicate(self.spec.get_process_group()))
+
     @staticmethod
     def from_torch_tensor(tensor: torch.Tensor, spec: TensorSpec = TensorSpec(distspec.replicate())) -> 'ColoTensor':
         tensor = tensor.as_subclass(ColoTensor)
@@ -121,3 +137,13 @@ class ColoTensor(torch.Tensor):
         tensor = ColoTensor(data, spec=copy(self.spec))
         memo[id(self)] = tensor
         return tensor
+
+    # TODO(jiaruifang) a patch for gpt test.
+    # We need to override the member function must operate on a replicated tensor
+    # def view(self, *args, **kwargs):
+    #     self.data = DistSpecManager.handle_trans_spec(self,
+    #                                                   self.spec.dist_spec,
+    #                                                   distspec.replicate(self.spec.get_process_group()))
+    #     # self._tensor_spec.dist_spec = distspec.replicate(self.spec.get_process_group())
+    #     self.data.view(*args, **kwargs)
+    #     return ColoTensor.from_torch_tensor(self.data)
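The two new methods differ only in mutation semantics; note also that to_replicate_ builds distspec.replicate() without a process group, while to_replicate passes self.spec.get_process_group(). A usage sketch, where sharded_spec is a hypothetical placeholder and an initialized process group is assumed, so this is not runnable standalone:

    import torch

    t = ColoTensor.from_torch_tensor(torch.randn(4, 4), spec=sharded_spec)

    r = t.to_replicate()    # returns a new replicated ColoTensor; t keeps its shard spec
    t.to_replicate_()       # in-place: t.data is gathered, t's dist spec becomes REPLICATE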
colossalai/tensor/spec.py
@@ -13,13 +13,12 @@ class ComputePattern(Enum):
 
 class ParallelAction(object):
 
-    def __init__(self, compute_pattern: ComputePattern, gather_out: bool = True) -> None:
+    def __init__(self, compute_pattern: ComputePattern) -> None:
         assert isinstance(compute_pattern, ComputePattern)
         self.compute_pattern = compute_pattern
-        self.gather_out = gather_out
 
     def __repr__(self):
-        return f'compute pattern: {self.compute_pattern}, gather out: {self.gather_out}'
+        return f'compute pattern: {self.compute_pattern}'
 
 
 class TensorSpec(object):
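One consequence of the narrowed constructor: call sites that still pass gather_out now fail loudly instead of silently changing behavior. A small sketch of the new surface (imports assumed from colossalai.tensor.spec):

    from colossalai.tensor.spec import ComputePattern, ParallelAction

    action = ParallelAction(ComputePattern.TP1D)
    print(action)    # compute pattern: ComputePattern.TP1D

    try:
        ParallelAction(ComputePattern.TP1D, gather_out=False)   # the removed keyword
    except TypeError as err:
        print(err)   # __init__() got an unexpected keyword argument 'gather_out'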
tests/test_tensor/test_linear_tp.py
@@ -41,6 +41,7 @@ def run_with_spec(spec_init_func):
     x = torch.rand(2, 4).cuda()
     out = model(x)
     colo_out = F.linear(x, weight, bias)
+    colo_out = colo_out.to_replicate()
     assert tensor_equal(out, colo_out)
     grad = torch.rand_like(out)
     out.backward(grad)
tests/test_tensor/test_model.py
@@ -26,10 +26,10 @@ def init_1d_row_linear(weight):
         weight.set_spec(spec)
 
 
-def init_1d_col_linear(weight, gather_out=True):
+def init_1d_col_linear(weight):
     spec = TensorSpec(
         distspec.shard(gpc.get_group(ParallelMode.PARALLEL_1D), [0],
                        [gpc.get_world_size(ParallelMode.PARALLEL_1D)]),
-        ParallelAction(ComputePattern.TP1D, gather_out=gather_out))
+        ParallelAction(ComputePattern.TP1D))
     with DistSpecManager.no_grad():
         weight.set_spec(spec)
@@ -98,7 +98,7 @@ def run_1d_hybrid_tp(model_name):
         if 'proj2' in name and 'weight' in name:
             init_1d_row_linear(p)
         if 'classifier' in name and ('weight' in name or 'bias' in name):
-            init_1d_col_linear(p, gather_out=False)
+            init_1d_col_linear(p)
 
     model = model.cuda()
     colo_optimizer = ColoOptimizer(dict(model.named_parameters()), torch.optim.SGD, lr=0.1)