evt_fugx1 / dcu_megatron / Commits / 53627040

Commit 53627040, authored Apr 15, 2025 by dongcl

bug fix

Parent: 409cdfef
Showing 2 changed files with 10 additions and 10 deletions (+10 -10):

  dcu_megatron/adaptor/megatron_adaptor.py       +5 -5
  dcu_megatron/core/tensor_parallel/layers.py    +5 -5
dcu_megatron/adaptor/megatron_adaptor.py

...
@@ -207,11 +207,11 @@ class CoreAdaptation(MegatronAdaptationABC):
                                     apply_wrapper=True)
         MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward",
                                     ColumnParallelLinearPatch.forward)
-        # MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__",
-        #                             row_parallel_linear_init_wrapper,
-        #                             apply_wrapper=True)
-        # MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
-        #                             RowParallelLinearPatch.forward)
+        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__",
+                                    row_parallel_linear_init_wrapper,
+                                    apply_wrapper=True)
+        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
+                                    RowParallelLinearPatch.forward)

     def patch_training(self):
         from ..training.tokenizer import build_tokenizer
...
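The call sites above rely on MegatronAdaptation resolving a dotted import path and monkey-patching the named attribute; the registry itself is not part of this commit. The sketch below is a hypothetical stand-in (PatchRegistry) showing one way such a mechanism can work, where apply_wrapper=True is read as "the registered object decorates the original callable" (as with row_parallel_linear_init_wrapper) and the default replaces the attribute outright (as with ColumnParallelLinearPatch.forward).

    import importlib

    class PatchRegistry:
        """Hypothetical stand-in for MegatronAdaptation; the real class is not shown in this diff."""

        _pending = []  # (dotted_path, replacement, apply_wrapper)

        @classmethod
        def register(cls, dotted_path, replacement, apply_wrapper=False):
            cls._pending.append((dotted_path, replacement, apply_wrapper))

        @classmethod
        def apply(cls):
            for dotted_path, replacement, apply_wrapper in cls._pending:
                parts = dotted_path.split(".")
                # Import the longest importable module prefix, then walk the remaining
                # attributes (module -> class -> method).
                for i in range(len(parts) - 1, 0, -1):
                    try:
                        target = importlib.import_module(".".join(parts[:i]))
                    except ImportError:
                        continue
                    break
                else:
                    raise ImportError(f"no importable prefix in {dotted_path}")
                *owners, attr = parts[i:]
                for name in owners:
                    target = getattr(target, name)
                original = getattr(target, attr)
                # apply_wrapper=True: treat `replacement` as a decorator around the
                # original callable; otherwise install it as a straight replacement.
                setattr(target, attr, replacement(original) if apply_wrapper else replacement)

Under that reading, the change above simply promotes the two RowParallelLinear registrations from comments to live code, so the patched __init__ wrapper and forward are actually installed alongside the ColumnParallelLinear patch.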
dcu_megatron/core/tensor_parallel/layers.py

...
@@ -546,7 +546,9 @@ class LinearRS(torch.autograd.Function):
             grad_output_buffer.append(grad_output)
             wgrad_compute = False
+        if wgrad:
+            world_size = get_tensor_model_parallel_world_size()

         if wgrad_compute:
             if ctx.sequence_parallel:
                 dim_size = list(grad_output.size())
                 dim_size[0] = dim_size[0] * world_size
...
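The two added lines compute the tensor-parallel world size up front whenever the weight-gradient path will need it, instead of relying on it having been assigned inside a particular branch further down. The hazard being avoided is the usual conditionally-assigned-local problem: a name assigned only under one condition but read under another raises UnboundLocalError at runtime. A toy illustration, with hypothetical flags standing in for wgrad / ctx.sequence_parallel and a lambda standing in for get_tensor_model_parallel_world_size():

    def scale_grad_dim(dim0, wgrad_compute, sequence_parallel, world_size_fn):
        # Toy illustration only; names are placeholders, not the LinearRS code.
        # Buggy shape: the assignment sits inside one branch...
        #     if some_condition:
        #         world_size = world_size_fn()
        # ...while a later branch reads it under a different condition, which raises
        # UnboundLocalError whenever the read is reached but the assignment was skipped.
        # Fixed shape: assign on every path that can reach the read.
        world_size = world_size_fn()
        if wgrad_compute and sequence_parallel:
            dim0 = dim0 * world_size
        return dim0

    print(scale_grad_dim(8, True, True, lambda: 4))   # 32
    print(scale_grad_dim(8, True, False, lambda: 4))  # 8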
@@ -565,12 +567,10 @@ class LinearRS(torch.autograd.Function):
         total_grad_output = grad_output

         if ctx.sequence_parallel:
-            world_size = get_tensor_model_parallel_world_size()
             sequence_len, batch_size, output_hidden_size = grad_output.size()
             input_hidden_size = weight.size(-1)

-            if bw_gemm_rs_op is None:
+            if bw_ag_gemm_op is None:
                 bw_ag_gemm_op = flux.AGKernel(
                     get_tensor_model_parallel_group(),
                     1,  #world_size // torch.cuda.device_count(),
...
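The visible fix in this hunk is that the lazy-initialization guard now tests the same global it assigns: the old code checked bw_gemm_rs_op is None but then built and stored bw_ag_gemm_op, so the flux.AGKernel cache was keyed on the wrong name and the kernel could be rebuilt on every backward pass (or never rebuilt when a stale bw_gemm_rs_op masked the check). A minimal sketch of the corrected pattern, with a placeholder build_ag_kernel() standing in for the flux.AGKernel construction (whose real arguments are only partially shown above):

    _bw_ag_gemm_op = None  # module-level cache, mirroring the global used in layers.py

    def get_bw_ag_gemm_op(build_ag_kernel):
        """Return the cached kernel, building it on first use."""
        global _bw_ag_gemm_op
        # The guard and the assignment must refer to the same cached object;
        # checking a different global (as the old `bw_gemm_rs_op` test did)
        # defeats the cache.
        if _bw_ag_gemm_op is None:
            _bw_ag_gemm_op = build_ag_kernel()
        return _bw_ag_gemm_op

    # Example: the expensive constructor runs only once across repeated calls.
    calls = []
    op1 = get_bw_ag_gemm_op(lambda: calls.append("built") or object())
    op2 = get_bw_ag_gemm_op(lambda: calls.append("built") or object())
    assert op1 is op2 and calls == ["built"]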
@@ -1013,7 +1013,7 @@ class RowParallelLinearPatch(torch.nn.Module):
             assert HAS_FLUX, "flux is NOT installed"
             sequence_len, batch_size, input_hidden_size = input_parallel.size()
-            output_hidden_size = weight.size(0)
+            output_hidden_size = self.weight.size(0)
             world_size = get_tensor_model_parallel_world_size()

             if self.sequence_parallel:
                 current_flux_params = (
...
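The last hunk swaps a bare weight for self.weight when reading the output hidden size. Unless a local named weight happens to be bound earlier in RowParallelLinearPatch.forward, the bare name raises NameError (or silently resolves to an unrelated module-level object); the parameter lives on the module instance. A toy module, assuming nothing about the real RowParallelLinearPatch beyond that it owns a weight parameter, shows the fixed lookup:

    import torch

    class ToyRowParallel(torch.nn.Module):
        # Toy stand-in; only the attribute lookup in forward() matters here.
        def __init__(self, input_hidden_size, output_hidden_size):
            super().__init__()
            self.weight = torch.nn.Parameter(
                torch.empty(output_hidden_size, input_hidden_size))

        def forward(self, input_parallel):
            sequence_len, batch_size, _ = input_parallel.size()
            # Old line: output_hidden_size = weight.size(0)  -> NameError: `weight`
            # is not a local; the parameter is an attribute of the module.
            output_hidden_size = self.weight.size(0)
            return input_parallel.new_empty(sequence_len, batch_size, output_hidden_size)

    layer = ToyRowParallel(input_hidden_size=16, output_hidden_size=8)
    out = layer(torch.randn(4, 2, 16))
    assert out.shape == (4, 2, 8)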