OpenDAS / ColossalAI · Commits

Commit 049121d1 (Unverified)
Authored Mar 05, 2024 by digger yu; committed by GitHub on Mar 05, 2024
Parent: 16c96d4d

[hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
Showing 8 changed files with 16 additions and 16 deletions (+16 -16)
colossalai/shardformer/layer/_operation.py                       +2 -2
colossalai/shardformer/layer/normalization.py                    +4 -4
colossalai/shardformer/modeling/bloom.py                         +1 -1
colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py  +3 -3
colossalai/shardformer/modeling/gptj.py                          +2 -2
colossalai/shardformer/modeling/llama.py                         +2 -2
colossalai/shardformer/modeling/opt.py                           +1 -1
colossalai/shardformer/modeling/t5.py                            +1 -1
colossalai/shardformer/layer/_operation.py

@@ -173,7 +173,7 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     Args:
         input_ (`torch.Tensor`): The input tensor from sequence parallel region.
         process_group (`torch.distributed.ProcessGroup`): The process group used for collective communication.
-        overlap (`bool`): Whther to overlap the all_gather op and gradient calculate in backward.
+        overlap (`bool`): Whether to overlap the all_gather op and gradient calculate in backward.
     """

@@ -534,7 +534,7 @@ class HookParameter(torch.autograd.Function):
         return grad_output, None, None

-def hook_paramter_in_backward(input, weight=None, bias=None):
+def hook_parameter_in_backward(input, weight=None, bias=None):
     return HookParameter.apply(input, weight, bias)
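Note (not part of the commit): the second hunk renames the thin wrapper around the HookParameter autograd function whose backward is shown above returning grad_output, None, None. Below is a minimal sketch of that wrapper pattern; the forward/backward bodies beyond what the diff shows are assumptions, not the actual ColossalAI implementation.

import torch


class HookParameter(torch.autograd.Function):
    """Sketch: pass the activation through unchanged while registering weight/bias
    as inputs of the autograd graph, so backward can react to them."""

    @staticmethod
    def forward(ctx, input, weight=None, bias=None):
        ctx.save_for_backward(weight, bias)  # assumed; not shown in the diff
        return input

    @staticmethod
    def backward(ctx, grad_output):
        weight, bias = ctx.saved_tensors
        # a real implementation could post-process the weight/bias gradients here
        return grad_output, None, None  # matches the context line shown in the diff


def hook_parameter_in_backward(input, weight=None, bias=None):
    return HookParameter.apply(input, weight, bias)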
colossalai/shardformer/layer/normalization.py

@@ -7,7 +7,7 @@ import torch.nn as nn
 from colossalai.lazy import LazyInitContext

-from ._operation import hook_paramter_in_backward
+from ._operation import hook_parameter_in_backward
 from .utils import SeqParallelUtils

 __all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"]

@@ -29,7 +29,7 @@ try:
         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight, self.bias)
+            output = hook_parameter_in_backward(output, self.weight, self.bias)
             return output

     class FusedRMSNormWithHook(ApexFusedRMSNorm):

@@ -38,7 +38,7 @@ try:
         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight)
+            output = hook_parameter_in_backward(output, self.weight)
             return output

 except ImportError:

@@ -79,7 +79,7 @@ if EnableFastLayerNorm:
         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight, self.bias)
+            output = hook_parameter_in_backward(output, self.weight, self.bias)
             return output
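Note (not part of the commit): all four hunks follow the same pattern, routing the fused norm's output through hook_parameter_in_backward together with the layer's parameters. A minimal sketch of that pattern, with plain nn.LayerNorm standing in for the Apex/fast fused kernels used in the real file:

import torch.nn as nn

from colossalai.shardformer.layer._operation import hook_parameter_in_backward


class LayerNormWithHook(nn.LayerNorm):
    # nn.LayerNorm as a stand-in for ApexFusedLayerNorm / FastLayerNorm
    def forward(self, input):
        output = super().forward(input)
        # hook weight/bias into the backward graph of this output
        output = hook_parameter_in_backward(output, self.weight, self.bias)
        return output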
colossalai/shardformer/modeling/bloom.py

@@ -699,7 +699,7 @@ class BloomPipelineForwards:
         return {"hidden_states": hidden_states}

-def get_bloom_flash_attention_forward(enabel_jit_fused=False):
+def get_bloom_flash_attention_forward(enable_jit_fused=False):
     try:
         from xformers.ops import memory_efficient_attention as me_attention
     except:
colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py

@@ -181,7 +181,7 @@ class RotaryEmbedding(nn.Module):
         cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

-        # this is to mimic the behaviour of complex32, else we will get different results
+        # this is to mimic the behavior of complex32, else we will get different results
         if dtype in (torch.float16, torch.bfloat16, torch.int8):
             cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
         return cache

@@ -290,7 +290,7 @@ class CoreAttention(torch.nn.Module):
         # [sk, b, np, hn] -> [sk, b * np, hn]
         key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)

-        # preallocting input tensor: [b * np, sq, sk]
+        # preallocating input tensor: [b * np, sq, sk]
         matmul_input_buffer = torch.empty(
             output_size[0] * output_size[1],
             output_size[2],

@@ -1289,7 +1289,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         if has_default_max_length and generation_config.max_new_tokens is None:
             warnings.warn(
                 f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+                "This behavior is deprecated and will be removed from the config in v5 of Transformers -- we"
                 " recommend using `max_new_tokens` to control the maximum length of the generation.",
                 UserWarning,
             )
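Note (not part of the commit): the first hunk lives in RotaryEmbedding, where a cos/sin cache is stacked along the last dimension and then, per the corrected comment, downcast for reduced-precision dtypes to mimic complex32 behavior. A standalone sketch of that cache construction follows; the frequency setup (idx_theta) is assumed for illustration, only the stack-and-downcast part comes from the diff.

import torch


def build_rotary_cache(seq_len: int, dim: int, dtype=torch.float16, base: float = 10000.0):
    # assumed setup: standard rotary frequencies (not shown in the diff)
    theta = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    idx_theta = torch.outer(torch.arange(seq_len).float(), theta)  # [seq_len, dim // 2]

    # from the diff: pair cos and sin along the last dimension
    cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

    # from the diff: mimic the behavior of complex32 for reduced-precision dtypes,
    # else we will get different results
    if dtype in (torch.float16, torch.bfloat16, torch.int8):
        cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
    return cache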
colossalai/shardformer/modeling/gptj.py

@@ -122,7 +122,7 @@ class GPTJPipelineForwards:
         # head_mask has shape n_layer x batch x num_attention_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.n_layer)

-        # position id to be asssigned not just for the first stage for attn input
+        # position id to be assigned not just for the first stage for attn input
         if position_ids is not None:
             position_ids = position_ids.view(-1, seq_length)
         else:

@@ -593,7 +593,7 @@ def get_gptj_flash_attention_forward():
         # key = key.permute(0, 2, 1, 3)
         # query = query.permute(0, 2, 1, 3)

-        key = key.to(dtype=value.dtype)  # fp16 compatability
+        key = key.to(dtype=value.dtype)  # fp16 compatibility
         query = query.to(dtype=value.dtype)

         if layer_past is not None:
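Note (not part of the commit): the second hunk only respells the "fp16 compatibility" comment. The casts themselves keep query and key in the same dtype as value, presumably so the downstream attention kernel sees a single dtype; a tiny sketch of that alignment step (function name is illustrative, not from the source):

import torch


def align_qkv_dtypes(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
    # fp16 compatibility: cast q/k to v's dtype before the attention kernel
    key = key.to(dtype=value.dtype)
    query = query.to(dtype=value.dtype)
    return query, key, value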
colossalai/shardformer/modeling/llama.py

@@ -225,13 +225,13 @@ class LlamaPipelineForwards:
         >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

-        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")

         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
         logger = logging.get_logger(__name__)

         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
colossalai/shardformer/modeling/opt.py

@@ -123,7 +123,7 @@ class OPTPipelineForwards:
         else:
             if hidden_states is None:
-                raise ValueError("hidden_states shouln't be None for intermediate stages.")
+                raise ValueError("hidden_states shouldn't be None for intermediate stages.")
             input_shape = hidden_states.size()[:-1]
             batch_size, seq_length = input_shape[0], input_shape[1]
             device = hidden_states.device
colossalai/shardformer/modeling/t5.py

@@ -77,7 +77,7 @@ class T5PipelineForwards:
         if in_decoder != (stage >= decoder_starting_stage):
             raise ValueError("Config in T5Stack is not aligned with pipeline setting.")

-        # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embedds
+        # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embeds
         # at_last_stage: current stage is the last stage of encoder/decoder, making outputs the same form as huggingface
         at_first_stage = (stage == 0) or (stage == decoder_starting_stage)
         at_last_stage = (stage == decoder_starting_stage - 1) or (stage == stage_manager.num_stages - 1)