Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
1a60dc07
Unverified
Commit
1a60dc07
authored
Apr 28, 2023
by
tanitna
Committed by
GitHub
Apr 28, 2023
Browse files
[chat] typo accimulation_steps -> accumulation_steps (#3662)
parent
816add7e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
18 additions
and
18 deletions
+18
-18
applications/Chat/README.md
applications/Chat/README.md
+4
-4
applications/Chat/coati/trainer/sft.py
applications/Chat/coati/trainer/sft.py
+8
-8
applications/Chat/examples/README.md
applications/Chat/examples/README.md
+1
-1
applications/Chat/examples/community/peft/train_peft_sft.py
applications/Chat/examples/community/peft/train_peft_sft.py
+2
-2
applications/Chat/examples/train_sft.py
applications/Chat/examples/train_sft.py
+2
-2
applications/Chat/examples/train_sft.sh
applications/Chat/examples/train_sft.sh
+1
-1
No files found.
applications/Chat/README.md
View file @
1a60dc07
...
...
@@ -251,7 +251,7 @@ trainer = SFTTrainer(model=model,
eval_dataloader
=
eval_dataloader
,
batch_size
=
args
.
batch_size
,
max_epochs
=
args
.
max_epochs
,
acc
i
mulation_steps
=
args
.
acc
i
mulation_steps
acc
u
mulation_steps
=
args
.
acc
u
mulation_steps
)
trainer
.
fit
()
...
...
@@ -278,7 +278,7 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
--acc
i
mulation_steps 8 \
--acc
u
mulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
...
...
@@ -296,7 +296,7 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
--acc
i
mulation_steps 8 \
--acc
u
mulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
...
...
@@ -313,7 +313,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
--acc
i
mulation_steps 8 \
--acc
u
mulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
...
...
applications/Chat/coati/trainer/sft.py
View file @
1a60dc07
...
...
@@ -41,10 +41,10 @@ class SFTTrainer(Trainer):
train_dataloader
:
DataLoader
,
eval_dataloader
:
DataLoader
=
None
,
max_epochs
:
int
=
2
,
acc
i
mulation_steps
:
int
=
8
,
acc
u
mulation_steps
:
int
=
8
,
callbacks
:
List
[
Callback
]
=
[],
)
->
None
:
if
acc
i
mulation_steps
>
1
and
isinstance
(
strategy
,
ColossalAIStrategy
)
and
strategy
.
stage
==
3
:
if
acc
u
mulation_steps
>
1
and
isinstance
(
strategy
,
ColossalAIStrategy
)
and
strategy
.
stage
==
3
:
raise
ValueError
(
"Accumulation steps are not supported in stage 3 of ColossalAI"
)
super
().
__init__
(
strategy
,
max_epochs
,
callbacks
=
callbacks
)
self
.
train_dataloader
=
train_dataloader
...
...
@@ -52,8 +52,8 @@ class SFTTrainer(Trainer):
self
.
model
=
model
self
.
optimizer
=
optim
self
.
acc
i
mulation_steps
=
acc
i
mulation_steps
num_update_steps_per_epoch
=
len
(
train_dataloader
)
//
self
.
acc
i
mulation_steps
self
.
acc
u
mulation_steps
=
acc
u
mulation_steps
num_update_steps_per_epoch
=
len
(
train_dataloader
)
//
self
.
acc
u
mulation_steps
max_steps
=
math
.
ceil
(
self
.
max_epochs
*
num_update_steps_per_epoch
)
self
.
scheduler
=
get_scheduler
(
"cosine"
,
...
...
@@ -67,7 +67,7 @@ class SFTTrainer(Trainer):
wandb
.
watch
(
self
.
model
)
total_loss
=
0
# epoch_bar = tqdm(range(self.epochs), desc='Epochs', disable=not is_rank_0())
step_bar
=
tqdm
(
range
(
len
(
self
.
train_dataloader
)
//
self
.
acc
i
mulation_steps
*
self
.
max_epochs
),
step_bar
=
tqdm
(
range
(
len
(
self
.
train_dataloader
)
//
self
.
acc
u
mulation_steps
*
self
.
max_epochs
),
desc
=
f
'steps'
,
disable
=
not
is_rank_0
())
for
epoch
in
range
(
self
.
max_epochs
):
...
...
@@ -85,20 +85,20 @@ class SFTTrainer(Trainer):
if
loss
>=
2.5
and
is_rank_0
():
logger
.
warning
(
f
"batch_id:
{
batch_id
}
, abnormal loss:
{
loss
}
"
)
loss
=
loss
/
self
.
acc
i
mulation_steps
loss
=
loss
/
self
.
acc
u
mulation_steps
self
.
strategy
.
backward
(
loss
,
self
.
model
,
self
.
optimizer
)
total_loss
+=
loss
.
item
()
# gradient accumulation
if
(
batch_id
+
1
)
%
self
.
acc
i
mulation_steps
==
0
:
if
(
batch_id
+
1
)
%
self
.
acc
u
mulation_steps
==
0
:
self
.
strategy
.
optimizer_step
(
self
.
optimizer
)
self
.
optimizer
.
zero_grad
()
self
.
scheduler
.
step
()
if
is_rank_0
()
and
use_wandb
:
wandb
.
log
({
"loss"
:
total_loss
/
self
.
acc
i
mulation_steps
,
"loss"
:
total_loss
/
self
.
acc
u
mulation_steps
,
"lr"
:
self
.
scheduler
.
get_last_lr
()[
0
],
"epoch"
:
epoch
,
"batch_id"
:
batch_id
...
...
applications/Chat/examples/README.md
View file @
1a60dc07
...
...
@@ -62,7 +62,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 4 \
--acc
i
mulation_steps 8 \
--acc
u
mulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
...
...
applications/Chat/examples/community/peft/train_peft_sft.py
View file @
1a60dc07
...
...
@@ -154,7 +154,7 @@ def train(args):
eval_dataloader
=
eval_dataloader
,
batch_size
=
args
.
batch_size
,
max_epochs
=
args
.
max_epochs
,
acc
i
mulation_steps
=
args
.
acc
i
mulation_steps
)
acc
u
mulation_steps
=
args
.
acc
u
mulation_steps
)
trainer
.
fit
(
logger
=
logger
,
log_interval
=
args
.
log_interval
)
...
...
@@ -183,7 +183,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
'--lora_rank'
,
type
=
int
,
default
=
0
,
help
=
"low-rank adaptation matrices rank"
)
parser
.
add_argument
(
'--log_interval'
,
type
=
int
,
default
=
100
,
help
=
"how many steps to log"
)
parser
.
add_argument
(
'--lr'
,
type
=
float
,
default
=
5e-6
)
parser
.
add_argument
(
'--acc
i
mulation_steps'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--acc
u
mulation_steps'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--enable_peft_lora'
,
action
=
'store_true'
,
default
=
False
)
parser
.
add_argument
(
"--is_short_text"
,
action
=
'store_true'
,
default
=
False
)
args
=
parser
.
parse_args
()
...
...
applications/Chat/examples/train_sft.py
View file @
1a60dc07
...
...
@@ -159,7 +159,7 @@ def train(args):
train_dataloader
=
train_dataloader
,
eval_dataloader
=
eval_dataloader
,
max_epochs
=
args
.
max_epochs
,
acc
i
mulation_steps
=
args
.
acc
i
mulation_steps
)
acc
u
mulation_steps
=
args
.
acc
u
mulation_steps
)
trainer
.
fit
(
logger
=
logger
,
use_wandb
=
args
.
use_wandb
)
...
...
@@ -189,7 +189,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
'--lora_rank'
,
type
=
int
,
default
=
0
,
help
=
"low-rank adaptation matrices rank"
)
parser
.
add_argument
(
'--log_interval'
,
type
=
int
,
default
=
100
,
help
=
"how many steps to log"
)
parser
.
add_argument
(
'--lr'
,
type
=
float
,
default
=
5e-6
)
parser
.
add_argument
(
'--acc
i
mulation_steps'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--acc
u
mulation_steps'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--use_wandb'
,
default
=
False
,
action
=
'store_true'
)
parser
.
add_argument
(
'--grad_checkpoint'
,
default
=
False
,
action
=
'store_true'
)
args
=
parser
.
parse_args
()
...
...
applications/Chat/examples/train_sft.sh
View file @
1a60dc07
...
...
@@ -6,7 +6,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--save_path
/path/to/Coati-7B
\
--dataset
/path/to/data.json
\
--batch_size
4
\
--acc
i
mulation_steps
8
\
--acc
u
mulation_steps
8
\
--lr
2e-5
\
--max_datasets_size
512
\
--max_epochs
1
\
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment