Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
df1d6dc5
Unverified
Commit
df1d6dc5
authored
Jan 03, 2023
by
ZijianYY
Committed by
GitHub
Jan 03, 2023
Browse files
[examples] using args and combining two versions for PaLM (#2284)
parent
e94c79f1
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
97 additions
and
44 deletions
+97
-44
examples/language/palm/palm_config.py
examples/language/palm/palm_config.py
+0
-6
examples/language/palm/run.sh
examples/language/palm/run.sh
+11
-1
examples/language/palm/train.py
examples/language/palm/train.py
+86
-37
No files found.
examples/language/palm/palm_config.py
deleted
100644 → 0
View file @
e94c79f1
SEQ_LENGTH
=
1024
BATCH_SIZE
=
4
NUM_EPOCHS
=
4
TPDEGREE
=
2
USE_SHARD_INIT
=
False
placement
=
'cpu'
\ No newline at end of file
examples/language/palm/run.sh
View file @
df1d6dc5
env
OMP_NUM_THREADS
=
12 torchrun
--nproc_per_node
4
--master_port
29501 train.py
--config
palm_config.py
# distplan in ["colossalai", "pytorch"]
export
DISTPAN
=
"colossalai"
# The following options are only valid when DISTPAN="colossalai"
export
TPDEGREE
=
1
export
GPUNUM
=
1
export
PLACEMENT
=
'cpu'
export
USE_SHARD_INIT
=
False
export
BATCH_SIZE
=
4
env
OMP_NUM_THREADS
=
12 torchrun
--standalone
--nproc_per_node
=
${
GPUNUM
}
--master_port
29501 train_new.py
--tp_degree
=
${
TPDEGREE
}
--batch_size
=
${
BATCH_SIZE
}
--placement
${
PLACEMENT
}
--shardinit
${
USE_SHARD_INIT
}
--distplan
${
DISTPAN
}
2>&1 |
tee
run.log
\ No newline at end of file
examples/language/palm/train.py
View file @
df1d6dc5
...
@@ -21,19 +21,51 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
...
@@ -21,19 +21,51 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
# constants
# constants
NUM_BATCHES
=
int
(
20
)
NUM_BATCHES
=
int
(
1000
)
BATCH_SIZE
=
4
GRADIENT_ACCUMULATE_EVERY
=
1
GRADIENT_ACCUMULATE_EVERY
=
1
LEARNING_RATE
=
2e-4
LEARNING_RATE
=
2e-4
VALIDATE_EVERY
=
100
VALIDATE_EVERY
=
100
GENERATE_EVERY
=
500
GENERATE_EVERY
=
500
GENERATE_LENGTH
=
512
GENERATE_LENGTH
=
512
SEQ_LEN
=
1024
SEQ_LEN
=
1024
TPDEGREE
=
1
USE_SHARD_INIT
=
False
placement
=
'cpu'
def str2bool(value):
    """Convert a command-line token such as "True", "false" or "0" to a bool.

    ``argparse`` with ``type=bool`` applies ``bool()`` to the raw string, so
    every non-empty value -- including the literal text "False" -- is parsed
    as ``True``.  This converter interprets the text instead.

    Args:
        value: The raw option value (a string from the command line, or an
            actual bool).

    Returns:
        The boolean the text stands for.

    Raises:
        ValueError: If the text is not a recognised boolean spelling.  When
            used as an argparse ``type=``, argparse turns this into its usual
            "invalid value" usage error.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise ValueError(f"expected a boolean value, got {value!r}")


def parse_args():
    """Build the command-line parser for this script and parse the arguments.

    Starts from ``colossalai.get_default_parser()`` and adds the options
    ``--distplan``, ``--tp_degree``, ``--placement``, ``--shardinit`` and
    ``--batch_size``.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    parser = colossalai.get_default_parser()
    parser.add_argument(
        "--distplan",
        type=str,
        default='colossalai',
        help="The distributed plan [colossalai, pytorch].",
    )
    parser.add_argument(
        "--tp_degree",
        type=int,
        default=1,
        help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--placement",
        type=str,
        default='cpu',
        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--shardinit",
        # Not type=bool: bool("False") is True, which would silently enable
        # shard-init for `--shardinit False`.
        type=str2bool,
        default=False,
        help=
        "Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=8,
        help="batch size per DP group of training.",
    )
    args = parser.parse_args()
    return args
# helpers
# helpers
def
cycle
(
loader
):
def
cycle
(
loader
):
while
True
:
while
True
:
...
@@ -73,22 +105,11 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
...
@@ -73,22 +105,11 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
return
model
return
model
# instantiate GPT-like decoder model
args
=
parse_args
()
if
args
.
distplan
not
in
[
"colossalai"
,
"pytorch"
]:
parser
=
colossalai
.
get_default_parser
()
raise
TypeError
(
f
"
{
args
.
distplan
}
is error"
)
args
=
parser
.
parse_args
()
disable_existing_loggers
()
disable_existing_loggers
()
colossalai
.
launch_from_torch
(
config
=
args
.
config
,
seed
=
42
)
colossalai
.
launch_from_torch
(
config
=
{})
# instantiate GPT-like decoder model
default_pg
=
ProcessGroup
(
tp_degree
=
TPDEGREE
)
default_dist_spec
=
ShardSpec
([
-
1
],
[
TPDEGREE
])
if
USE_SHARD_INIT
else
None
ctx
=
ColoInitContext
(
device
=
'cpu'
,
default_dist_spec
=
default_dist_spec
,
default_pg
=
default_pg
)
with
ctx
:
model
=
PaLM
(
num_tokens
=
256
,
dim
=
512
,
depth
=
8
)
model
=
AutoregressiveWrapper
(
model
,
max_seq_len
=
SEQ_LEN
)
with
gzip
.
open
(
"./data/enwik8.gz"
)
as
file
:
with
gzip
.
open
(
"./data/enwik8.gz"
)
as
file
:
X
=
np
.
fromstring
(
file
.
read
(
int
(
95e6
)),
dtype
=
np
.
uint8
)
X
=
np
.
fromstring
(
file
.
read
(
int
(
95e6
)),
dtype
=
np
.
uint8
)
...
@@ -114,23 +135,42 @@ class TextSamplerDataset(Dataset):
...
@@ -114,23 +135,42 @@ class TextSamplerDataset(Dataset):
train_dataset
=
TextSamplerDataset
(
data_train
,
SEQ_LEN
)
train_dataset
=
TextSamplerDataset
(
data_train
,
SEQ_LEN
)
val_dataset
=
TextSamplerDataset
(
data_val
,
SEQ_LEN
)
val_dataset
=
TextSamplerDataset
(
data_val
,
SEQ_LEN
)
train_loader
=
cycle
(
DataLoader
(
train_dataset
,
batch_size
=
BATCH_SIZE
))
train_loader
=
cycle
(
DataLoader
(
train_dataset
,
batch_size
=
args
.
batch_size
))
val_loader
=
cycle
(
DataLoader
(
val_dataset
,
batch_size
=
BATCH_SIZE
))
val_loader
=
cycle
(
DataLoader
(
val_dataset
,
batch_size
=
args
.
batch_size
))
if
args
.
distplan
==
"colossalai"
:
# instantiate GPT-like decoder model
#tensor_parallelize(model, pg)
default_pg
=
ProcessGroup
(
tp_degree
=
args
.
tp_degree
)
default_dist_spec
=
ShardSpec
([
-
1
],
[
args
.
tp_degree
])
if
args
.
shardinit
else
None
ctx
=
ColoInitContext
(
device
=
'cpu'
,
default_dist_spec
=
default_dist_spec
,
default_pg
=
default_pg
)
pg
=
default_pg
with
ctx
:
model
=
gemini_zero_dpp
(
model
,
pg
,
placement
)
model
=
PaLM
(
num_tokens
=
256
,
dim
=
512
,
depth
=
8
)
model
=
AutoregressiveWrapper
(
model
,
max_seq_len
=
SEQ_LEN
)
pg
=
default_pg
#tensor_parallelize(model, pg)
model
=
gemini_zero_dpp
(
model
,
pg
,
args
.
placement
)
#optimizer
#optimizer = GeminiAdamOptimizer(model, lr=1e-7, initial_scale=2**5)
optimizer
=
GeminiAdamOptimizer
(
model
,
lr
=
LEARNING_RATE
,
initial_scale
=
2
**
5
)
else
:
model
=
PaLM
(
num_tokens
=
256
,
dim
=
512
,
depth
=
8
)
model
=
AutoregressiveWrapper
(
model
,
max_seq_len
=
2048
)
model
.
cuda
()
optim
=
torch
.
optim
.
Adam
(
model
.
parameters
(),
lr
=
LEARNING_RATE
)
#optimizer
optimizer
=
GeminiAdamOptimizer
(
model
,
lr
=
1e-7
,
initial_scale
=
2
**
5
)
# training
# training
model
.
train
()
model
.
train
()
for
i
in
tqdm
.
tqdm
(
range
(
NUM_BATCHES
),
mininterval
=
10.0
,
desc
=
"training"
):
for
i
in
tqdm
.
tqdm
(
range
(
NUM_BATCHES
),
mininterval
=
10.0
,
desc
=
"training"
):
if
args
.
distplan
==
"colossalai"
:
optimizer
.
zero_grad
()
optimizer
.
zero_grad
()
loss
=
model
(
next
(
train_loader
))
loss
=
model
(
next
(
train_loader
))
...
@@ -142,6 +182,15 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
...
@@ -142,6 +182,15 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
# optim.step()
# optim.step()
# optim.zero_grad()
# optim.zero_grad()
optimizer
.
step
()
optimizer
.
step
()
else
:
for
__
in
range
(
GRADIENT_ACCUMULATE_EVERY
):
loss
=
model
(
next
(
train_loader
))
loss
.
backward
()
print
(
f
"training loss:
{
loss
.
item
()
}
"
)
torch
.
nn
.
utils
.
clip_grad_norm_
(
model
.
parameters
(),
0.5
)
optim
.
step
()
optim
.
zero_grad
()
# TODO
# TODO
# if i % VALIDATE_EVERY == 0:
# if i % VALIDATE_EVERY == 0:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment