Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
7c317062
Unverified
Commit
7c317062
authored
Jan 16, 2023
by
Jiarui Fang
Committed by
GitHub
Jan 16, 2023
Browse files
[CI] add test_ci.sh for palm, opt and gpt (#2475)
parent
e4c38ba3
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
107 additions
and
38 deletions
+107
-38
examples/language/gpt/gemini/run_gemini.sh
examples/language/gpt/gemini/run_gemini.sh
+2
-1
examples/language/gpt/gemini/test_ci.sh
examples/language/gpt/gemini/test_ci.sh
+35
-0
examples/language/gpt/gemini/train_gpt_demo.py
examples/language/gpt/gemini/train_gpt_demo.py
+9
-2
examples/language/gpt/test_ci.sh
examples/language/gpt/test_ci.sh
+2
-15
examples/language/opt/test_ci.sh
examples/language/opt/test_ci.sh
+4
-0
examples/language/palm/run.sh
examples/language/palm/run.sh
+1
-1
examples/language/palm/test_ci.sh
examples/language/palm/test_ci.sh
+9
-0
examples/language/palm/train.py
examples/language/palm/train.py
+45
-19
No files found.
examples/language/gpt/gemini/run_gemini.sh
View file @
7c317062
...
...
@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
export
USE_SHARD_INIT
=
${
USE_SHARD_INIT
:-
False
}
export
BATCH_SIZE
=
${
BATCH_SIZE
:-
16
}
export
MODEL_TYPE
=
${
MODEL_TYPE
:-
"gpt2_medium"
}
export
TRAIN_STEP
=
${
TRAIN_STEP
:-
10
}
# export PYTHONPATH=$PWD:$PYTHONPATH
mkdir
-p
gemini_logs
...
...
@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
--placement
=
${
PLACEMENT
}
\
--shardinit
=
${
USE_SHARD_INIT
}
\
--distplan
=
${
DISTPLAN
}
\
--train_step
=
${
TRAIN_STEP
}
\
2>&1 |
tee
./gemini_logs/
${
MODEL_TYPE
}
_
${
DISTPLAN
}
_gpu_
${
GPUNUM
}
_bs_
${
BATCH_SIZE
}
_tp_
${
TPDEGREE
}
_
${
PLACEMENT
}
.log
examples/language/gpt/gemini/test_ci.sh
0 → 100644
View file @
7c317062
# CI smoke test for the GPT Gemini example: sweeps a small grid of
# distributed plans / GPU counts / TP degrees / placements through
# run_gemini.sh with a short training run.
set -x

# Run from the directory that contains this script so ./run_gemini.sh resolves.
# (A bare `$(cd ...; pwd)` would try to execute the printed path as a command
# and would not change the directory of this shell.)
cd "$(dirname "$0")" || exit 1

# Keep every run short; run_gemini.sh reads TRAIN_STEP from the environment.
export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
    # Gemini ("colossalai") plan: vary tensor-parallel degree and placement.
    for DISTPLAN in "colossalai"; do
        for BATCH_SIZE in 2; do
            for GPUNUM in 1 4; do
                for TPDEGREE in 1 2; do
                    # The TP degree cannot exceed the number of GPUs.
                    if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
                        continue
                    fi
                    for PLACEMENT in "cpu" "auto"; do
                        MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
                            bash ./run_gemini.sh || exit 1
                    done
                done
            done
        done
    done

    # ZeRO stage 1 / 2 plans: no tensor parallelism, placement left to
    # run_gemini.sh's default.
    for DISTPLAN in "zero1" "zero2"; do
        for BATCH_SIZE in 2; do
            for GPUNUM in 1 4; do
                for TPDEGREE in 1; do
                    if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
                        continue
                    fi
                    MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
                        bash ./run_gemini.sh || exit 1
                done
            done
        done
    done
done
examples/language/gpt/gemini/train_gpt_demo.py
View file @
7c317062
...
...
@@ -65,7 +65,13 @@ def parse_args():
default
=
"gpt2_medium"
,
help
=
"model model scale"
,
)
parser
.
add_argument
(
"--steps"
,
type
=
int
,
default
=
10
,
help
=
"num of training steps"
)
parser
.
add_argument
(
"--train_step"
,
type
=
int
,
default
=
10
,
help
=
"training iterations for test"
,
)
args
=
parser
.
parse_args
()
return
args
...
...
@@ -237,7 +243,8 @@ def main():
SEQ_LEN
=
1024
VOCAB_SIZE
=
50257
NUM_STEPS
=
args
.
steps
NUM_STEPS
=
args
.
train_step
WARMUP_STEPS
=
1
assert
WARMUP_STEPS
<
NUM_STEPS
,
"warmup steps should smaller than the total steps"
assert
(
NUM_STEPS
-
WARMUP_STEPS
)
%
2
==
1
,
"the number of valid steps should be odd to take the median "
...
...
examples/language/gpt/test_ci.sh
View file @
7c317062
pip
install
-r
requirements.txt
# test colossalai
for
TP
in
1 2
;
do
for
PLACEMENT
in
"cpu"
"cuda"
"auto"
"const"
;
do
for
SHARD
in
"True"
"False"
;
do
colossalai run
--nproc_per_node
=
4 ./gemini/train_gpt_demo.py
--steps
4
--distplan
colossalai
--tp_degree
$TP
--placement
$PLACEMENT
--shardinit
$SHARD
||
exit
1
done
done
done
# test zero1&2
for
DIST
in
"zero1"
"zero2"
;
do
colossalai run
--nproc_per_node
=
4 ./gemini/train_gpt_demo.py
--steps
4
--distplan
$DIST
||
exit
1
done
set
-x
cd
gemini
&&
bash test_ci.sh
examples/language/opt/test_ci.sh
0 → 100644
View file @
7c317062
# CI smoke test for the OPT example: run the Gemini launcher with the
# smallest model on 2 GPUs and then on 1 GPU.
for GPUNUM in 2 1
do
    # Stop at the first failing configuration; otherwise the script's exit
    # status would only reflect the last iteration.
    env BS=2 MODEL="125m" GPUNUM="$GPUNUM" bash ./run_gemini.sh || exit 1
done
examples/language/palm/run.sh
View file @
7c317062
...
...
@@ -8,4 +8,4 @@ export PLACEMENT='cpu'
export
USE_SHARD_INIT
=
False
export
BATCH_SIZE
=
4
env
OMP_NUM_THREADS
=
12 torchrun
--standalone
--nproc_per_node
=
${
GPUNUM
}
--master_port
29501 train_new.py
--tp_degree
=
${
TPDEGREE
}
--batch_size
=
${
BATCH_SIZE
}
--placement
${
PLACEMENT
}
--shardinit
${
USE_SHARD_INIT
}
--distplan
${
DISTPAN
}
2>&1 |
tee
run.log
\ No newline at end of file
env
OMP_NUM_THREADS
=
12 torchrun
--standalone
--nproc_per_node
=
${
GPUNUM
}
--master_port
29501 train.py
--tp_degree
=
${
TPDEGREE
}
--batch_size
=
${
BATCH_SIZE
}
--placement
${
PLACEMENT
}
--shardinit
${
USE_SHARD_INIT
}
--distplan
${
DISTPAN
}
2>&1 |
tee
run.log
examples/language/palm/test_ci.sh
0 → 100644
View file @
7c317062
# CI smoke test for the PaLM example: train on dummy data with 1 and 4 GPUs.

# Run from the directory that contains this script so train.py and run.log
# resolve. (A bare `$(cd ...; pwd)` would try to execute the printed path as a
# command and would not change the directory of this shell.)
cd "$(dirname "$0")" || exit 1

# `... | tee run.log` reports tee's exit status, which hides a torchrun
# failure. Enable pipefail when the running shell supports it; probing in a
# subshell keeps shells without the option from aborting here.
if (set -o pipefail) 2>/dev/null; then
    set -o pipefail
fi

for BATCH_SIZE in 2
do
    for GPUNUM in 1 4
    do
        env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node="${GPUNUM}" --master_port 29501 train.py --dummy_data=True --batch_size="${BATCH_SIZE}" 2>&1 | tee run.log || exit 1
    done
done
examples/language/palm/train.py
View file @
7c317062
import
gzip
import
random
from
time
import
time
from
functools
import
partial
from
time
import
time
import
numpy
as
np
import
torch
import
torch.optim
as
optim
import
torch.nn
as
nn
import
torch.optim
as
optim
import
tqdm
from
packaging
import
version
from
palm_pytorch
import
PaLM
...
...
@@ -23,7 +24,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
# constants
NUM_BATCHES
=
int
(
10
0
)
NUM_BATCHES
=
int
(
10
)
WARMUP_BATCHES
=
1
GRADIENT_ACCUMULATE_EVERY
=
1
LEARNING_RATE
=
2e-4
...
...
@@ -66,9 +67,16 @@ def parse_args():
default
=
8
,
help
=
"batch size per DP group of training."
,
)
parser
.
add_argument
(
"--dummy_data"
,
type
=
bool
,
default
=
False
,
help
=
"use dummy dataset."
,
)
args
=
parser
.
parse_args
()
return
args
# helpers
def
cycle
(
loader
):
while
True
:
...
...
@@ -79,12 +87,15 @@ def cycle(loader):
def decode_token(token):
    """Return the single character for ``token``.

    Code points below 32 are clamped to 32, so they come back as a space.
    """
    # chr() already returns a str, so no extra str() conversion is needed.
    return chr(max(32, token))
def get_tflops(model_numel, batch_size, seq_len, step_time):
    """Return ``model_numel * batch_size * seq_len * 8`` per second of
    ``step_time``, scaled down by 1e12.

    A tiny epsilon is added to ``step_time`` so a zero duration does not
    raise ZeroDivisionError.
    """
    scaled_work = model_numel * batch_size * seq_len * 8
    elapsed = step_time + 1e-12
    return scaled_work / 1e12 / elapsed
def decode_tokens(tokens):
    """Decode an iterable of tokens into one string via ``decode_token``."""
    # str.join consumes any iterable, so the intermediate list is unnecessary.
    return "".join(map(decode_token, tokens))
def
get_model_size
(
model
:
nn
.
Module
):
total_numel
=
0
for
module
in
model
.
modules
():
...
...
@@ -92,6 +103,7 @@ def get_model_size(model: nn.Module):
total_numel
+=
p
.
numel
()
return
total_numel
# Gemini + ZeRO DDP
def
gemini_zero_dpp
(
model
:
torch
.
nn
.
Module
,
pg
:
ProcessGroup
,
placememt_policy
:
str
=
"auto"
):
cai_version
=
colossalai
.
__version__
...
...
@@ -115,6 +127,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
raise
NotImplemented
(
f
"CAI version
{
cai_version
}
is not supported"
)
return
model
## Parameter Sharding Strategies for Tensor Parallelism
def
split_param_single_dim_tp1d
(
dim
:
int
,
param
:
ColoParameter
,
pg
:
ProcessGroup
):
spec
=
(
ShardSpec
([
dim
],
[
pg
.
tp_world_size
()]),
ComputeSpec
(
ComputePattern
.
TP1D
))
...
...
@@ -128,6 +141,7 @@ def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
    """Shard ``param`` along its last dimension (dim ``-1``) for 1D tensor
    parallelism by delegating to ``split_param_single_dim_tp1d``."""
    split_param_single_dim_tp1d(dim=-1, param=param, pg=pg)
# Tensor Parallel
def
tensor_parallelize
(
model
:
torch
.
nn
.
Module
,
pg
:
ProcessGroup
):
"""tensor_parallelize
...
...
@@ -159,15 +173,28 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
args
=
parse_args
()
if
args
.
distplan
not
in
[
"colossalai"
,
"pytorch"
]:
raise
TypeError
(
f
"
{
args
.
distplan
}
is error"
)
raise
TypeError
(
f
"
{
args
.
distplan
}
is error"
)
disable_existing_loggers
()
colossalai
.
launch_from_torch
(
config
=
{})
logger
=
get_dist_logger
()
with
gzip
.
open
(
"./data/enwik8.gz"
)
as
file
:
X
=
np
.
fromstring
(
file
.
read
(
int
(
95e6
)),
dtype
=
np
.
uint8
)
trX
,
vaX
=
np
.
split
(
X
,
[
int
(
90e6
)])
data_train
,
data_val
=
torch
.
from_numpy
(
trX
),
torch
.
from_numpy
(
vaX
)
def generate_dataset(dummy_data: bool = False):
    """Build the training and validation token tensors.

    Args:
        dummy_data: when true, skip reading ``./data/enwik8.gz`` and return
            random integer tensors instead.

    Returns:
        A ``(data_train, data_val)`` tuple of 1-D tensors. With real data the
        first 95e6 bytes of the archive are read as ``uint8`` and split at
        byte 90e6; with dummy data the tensors hold random integers in
        ``[0, 100)`` with 90,000,000 and 5,000,000 elements respectively.
    """
    if dummy_data:
        return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))

    with gzip.open("./data/enwik8.gz") as file:
        # np.fromstring is deprecated for binary input; np.frombuffer is the
        # replacement. It returns a read-only view of the bytes object, so
        # copy to get a writable array that torch.from_numpy accepts cleanly.
        X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
    trX, vaX = np.split(X, [int(90e6)])
    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
    return data_train, data_val
data_train
,
data_val
=
generate_dataset
(
args
.
dummy_data
)
print
(
"generate dataset ready!"
)
class
TextSamplerDataset
(
Dataset
):
...
...
@@ -216,7 +243,7 @@ else:
model
.
cuda
()
optim
=
torch
.
optim
.
Adam
(
model
.
parameters
(),
lr
=
LEARNING_RATE
)
# model is shared after TP
# model is shared after TP
numel
=
get_model_size
(
model
)
get_tflops_func
=
partial
(
get_tflops
,
numel
,
args
.
batch_size
,
SEQ_LEN
)
...
...
@@ -251,7 +278,7 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
)
if
i
>=
WARMUP_BATCHES
:
tflops_list
.
append
(
step_tflops
)
else
:
for
__
in
range
(
GRADIENT_ACCUMULATE_EVERY
):
loss
=
model
(
next
(
train_loader
))
...
...
@@ -261,18 +288,17 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
torch
.
nn
.
utils
.
clip_grad_norm_
(
model
.
parameters
(),
0.5
)
optim
.
step
()
optim
.
zero_grad
()
# Report the median throughput of the measured steps.
tflops_list.sort()
# tflops_list only receives samples for steps with i >= WARMUP_BATCHES, so the
# median position must be derived from the list's own length; adding
# WARMUP_BATCHES to the index would skew the pick and can run past the end.
median_index = len(tflops_list) >> 1
if tflops_list:
    logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
else:
    # NOTE(review): the branch of the training loop that does not append to
    # tflops_list appears to leave it empty - confirm against the full file.
    logger.info("No TFLOPS samples were recorded; skipping the median report")
# TODO
# if i % VALIDATE_EVERY == 0:
# model.eval()
# with torch.no_grad():
# loss = model(next(val_loader))
# print(f"validation loss: {loss.item()}")
# TODO
# if i % VALIDATE_EVERY == 0:
# model.eval()
# with torch.no_grad():
# loss = model(next(val_loader))
# print(f"validation loss: {loss.item()}")
# if i % GENERATE_EVERY == 0:
# model.eval()
...
...
@@ -282,4 +308,4 @@ logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
# sample = model.generate(inp[None, ...], GENERATE_LENGTH)
# output_str = decode_tokens(sample[0])
# print(output_str)
\ No newline at end of file
# print(output_str)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment