OpenDAS / ColossalAI / Commits

Commit 236b4195, authored Jan 16, 2023 by jiaruifang

Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into dev0116

Parents: e64a05b3, 7c317062
8 changed files with 86 additions and 24 deletions:

- examples/language/gpt/gemini/run_gemini.sh (+2, -1)
- examples/language/gpt/gemini/test_ci.sh (+35, -0)
- examples/language/gpt/gemini/train_gpt_demo.py (+9, -2)
- examples/language/gpt/test_ci.sh (+2, -15)
- examples/language/opt/test_ci.sh (+4, -0)
- examples/language/palm/run.sh (+1, -1)
- examples/language/palm/test_ci.sh (+9, -0)
- examples/language/palm/train.py (+24, -5)
examples/language/gpt/gemini/run_gemini.sh

```diff
@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
 export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
+export TRAIN_STEP=${TRAIN_STEP:-10}

 # export PYTHONPATH=$PWD:$PYTHONPATH
 mkdir -p gemini_logs
@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
 --distplan=${DISTPLAN} \
+--train_step=${TRAIN_STEP} \
 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
```
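Every knob in this script falls back to a `${VAR:-default}` default, so a run can be configured entirely from the environment without editing the file. A minimal sketch (variable names come from the script above; the specific values are illustrative, not from the commit):

```bash
# Override a few knobs for a short smoke-test run; any variable left
# unset keeps the default shown in the diff (e.g. BATCH_SIZE=16,
# TRAIN_STEP=10).
GPUNUM=2 TPDEGREE=2 PLACEMENT="auto" TRAIN_STEP=4 bash ./run_gemini.sh
```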
examples/language/gpt/gemini/test_ci.sh (new file, mode 100644)

```bash
set -x
$(cd `dirname $0`; pwd)
export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
  for DISTPLAN in "colossalai"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1 2; do
          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
            continue
          fi
          for PLACEMENT in "cpu" "auto"; do
            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
            bash ./run_gemini.sh
          done
        done
      done
    done
  done

  for DISTPLAN in "zero1" "zero2"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1; do
          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
            continue
          fi
          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
          bash ./run_gemini.sh
        done
      done
    done
  done
done
```
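Note the `VAR=value ... command` prefix form on the `bash ./run_gemini.sh` lines: each assignment applies only to that single child invocation, so no iteration of the matrix can leak settings into the next. One innermost iteration of the first loop nest is equivalent to:

```bash
# One cell of the test matrix above, with example values drawn from the
# loop ranges (GPUNUM=4, TPDEGREE=2, PLACEMENT="auto").
env MODEL_TYPE="gpt2_medium" DISTPLAN="colossalai" BATCH_SIZE=2 \
    GPUNUM=4 TPDEGREE=2 PLACEMENT="auto" bash ./run_gemini.sh
```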
examples/language/gpt/gemini/train_gpt_demo.py

```diff
@@ -65,7 +65,13 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
     parser.add_argument("--steps", type=int, default=10, help="num of training steps")
+    parser.add_argument(
+        "--train_step",
+        type=int,
+        default=10,
+        help="training iterations for test",
+    )
     args = parser.parse_args()
     return args
@@ -237,7 +243,8 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
-    NUM_STEPS = args.steps
+    NUM_STEPS = args.train_step
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
```
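With this change, `NUM_STEPS` is driven by the new `--train_step` flag that `run_gemini.sh` now forwards. A direct launch sketch (flag spellings taken from the diffs above; the remaining flags of the full command are collapsed in this view):

```bash
# WARMUP_STEPS is 1 and the asserts require NUM_STEPS - WARMUP_STEPS to
# be odd, so an even --train_step such as 4 passes both checks.
torchrun --standalone --nproc_per_node=1 ./train_gpt_demo.py \
    --distplan="colossalai" --shardinit=False --placement="cpu" --train_step=4
```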
examples/language/gpt/test_ci.sh

```diff
-pip install -r requirements.txt
-
-# test colossalai
-for TP in 1 2; do
-for PLACEMENT in "cpu" "cuda" "auto" "const"; do
-for SHARD in "True" "False"; do
-colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
-done
-done
-done
-
-# test zero1&2
-for DIST in "zero1" "zero2"; do
-colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
-done
+set -x
+cd gemini && bash test_ci.sh
```
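The DISTPLAN/GPU/TP/placement matrix that used to live here has moved into `gemini/test_ci.sh`, leaving this script as a thin, traced wrapper. Running it from a checkout is simply (a sketch; assumes the usual repository layout):

```bash
# Delegates to examples/language/gpt/gemini/test_ci.sh via the wrapper above.
cd examples/language/gpt && bash test_ci.sh
```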
examples/language/opt/test_ci.sh (new file, mode 100644)

```bash
for GPUNUM in 2 1
do
env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
done
```
examples/language/palm/run.sh

```diff
@@ -8,4 +8,4 @@ export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
 export BATCH_SIZE=4

-env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
\ No newline at end of file
+env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
```
examples/language/palm/test_ci.sh (new file, mode 100644)

```bash
$(cd `dirname $0`; pwd)

for BATCH_SIZE in 2
do
for GPUNUM in 1 4
do
env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log
done
done
```
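Because this CI script passes `--dummy_data=True`, it exercises `train.py` without downloading enwik8; the real-data path still expects `./data/enwik8.gz`, as the `train.py` diff below shows. The two modes side by side (a sketch; the single-GPU values are illustrative):

```bash
# CI mode: random token tensors generated in-process, no dataset on disk.
torchrun --standalone --nproc_per_node=1 --master_port 29501 train.py \
    --dummy_data=True --batch_size=2

# Real-data mode: reads ./data/enwik8.gz (must already be present).
torchrun --standalone --nproc_per_node=1 --master_port 29501 train.py --batch_size=2
```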
examples/language/palm/train.py

```diff
@@ -23,7 +23,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
 # constants

-NUM_BATCHES = int(100)
+NUM_BATCHES = int(10)
 WARMUP_BATCHES = 1
 GRADIENT_ACCUMULATE_EVERY = 1
 LEARNING_RATE = 2e-4
@@ -66,6 +66,12 @@ def parse_args():
         default=8,
         help="batch size per DP group of training.",
     )
+    parser.add_argument(
+        "--dummy_data",
+        type=bool,
+        default=False,
+        help="use dummy dataset.",
+    )
     args = parser.parse_args()
     return args
@@ -171,10 +177,23 @@ disable_existing_loggers()
 colossalai.launch_from_torch(config={})
 logger = get_dist_logger()

-with gzip.open("./data/enwik8.gz") as file:
-    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
-    trX, vaX = np.split(X, [int(90e6)])
-    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+
+def generate_dataset(dummy_data: bool = False):
+    if not dummy_data:
+        with gzip.open("./data/enwik8.gz") as file:
+            X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
+            trX, vaX = np.split(X, [int(90e6)])
+            data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+            # print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}")
+            # print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}")
+            return data_train, data_val
+    else:
+        return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))
+
+
+data_train, data_val = generate_dataset(args.dummy_data)
+print("generate dataset ready!")
+

 class TextSamplerDataset(Dataset):
```
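One caveat with `type=bool` in argparse: the `type` callable is applied to the raw command-line string, and `bool()` of any non-empty string is `True`, so the flag can only be disabled by omitting it. A sketch of the resulting behaviour:

```bash
# argparse applies bool() to the string after "=", and any non-empty
# string is truthy in Python:
torchrun --standalone --nproc_per_node=1 train.py --dummy_data=True    # dummy data ON
torchrun --standalone --nproc_per_node=1 train.py --dummy_data=False   # also ON: bool("False") is True
torchrun --standalone --nproc_per_node=1 train.py                      # OFF: default=False applies
```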