OpenDAS / Megatron-LM

Commit 4b255099, authored Nov 08, 2024 by liangjing
    add qwen
Parent: f2464dc2
Pipeline #1871 passed.

Showing 5 changed files, with 151588 additions and 1 deletion:
    megatron/training/arguments.py               +1       -0
    megatron/training/tokenizer/tokenizer.py     +40      -1
    scripts/qwen2_7b.sh                          +159     -0
    scripts/qwen_token/merges.txt                +151387  -0
    scripts/qwen_token/vocab.json                +1       -0
megatron/training/arguments.py

@@ -1703,6 +1703,7 @@ def _add_data_args(parser):
                                 'GPTSentencePieceTokenizer',
                                 'HuggingFaceTokenizer',
                                 'Llama2Tokenizer',
+                                'QwenTokenizer',
                                 'TikTokenizer',
                                 'NullTokenizer'],
                        help='What type of tokenizer to use.')
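For reference, a minimal standalone sketch of how the extended choice list is consumed. Only the entries visible in the hunk are listed; the surrounding argparse code of _add_data_args and any choices above the hunk are assumptions, not part of the diff.

# Hypothetical, trimmed-down sketch of the --tokenizer-type option after this commit.
# The real option in megatron/training/arguments.py carries the full choice list.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--tokenizer-type', type=str, default=None,
                    choices=['GPTSentencePieceTokenizer',
                             'HuggingFaceTokenizer',
                             'Llama2Tokenizer',
                             'QwenTokenizer',   # new in this commit
                             'TikTokenizer',
                             'NullTokenizer'],
                    help='What type of tokenizer to use.')

# Selecting the new tokenizer, matching the flag used in scripts/qwen2_7b.sh below:
args = parser.parse_args(['--tokenizer-type', 'QwenTokenizer'])
print(args.tokenizer_type)  # QwenTokenizer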
megatron/training/tokenizer/tokenizer.py

@@ -15,7 +15,7 @@ from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
+from transformers import Qwen2Tokenizer

 def build_tokenizer(args, **kwargs):
     """Initialize tokenizer."""

@@ -49,6 +49,8 @@ def build_tokenizer(args, **kwargs):
     elif args.tokenizer_type == 'Llama2Tokenizer':
         assert args.tokenizer_model is not None
         tokenizer = _Llama2Tokenizer(args.tokenizer_model)
+    elif args.tokenizer_type == 'QwenTokenizer':
+        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == 'TikTokenizer':
         assert args.tokenizer_model is not None
         assert args.tiktoken_pattern is not None

@@ -132,6 +134,43 @@ class _HuggingFaceTokenizer(MegatronTokenizer):
     def eod(self):
         return self._tokenizer.eos_token_id

+
+class _Qwen2Tokenizer(MegatronTokenizer):
+    def __init__(self, vocab_file, merge_file, extra_vocab_size=0):
+        super().__init__(vocab_file, merge_file)
+        self.tokenizer = Qwen2Tokenizer(vocab_file, merge_file)
+        self.extra_vocab_size = extra_vocab_size
+        self.tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>"))
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer.encoder) + self.extra_vocab_size
+
+    @property
+    def vocab(self):
+        return self.tokenizer.encoder
+
+    @property
+    def inv_vocab(self):
+        return self.tokenizer.decoder
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def eos_token(self):
+        return self.tokenizer.eos_token
+
+    @property
+    def pad_token_id(self):
+        return self.tokenizer.pad_token_id
+
+
 class _BertWordPieceTokenizer(MegatronTokenizer):
     """Original BERT wordpiece tokenizer."""
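Taken on its own, the new wrapper can be exercised outside of build_tokenizer. A minimal usage sketch, assuming the transformers package is installed and pointing at the vocab/merges assets added under scripts/qwen_token/ in this commit; the input string is purely illustrative.

# Minimal sketch: exercising the new _Qwen2Tokenizer wrapper directly.
# Adjust the asset paths to your checkout.
from megatron.training.tokenizer.tokenizer import _Qwen2Tokenizer

tok = _Qwen2Tokenizer(vocab_file="scripts/qwen_token/vocab.json",
                      merge_file="scripts/qwen_token/merges.txt")

ids = tok.tokenize("Megatron-LM with a Qwen2 tokenizer")  # Qwen2Tokenizer.encode under the hood
text = tok.detokenize(ids)                                # Qwen2Tokenizer.decode

print(len(ids), text)
print(tok.vocab_size)    # len(encoder) + extra_vocab_size (0 by default)
print(tok.eod)           # eos_token_id, used by Megatron as the end-of-document id
print(tok.pad_token_id)  # id of "<|extra_0|>", registered as pad token in __init__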
scripts/qwen2_7b.sh (new file, mode 100755)

#!/bin/bash
# Runs the "7B" parameter model

export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=xx # based on your environment
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0

source /opt/dtk/env.sh

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/path_to_my-qwen_text_document" #<Specify path and file prefix>_text_document

GPT_MODEL_ARGS=(
    --num-layers 28
    --hidden-size 3584
    --ffn-hidden-size 18944
    --num-attention-heads 28
    --seq-length 4096
    --max-position-embeddings 32768
    --num-query-groups 4
    --group-query-attention
)

TRAINING_ARGS=(
    --log-throughput
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 12 #512
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --use-distributed-optimizer
    --use-flash-attn-triton
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --no-gradient-accumulation-fusion
    --add-qkv-bias
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --use-fast-cross-entropy-loss
)

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 4
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --split 949,50,1
    --untie-embeddings-and-output-weights
    --use-rotary-position-embeddings
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-type QwenTokenizer
    --merge-file /path_to_qwen_token/merges.txt
    --vocab-file /path_to_qwen_token/vocab.json
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

APP="python3 -u pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url tcp://${1}:34566 \
    "

# for hygon
case ${lrank} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
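The parallel layout implied by the script can be sanity-checked with a little arithmetic: tensor-parallel 2 x pipeline-parallel 4 means the world size must be a multiple of 8, and the data-parallel size and gradient-accumulation steps follow from the global/micro batch sizes. A small sketch of that check; the world size of 8 (a single 8-rank node, matching lrank 0..7 in the case block) is an assumption, since the script takes it from mpirun.

# Sanity check of the parallelism/batch configuration in scripts/qwen2_7b.sh.
# world_size = 8 is an assumed single-node launch; the other values come from the script.
tensor_parallel   = 2
pipeline_parallel = 4
micro_batch       = 1
global_batch      = 12
world_size        = 8   # assumption: one node, 8 ranks

model_parallel = tensor_parallel * pipeline_parallel
assert world_size % model_parallel == 0, "world size must be a multiple of TP * PP"

data_parallel = world_size // model_parallel                      # -> 1 under this assumption
assert global_batch % (micro_batch * data_parallel) == 0
grad_accum_steps = global_batch // (micro_batch * data_parallel)  # -> 12

hidden_size, num_heads, num_query_groups = 3584, 28, 4
head_dim = hidden_size // num_heads                               # -> 128
assert num_heads % num_query_groups == 0                          # 7 query heads per KV group

print(f"DP={data_parallel}, grad accumulation steps={grad_accum_steps}, head dim={head_dim}")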
scripts/qwen_token/merges.txt (new file, mode 100644)
Diff collapsed (151387 lines added).

scripts/qwen_token/vocab.json (new file, mode 100644)
Diff collapsed (1 line added).