ColossalAI · Commit aaeb520c (unverified)

Authored Sep 04, 2023 by yingliu-hpc; committed by GitHub on Sep 04, 2023

Merge pull request #4542 from hpcaitech/chatglm

[coati] Add chatglm in coati

Parents: 8d7b0229, 9f852f24

Showing 15 changed files with 2165 additions and 42 deletions (+2165 -42)
.github/workflows/run_chatgpt_examples.yml (+1, -2)
.github/workflows/run_chatgpt_unit_tests.yml (+1, -2)
applications/Chat/coati/dataset/sft_dataset.py (+63, -12)
applications/Chat/coati/models/chatglm/__init__.py (+3, -0)
applications/Chat/coati/models/chatglm/chatglm_actor.py (+34, -0)
applications/Chat/coati/models/chatglm/chatglm_tokenizer.py (+446, -0)
applications/Chat/coati/models/chatglm/configuration_chatglm.py (+107, -0)
applications/Chat/coati/models/chatglm/modeling_chatglm.py (+1439, -0)
applications/Chat/coati/trainer/sft.py (+7, -3)
applications/Chat/examples/requirements.txt (+1, -0)
applications/Chat/examples/train_sft.py (+9, -3)
applications/Chat/requirements-test.txt (+1, -0)
applications/Chat/requirements.txt (+1, -1)
applications/Chat/tests/test_dataset.py (+26, -5)
applications/Chat/tests/test_models.py (+26, -14)
.github/workflows/run_chatgpt_examples.yml

@@ -28,9 +28,8 @@ jobs:
       - name: Checkout ColossalAI
         uses: actions/checkout@v2
-      - name: Install ColossalAI and ChatGPT
+      - name: Install ChatGPT
         run: |
-          pip install -e .
           cd applications/Chat
           pip install -v .
           pip install -r examples/requirements.txt
.github/workflows/run_chatgpt_unit_tests.yml

@@ -30,9 +30,8 @@ jobs:
       - name: Checkout ColossalAI
         uses: actions/checkout@v2
-      - name: Install ColossalAI and ChatGPT
+      - name: Install ChatGPT
         run: |
-          pip install -e .
           cd applications/Chat
           pip install -v .
           pip install -r requirements-test.txt
applications/Chat/coati/dataset/sft_dataset.py

@@ -19,7 +19,7 @@ import torch
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer

+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from colossalai.logging import get_dist_logger

 from .utils import is_rank_0, jload
@@ -71,6 +71,42 @@ def _preprocess(sources: Sequence[str],
     return sequences_token["input_ids"], labels, sequences_token["attention_mask"]


+def _preprocess_chatglm(sources: Sequence[str],
+                        targets: Sequence[str],
+                        tokenizer: PreTrainedTokenizer,
+                        max_length: int,
+                        ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Preprocess the data by tokenizing.
+    None for attention mask, ChatGLM will calculate attention mask according to input ids
+    """
+    labels = []
+    input_ids = []
+    for source, target in zip(sources, targets):
+        source_id = tokenizer.encode(text=source, add_special_tokens=False)
+        target_id = tokenizer.encode(text=target, add_special_tokens=False)
+        input_id = tokenizer.build_inputs_with_special_tokens(source_id, target_id)
+        # truncate
+        sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
+        truncate_length = max(0, len(input_id) - max_length)
+        input_id = input_id[truncate_length:]
+        if truncate_length == len(source_id) + 1:
+            input_id = sp_token_list + input_id[1:]
+        elif truncate_length > len(source_id) + 1:
+            input_id = sp_token_list + input_id[2:]
+
+        context_length = input_id.index(tokenizer.bos_token_id)
+        mask_position = context_length - 1
+        label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1:]
+
+        pad_len = max_length - len(input_id)
+        input_id = input_id + [tokenizer.pad_token_id] * pad_len
+        input_ids.append(input_id)
+        labels.append(label + [IGNORE_INDEX] * pad_len)
+    return torch.tensor(input_ids), torch.tensor(labels), None
+
+
 class SFTDataset(Dataset):
     """
     Dataset for sft model
@@ -94,7 +130,10 @@ class SFTDataset(Dataset):
             data["completion"] + tokenizer.eos_token
             for data in tqdm(dataset, disable=not is_rank_0())
         ]
-        self.input_ids, self.labels, self.attention_mask = \
-            _preprocess(sources, targets, tokenizer, max_length)
+        if isinstance(tokenizer, ChatGLMTokenizer):
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+        else:
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess(sources, targets, tokenizer, max_length)
@@ -103,9 +142,13 @@ class SFTDataset(Dataset):
         return length

     def __getitem__(self, idx):
-        return dict(input_ids=self.input_ids[idx],
-                    labels=self.labels[idx],
-                    attention_mask=self.attention_mask[idx])
+        if self.attention_mask is not None:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx],
+                        attention_mask=self.attention_mask[idx])
+        else:
+            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])


 class SupervisedDataset(Dataset):
@@ -137,6 +180,10 @@ class SupervisedDataset(Dataset):
         ]

         logger.info("Tokenizing inputs... This may take some time...")
+        if isinstance(tokenizer, ChatGLMTokenizer):
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+        else:
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess(sources, targets, tokenizer, max_length)
@@ -145,6 +192,10 @@ class SupervisedDataset(Dataset):
         return length

     def __getitem__(self, idx):
-        return dict(input_ids=self.input_ids[idx],
-                    labels=self.labels[idx],
-                    attention_mask=self.attention_mask[idx])
+        if self.attention_mask is not None:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx],
+                        attention_mask=self.attention_mask[idx])
+        else:
+            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
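
The heart of this file's change is _preprocess_chatglm: build_inputs_with_special_tokens appends [gMASK] and <sop> after the prompt, the prompt and the [gMASK] slot are masked out of the labels with IGNORE_INDEX, <sop> plus the completion remain as supervision targets, and the attention mask is returned as None for the model to rebuild from the input ids. Below is a minimal sketch of the resulting tensors; ToyTokenizer is a hypothetical stand-in (not part of the commit) so the snippet runs without the real SentencePiece vocabulary:

import torch

IGNORE_INDEX = -100


class ToyTokenizer:
    # Hypothetical stand-in for ChatGLMTokenizer; ids mimic the real specials.
    gmask_token_id = 130001
    bos_token_id = 130004
    pad_token_id = 3

    def encode(self, text, add_special_tokens=False):
        return [ord(c) for c in text]    # one id per character

    def build_inputs_with_special_tokens(self, source_id, target_id):
        # prompt ids + [gMASK] + <sop> + answer ids, as the real tokenizer does
        return source_id + [self.gmask_token_id, self.bos_token_id] + target_id


tok = ToyTokenizer()
source_id = tok.encode("hi")
target_id = tok.encode("ok")
input_id = tok.build_inputs_with_special_tokens(source_id, target_id)

max_length = 8
context_length = input_id.index(tok.bos_token_id)            # prompt + [gMASK]
mask_position = context_length - 1
label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1:]
pad_len = max_length - len(input_id)
input_id = input_id + [tok.pad_token_id] * pad_len
label = label + [IGNORE_INDEX] * pad_len

print(torch.tensor(input_id))    # tensor([104, 105, 130001, 130004, 111, 107, 3, 3])
print(torch.tensor(label))       # tensor([-100, -100, -100, 130004, 111, 107, -100, -100])

Only the <sop> position and the answer ids contribute to the loss; everything before the mask and all right-hand padding are ignored, which is exactly what the chatglm-specific branch of test_sft_dataset checks further below.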
applications/Chat/coati/models/chatglm/__init__.py (new file, mode 100644)

from .chatglm_actor import ChatGLMActor

__all__ = ['ChatGLMActor']
\ No newline at end of file
applications/Chat/coati/models/chatglm/chatglm_actor.py (new file, mode 100644)

from typing import Optional

import torch

from ..base import Actor
from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMForConditionalGeneration


class ChatGLMActor(Actor):
    """
    ChatGLM Actor model.

    Args:
        pretrained (str): Pretrained model name or path.
        config (ChatGLMConfig): Model config.
        checkpoint (bool): Enable gradient checkpointing.

    do not support lora for now.
    """

    def __init__(self,
                 pretrained: str = None,
                 config: Optional[ChatGLMConfig] = None,
                 checkpoint: bool = False) -> None:
        if pretrained is not None:
            model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
        elif config is not None:
            model = ChatGLMForConditionalGeneration(config)
        else:
            model = ChatGLMForConditionalGeneration(ChatGLMConfig())
        if checkpoint:
            model.gradient_checkpointing_enable()
        super().__init__(model, lora_rank=0, lora_train_bias='none')
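
As a usage sketch (not from the commit), the actor can be built either from a pretrained checkpoint or from a config; the downsized hyperparameters below are illustrative smoke-test values, not anything the commit prescribes:

from coati.models.chatglm import ChatGLMActor
from coati.models.chatglm.configuration_chatglm import ChatGLMConfig

# Hypothetical tiny sizes for a quick CPU smoke test; ChatGLM-6B uses the
# ChatGLMConfig defaults (28 layers, hidden_size 4096, 32 heads).
tiny_config = ChatGLMConfig(num_layers=2,
                            hidden_size=128,
                            inner_hidden_size=512,
                            num_attention_heads=4)
actor = ChatGLMActor(config=tiny_config, checkpoint=True)    # gradient checkpointing on
# or: actor = ChatGLMActor(pretrained="THUDM/chatglm-6b")
# Note: LoRA is not supported for this actor; lora_rank is fixed to 0.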
applications/Chat/coati/models/chatglm/chatglm_tokenizer.py (new file, mode 100644)

"""
This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py
"""
"""Tokenization classes for ChatGLM."""
from typing import List, Optional, Union
import os

from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
from typing import Dict
import sentencepiece as spm
import numpy as np

logger = logging.get_logger(__name__)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "THUDM/chatglm-6b": 2048,
}


class TextTokenizer:

    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)
        self.num_tokens = self.sp.vocab_size()

    def encode(self, text):
        return self.sp.EncodeAsIds(text)

    def decode(self, ids: List[int]):
        return self.sp.DecodeIds(ids)

    def tokenize(self, text):
        return self.sp.EncodeAsPieces(text)

    def convert_tokens_to_string(self, tokens):
        return self.sp.DecodePieces(tokens)

    def convert_tokens_to_ids(self, tokens):
        return [self.sp.PieceToId(token) for token in tokens]

    def convert_token_to_id(self, token):
        return self.sp.PieceToId(token)

    def convert_id_to_token(self, idx):
        return self.sp.IdToPiece(idx)

    def __len__(self):
        return self.num_tokens


class SPTokenizer:

    def __init__(self,
                 vocab_file,
                 num_image_tokens=20000,
                 max_blank_length=80,
                 byte_fallback=True):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.num_image_tokens = num_image_tokens
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.max_blank_length = max_blank_length
        self.byte_fallback = byte_fallback
        self.text_tokenizer = TextTokenizer(vocab_file)

    def _get_text_tokenizer(self):
        return self.text_tokenizer

    @staticmethod
    def get_blank_token(length: int):
        assert length >= 2
        return f"<|blank_{length}|>"

    @staticmethod
    def get_tab_token():
        return f"<|tab|>"

    @property
    def num_text_tokens(self):
        return self.text_tokenizer.num_tokens

    @property
    def num_tokens(self):
        return self.num_image_tokens + self.num_text_tokens

    @staticmethod
    def _encode_whitespaces(text: str, max_len: int = 80):
        text = text.replace("\t", SPTokenizer.get_tab_token())
        for i in range(max_len, 1, -1):
            text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
        return text

    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = self._encode_whitespaces(text, max_len=self.max_blank_length)
        return text

    def encode(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[int]:
        """
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (\n) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self._preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text
        tmp = self._get_text_tokenizer().encode(text)
        tokens = [x + self.num_image_tokens for x in tmp]
        return tokens if add_dummy_prefix else tokens[2:]

    def postprocess(self, text):
        text = text.replace("<n>", "\n")
        text = text.replace(SPTokenizer.get_tab_token(), "\t")
        for i in range(2, self.max_blank_length + 1):
            text = text.replace(self.get_blank_token(i), " " * i)
        return text

    def decode(self, text_ids: List[int]) -> str:
        ids = [int(_id) - self.num_image_tokens for _id in text_ids]
        ids = [_id for _id in ids if _id >= 0]
        text = self._get_text_tokenizer().decode(ids)
        text = self.postprocess(text)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self._get_text_tokenizer().convert_tokens_to_string(tokens)
        text = self.postprocess(text)
        return text

    def tokenize(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[str]:
        """
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (\n) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self._preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text
        tokens = self._get_text_tokenizer().tokenize(text)
        return tokens if add_dummy_prefix else tokens[2:]

    def __getitem__(self, x: Union[int, str]):
        if isinstance(x, int):
            if x < self.num_image_tokens:
                return "<image_{}>".format(x)
            else:
                return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
        elif isinstance(x, str):
            if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
                return int(x[7:-1])
            else:
                return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
        else:
            raise ValueError("The key should be str or int.")


class ChatGLMTokenizer(PreTrainedTokenizer):
    """
    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = {"vocab_file": "ice_text.model"}
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 remove_space=False,
                 bos_token='<sop>',
                 eos_token='<eop>',
                 end_token='</s>',
                 mask_token='[MASK]',
                 gmask_token='[gMASK]',
                 padding_side="left",
                 pad_token="<pad>",
                 unk_token="<unk>",
                 num_image_tokens=20000,
                 **kwargs) -> None:
        super().__init__(do_lower_case=do_lower_case,
                         remove_space=remove_space,
                         padding_side=padding_side,
                         bos_token=bos_token,
                         eos_token=eos_token,
                         end_token=end_token,
                         mask_token=mask_token,
                         gmask_token=gmask_token,
                         pad_token=pad_token,
                         unk_token=unk_token,
                         num_image_tokens=num_image_tokens,
                         **kwargs)

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.vocab_file = vocab_file

        self.bos_token = bos_token
        self.eos_token = eos_token
        self.end_token = end_token
        self.mask_token = mask_token
        self.gmask_token = gmask_token

        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
        """ Initialisation """

    @property
    def gmask_token_id(self) -> Optional[int]:
        if self.gmask_token is None:
            return None
        return self.convert_tokens_to_ids(self.gmask_token)

    @property
    def end_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        if self.end_token is None:
            return None
        return self.convert_tokens_to_ids(self.end_token)

    @property
    def vocab_size(self):
        """ Returns vocab size """
        return self.sp_tokenizer.num_tokens

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs

        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, **kwargs):
        """ Returns a tokenized string. """
        text = self.preprocess_text(text)

        seq = self.sp_tokenizer.tokenize(text)

        return seq

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.sp_tokenizer.decode_tokens(tokens)

    def _decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if len(token_ids) == 0:
            return ""
        if self.pad_token_id in token_ids:    # remove pad
            token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
        return super()._decode(token_ids, **kwargs)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.sp_tokenizer[token]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_tokenizer[index]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"])
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        gmask_id = self.sp_tokenizer[self.gmask_token]
        eos_id = self.sp_tokenizer[self.eos_token]
        token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1
        return token_ids_0

    def _pad(self,
             encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
             max_length: Optional[int] = None,
             padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
             pad_to_multiple_of: Optional[int] = None,
             return_attention_mask: Optional[bool] = None) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        bos_token_id = self.sp_tokenizer[self.bos_token]
        mask_token_id = self.sp_tokenizer[self.mask_token]
        gmask_token_id = self.sp_tokenizer[self.gmask_token]
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if max_length is not None:
            if "attention_mask" not in encoded_inputs:
                if bos_token_id in required_input:
                    context_length = required_input.index(bos_token_id)
                else:
                    context_length = seq_length
                attention_mask = np.ones((1, seq_length, seq_length))
                attention_mask = np.tril(attention_mask)
                attention_mask[:, :, :context_length] = 1
                attention_mask = np.bool_(attention_mask < 0.5)
                encoded_inputs["attention_mask"] = attention_mask

            if "position_ids" not in encoded_inputs:
                if bos_token_id in required_input:
                    context_length = required_input.index(bos_token_id)
                else:
                    context_length = seq_length
                position_ids = np.arange(seq_length, dtype=np.int64)
                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
                if mask_token in required_input:
                    mask_position = required_input.index(mask_token)
                    position_ids[context_length:] = mask_position
                block_position_ids = np.concatenate(
                    [np.zeros(context_length, dtype=np.int64),
                     np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
                encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
                                                          pad_width=[(0, 0), (difference, 0), (difference, 0)],
                                                          mode='constant',
                                                          constant_values=True)
            if "token_type_ids" in encoded_inputs:
                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                    "token_type_ids"]
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
                                                        pad_width=[(0, 0), (difference, 0)])
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
\ No newline at end of file
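
The _pad override above is where ChatGLM's GLM-style inputs are assembled: a 3-D attention mask that is fully visible over the context and causal afterwards (with True marking masked positions), plus stacked 2-D position ids where every generated token points back at the mask position. A small numpy sketch (assumed sizes: seq_length=6, context_length=3, mask token at index 2; not part of the commit) reproducing that logic:

import numpy as np

seq_length, context_length, mask_position = 6, 3, 2

attention_mask = np.tril(np.ones((1, seq_length, seq_length)))
attention_mask[:, :, :context_length] = 1          # context is fully visible to everyone
attention_mask = np.bool_(attention_mask < 0.5)    # True marks *masked* positions

position_ids = np.arange(seq_length, dtype=np.int64)
position_ids[context_length:] = mask_position      # generated tokens all point at [gMASK]
block_position_ids = np.concatenate([
    np.zeros(context_length, dtype=np.int64),
    np.arange(1, seq_length - context_length + 1, dtype=np.int64),
])
position_ids = np.stack([position_ids, block_position_ids], axis=0)
print(position_ids)
# [[0 1 2 2 2 2]
#  [0 0 0 1 2 3]]

Left padding then shifts both tensors with np.pad so that the pad columns land on the left, which is why the class asserts padding_side == "left".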
applications/Chat/coati/models/chatglm/configuration_chatglm.py (new file, mode 100644)

"""
This code is copied from https://huggingface.co/THUDM/chatglm-6b/resolve/main/configuration_chatglm.py
"""
""" ChatGLM model configuration """

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class ChatGLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`~ChatGLMModel`].
    It is used to instantiate an ChatGLM model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 150528):
            Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~ChatGLMModel`] or
            [`~TFChatGLMModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        inner_hidden_size (`int`, *optional*, defaults to 16384):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        max_sequence_length (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models).
    Example:

    ```python
    >>> from configuration_chatglm import ChatGLMConfig
    >>> from modeling_chatglm import ChatGLMModel

    >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
    >>> configuration = ChatGLMConfig()

    >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
    >>> model = ChatGLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "chatglm"

    def __init__(self,
                 vocab_size=130528,
                 hidden_size=4096,
                 num_layers=28,
                 num_attention_heads=32,
                 layernorm_epsilon=1e-5,
                 use_cache=True,
                 bos_token_id=130004,
                 eos_token_id=130005,
                 mask_token_id=130000,
                 gmask_token_id=130001,
                 pad_token_id=3,
                 max_sequence_length=2048,
                 inner_hidden_size=16384,
                 position_encoding_2d=True,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.max_sequence_length = max_sequence_length
        self.layernorm_epsilon = layernorm_epsilon
        self.inner_hidden_size = inner_hidden_size
        self.use_cache = use_cache
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.mask_token_id = mask_token_id
        self.gmask_token_id = gmask_token_id
        self.position_encoding_2d = position_encoding_2d
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection

        super().__init__(pad_token_id=pad_token_id,
                         bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)
\ No newline at end of file
applications/Chat/coati/models/chatglm/modeling_chatglm.py (new file, mode 100644, +1439 lines)

(Diff collapsed in the page view; the file is a copy of the ChatGLMForConditionalGeneration modeling code and is not reproduced here.)
applications/Chat/coati/trainer/sft.py

@@ -52,9 +52,13 @@ class SFTTrainer(SLTrainer):
         for batch_id, batch in enumerate(self.train_dataloader):

             batch = to_device(batch, torch.cuda.current_device())
-            outputs = self.model(batch["input_ids"],
-                                 attention_mask=batch["attention_mask"],
-                                 labels=batch["labels"])
+            if "attention_mask" in batch:
+                outputs = self.model(batch["input_ids"],
+                                     attention_mask=batch["attention_mask"],
+                                     labels=batch["labels"])
+            else:
+                outputs = self.model(batch["input_ids"], labels=batch["labels"])

             loss = outputs.loss
             loss = loss / self.accumulation_steps
applications/Chat/examples/requirements.txt

 pandas>=1.4.1
 sentencepiece
+colossalai==0.3.1
\ No newline at end of file
applications/Chat/examples/train_sft.py

@@ -9,13 +9,15 @@ from coati.models.bloom import BLOOMActor
 from coati.models.gpt import GPTActor
 from coati.models.llama import LlamaActor
 from coati.models.opt import OPTActor
+from coati.models.chatglm import ChatGLMActor
 from coati.trainer import SFTTrainer
 from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from datasets import load_dataset
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
+from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, AutoModel
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 from transformers.trainer import get_scheduler
@@ -58,6 +60,8 @@ def train(args):
         model = LlamaActor(pretrained=args.pretrain,
                            lora_rank=args.lora_rank,
                            checkpoint=args.grad_checkpoint)
+    elif args.model == 'chatglm':
+        model = ChatGLMActor(pretrained=args.pretrain)
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
@@ -81,6 +85,9 @@ def train(args):
             "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer)
         tokenizer.eos_token = '<\s>'
         tokenizer.pad_token = tokenizer.unk_token
+    elif args.model == 'chatglm':
+        tokenizer = ChatGLMTokenizer.from_pretrained(
+            "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True)
     else:
         raise ValueError(f'Unsupported model "{args.model}"')
@@ -99,7 +106,6 @@ def train(args):
         optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
     else:
         optim = Adam(model.parameters(), lr=args.lr)
-
     logger = get_dist_logger()

     # configure dataset
@@ -185,7 +191,7 @@ if __name__ == '__main__':
     parser.add_argument('--strategy',
                         choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'],
                         default='colossalai_zero2')
-    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama', 'chatglm'], default='bloom')
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default=None)
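
With these changes, a ChatGLM SFT run can presumably be launched along the lines of `train_sft.py --model chatglm --strategy colossalai_zero2 --pretrain THUDM/chatglm-6b --dataset <path>` (an illustrative invocation using only the flags defined above; the launcher and remaining arguments follow the repository's existing examples). When `--tokenizer` is omitted, the script falls back to the `THUDM/chatglm-6b` tokenizer, and LoRA options do not apply since ChatGLMActor fixes `lora_rank` to 0.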
applications/Chat/requirements-test.txt

 pytest
+colossalai==0.3.1
\ No newline at end of file
applications/Chat/requirements.txt

@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai>=0.2.4
+colossalai==0.3.1
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
applications/Chat/tests/test_dataset.py

@@ -11,7 +11,7 @@ from coati.dataset.sft_dataset import IGNORE_INDEX, SFTDataset, SupervisedDatase
 from datasets import load_dataset
 from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, PreTrainedTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer

 SFT_DATASET = [
     {
         "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
@@ -66,6 +66,8 @@ def make_tokenizer(model: str):
     elif model == "llama":
         tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
         tokenizer.pad_token = tokenizer.unk_token
+    elif model == "chatglm":
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
     else:
         raise ValueError(f"Unsupported model '{model}'")
     return tokenizer
@@ -81,12 +83,18 @@ def check_content(input_ids_stripped: torch.Tensor,
     elif model == "llama":
         assert input_ids_stripped[0] == tokenizer.bos_token_id
         input_ids_stripped = input_ids_stripped[1:]
+    elif model == "chatglm":
+        assert input_ids_stripped[0] == tokenizer.bos_token_id
+        assert input_ids_stripped[-1] == tokenizer.eos_token_id
+        input_ids_stripped = input_ids_stripped[1:-1]
     assert torch.all(input_ids_stripped != tokenizer.pad_token_id)
     assert torch.all(input_ids_stripped != tokenizer.bos_token_id)
     assert torch.all(input_ids_stripped != tokenizer.eos_token_id)
     assert input_ids_stripped != tokenizer.sep_token_id
     assert input_ids_stripped != tokenizer.cls_token_id
-    assert input_ids_stripped != tokenizer.mask_token_id
+    if model == "chatglm":
+        assert torch.all(input_ids_stripped != tokenizer.mask_token_id)
+    else:
+        assert input_ids_stripped != tokenizer.mask_token_id
@@ -189,7 +197,7 @@ def test_reward_dataset(model: str,
 @pytest.mark.cpu
-@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
+@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
 @pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
 @pytest.mark.parametrize("max_dataset_size", [2])
 @pytest.mark.parametrize("max_length", [32, 1024])
@@ -213,6 +221,19 @@ def test_sft_dataset(model: str,
                              max_length=max_length)
     assert len(sft_dataset) == min(max_dataset_size, len(SFT_DATASET))

+    if isinstance(tokenizer, ChatGLMTokenizer):
+        for i in range(max_dataset_size):
+            assert isinstance(sft_dataset[i], dict)
+            assert list(sft_dataset[i].keys()) == ["input_ids", "labels"]
+            input_ids = sft_dataset[i]["input_ids"]
+            labels = sft_dataset[i]["labels"]
+            assert input_ids.shape == labels.shape == torch.Size([max_length])
+
+            ignore_mask = labels == IGNORE_INDEX
+            assert input_ids.masked_select(torch.logical_not(ignore_mask))[0] == tokenizer.bos_token_id
+            check_content(input_ids.masked_select(torch.logical_not(ignore_mask)), tokenizer, model)
+        return
+
     for i in range(max_dataset_size):
         assert isinstance(sft_dataset[i], dict)
         assert list(sft_dataset[i].keys()) == ["input_ids", "labels", "attention_mask"]
applications/Chat/tests/test_models.py

@@ -9,11 +9,12 @@ from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
 from coati.models.generation import generate
 from coati.models.gpt import GPTRM, GPTActor, GPTCritic
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
+from coati.models.chatglm import ChatGLMActor
 from coati.models.lora import LoraLinear, convert_to_lora_module
 from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.models.utils import calc_action_log_probs, compute_reward, masked_mean
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer


 @pytest.mark.gpu
 @pytest.mark.parametrize("batch_size", [4])
@@ -23,7 +24,8 @@ from coati.models.utils import calc_action_log_probs, compute_reward, masked_mea
     lambda: GPTActor(),
     # HACK: skip llama due to long execution time
     # lambda: LlamaActor(),
-    lambda: OPTActor()
+    lambda: OPTActor(),
+    # lambda: ChatGLMActor(),
 ])
 @pytest.mark.parametrize("generate_kwargs", [{
     "max_length": 64,
@@ -129,12 +131,12 @@ def test_lora(lora_rank: int,
     # HACK: skip llama due to long execution time
     # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
     lambda: (OPTActor(), OPTCritic(), OPTRM()),
+    lambda: (ChatGLMActor(), None, None),
 ])
 @torch.no_grad()
 def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
                 batch_size: int,
                 seq_len: int):
     actor_input = {
         "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
         "attention_mask": torch.randint(0, 2, (batch_size, seq_len))
@@ -150,19 +152,29 @@ def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
     }
     actor, critic, rm = models_maker()
+    if isinstance(actor, ChatGLMActor):
+        actor = actor.float()
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+        chatglm_special_token = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
+        actor_input = {
+            "input_ids":
+                torch.cat((torch.randint(0, 100, (batch_size, seq_len // 2)), chatglm_special_token,
+                           torch.randint(0, 100, (batch_size, seq_len // 2 - 2))),
+                          dim=1),
+            "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len))
+        }
     assert isinstance(actor, Actor)
     base_actor_model = get_base_model(actor)
-    assert isinstance(critic, Critic)
-    base_critic_model = get_base_model(critic)
-    assert isinstance(rm, RewardModel)
-    base_rm_model = get_base_model(rm)
-
     actor_output = actor(**actor_input)
-    critic_output = critic(**critic_input)
-    rm_output = rm(**rm_input)
-
-    assert actor_output.logits.shape[:2] == (batch_size, seq_len)
-    assert critic_output.shape == (batch_size,)
-    assert rm_output.shape == (batch_size,)
+    assert actor_output.logits.shape[:2] == (batch_size, seq_len)
+
+    if critic:
+        assert isinstance(critic, Critic)
+        base_critic_model = get_base_model(critic)
+        critic_output = critic(**critic_input)
+        assert critic_output.shape == (batch_size,)
+
+    if rm:
+        assert isinstance(rm, RewardModel)
+        base_rm_model = get_base_model(rm)
+        rm_output = rm(**rm_input)
+        assert rm_output.shape == (batch_size,)