Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
e43e1126
Unverified
Commit
e43e1126
authored
Jun 18, 2021
by
Bhavitvya Malik
Committed by
GitHub
Jun 17, 2021
Browse files
update desc for map in all examples (#12226)
* update desc for map in all examples * added plm * suggestions
parent
adb70eda
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
84 additions
and
7 deletions
+84
-7
examples/pytorch/language-modeling/requirements.txt
examples/pytorch/language-modeling/requirements.txt
+1
-1
examples/pytorch/language-modeling/run_clm.py
examples/pytorch/language-modeling/run_clm.py
+4
-0
examples/pytorch/language-modeling/run_clm_no_trainer.py
examples/pytorch/language-modeling/run_clm_no_trainer.py
+6
-0
examples/pytorch/language-modeling/run_mlm.py
examples/pytorch/language-modeling/run_mlm.py
+5
-0
examples/pytorch/language-modeling/run_mlm_no_trainer.py
examples/pytorch/language-modeling/run_mlm_no_trainer.py
+5
-0
examples/pytorch/language-modeling/run_plm.py
examples/pytorch/language-modeling/run_plm.py
+5
-0
examples/pytorch/question-answering/requirements.txt
examples/pytorch/question-answering/requirements.txt
+1
-1
examples/pytorch/question-answering/run_qa.py
examples/pytorch/question-answering/run_qa.py
+5
-0
examples/pytorch/question-answering/run_qa_beam_search.py
examples/pytorch/question-answering/run_qa_beam_search.py
+5
-0
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
...torch/question-answering/run_qa_beam_search_no_trainer.py
+5
-0
examples/pytorch/question-answering/run_qa_no_trainer.py
examples/pytorch/question-answering/run_qa_no_trainer.py
+5
-0
examples/pytorch/summarization/requirements.txt
examples/pytorch/summarization/requirements.txt
+1
-1
examples/pytorch/summarization/run_summarization.py
examples/pytorch/summarization/run_summarization.py
+5
-0
examples/pytorch/summarization/run_summarization_no_trainer.py
...les/pytorch/summarization/run_summarization_no_trainer.py
+8
-1
examples/pytorch/token-classification/requirements.txt
examples/pytorch/token-classification/requirements.txt
+1
-1
examples/pytorch/token-classification/run_ner.py
examples/pytorch/token-classification/run_ner.py
+5
-0
examples/pytorch/token-classification/run_ner_no_trainer.py
examples/pytorch/token-classification/run_ner_no_trainer.py
+7
-1
examples/pytorch/translation/requirements.txt
examples/pytorch/translation/requirements.txt
+1
-1
examples/pytorch/translation/run_translation.py
examples/pytorch/translation/run_translation.py
+5
-0
examples/pytorch/translation/run_translation_no_trainer.py
examples/pytorch/translation/run_translation_no_trainer.py
+4
-0
No files found.
examples/pytorch/language-modeling/requirements.txt
View file @
e43e1126
torch >= 1.3
torch >= 1.3
datasets >= 1.
1.3
datasets >= 1.
8.0
sentencepiece != 0.1.92
sentencepiece != 0.1.92
protobuf
protobuf
examples/pytorch/language-modeling/run_clm.py
View file @
e43e1126
...
@@ -46,10 +46,12 @@ from transformers import (
...
@@ -46,10 +46,12 @@ from transformers import (
from
transformers.testing_utils
import
CaptureLogger
from
transformers.testing_utils
import
CaptureLogger
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/language-modeling/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -355,6 +357,7 @@ def main():
...
@@ -355,6 +357,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
)
if
data_args
.
block_size
is
None
:
if
data_args
.
block_size
is
None
:
...
@@ -401,6 +404,7 @@ def main():
...
@@ -401,6 +404,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
block_size
}
"
,
)
)
if
training_args
.
do_train
:
if
training_args
.
do_train
:
...
...
examples/pytorch/language-modeling/run_clm_no_trainer.py
View file @
e43e1126
...
@@ -48,9 +48,13 @@ from transformers import (
...
@@ -48,9 +48,13 @@ from transformers import (
get_scheduler
,
get_scheduler
,
set_seed
,
set_seed
,
)
)
from
transformers.utils.versions
import
require_version
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/language-modeling/requirements.txt"
)
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
...
@@ -300,6 +304,7 @@ def main():
...
@@ -300,6 +304,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
)
if
args
.
block_size
is
None
:
if
args
.
block_size
is
None
:
...
@@ -346,6 +351,7 @@ def main():
...
@@ -346,6 +351,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
block_size
}
"
,
)
)
train_dataset
=
lm_datasets
[
"train"
]
train_dataset
=
lm_datasets
[
"train"
]
...
...
examples/pytorch/language-modeling/run_mlm.py
View file @
e43e1126
...
@@ -45,10 +45,12 @@ from transformers import (
...
@@ -45,10 +45,12 @@ from transformers import (
)
)
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/language-modeling/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
MODEL_CONFIG_CLASSES
=
list
(
MODEL_FOR_MASKED_LM_MAPPING
.
keys
())
MODEL_CONFIG_CLASSES
=
list
(
MODEL_FOR_MASKED_LM_MAPPING
.
keys
())
...
@@ -380,6 +382,7 @@ def main():
...
@@ -380,6 +382,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
[
text_column_name
],
remove_columns
=
[
text_column_name
],
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset line_by_line"
,
)
)
else
:
else
:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
...
@@ -394,6 +397,7 @@ def main():
...
@@ -394,6 +397,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on every text in dataset"
,
)
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
...
@@ -424,6 +428,7 @@ def main():
...
@@ -424,6 +428,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
)
)
if
training_args
.
do_train
:
if
training_args
.
do_train
:
...
...
examples/pytorch/language-modeling/run_mlm_no_trainer.py
View file @
e43e1126
...
@@ -48,9 +48,11 @@ from transformers import (
...
@@ -48,9 +48,11 @@ from transformers import (
get_scheduler
,
get_scheduler
,
set_seed
,
set_seed
,
)
)
from
transformers.utils.versions
import
require_version
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/language-modeling/requirements.txt"
)
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
...
@@ -346,6 +348,7 @@ def main():
...
@@ -346,6 +348,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
[
text_column_name
],
remove_columns
=
[
text_column_name
],
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset line_by_line"
,
)
)
else
:
else
:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
...
@@ -360,6 +363,7 @@ def main():
...
@@ -360,6 +363,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on every text in dataset"
,
)
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
...
@@ -390,6 +394,7 @@ def main():
...
@@ -390,6 +394,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
)
)
train_dataset
=
tokenized_datasets
[
"train"
]
train_dataset
=
tokenized_datasets
[
"train"
]
...
...
examples/pytorch/language-modeling/run_plm.py
View file @
e43e1126
...
@@ -41,10 +41,12 @@ from transformers import (
...
@@ -41,10 +41,12 @@ from transformers import (
)
)
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/language-modeling/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -358,6 +360,7 @@ def main():
...
@@ -358,6 +360,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
[
text_column_name
],
remove_columns
=
[
text_column_name
],
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset line_by_line"
,
)
)
else
:
else
:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
...
@@ -370,6 +373,7 @@ def main():
...
@@ -370,6 +373,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on every text in dataset"
,
)
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
...
@@ -400,6 +404,7 @@ def main():
...
@@ -400,6 +404,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
f
"Grouping texts in chunks of
{
max_seq_length
}
"
,
)
)
if
training_args
.
do_train
:
if
training_args
.
do_train
:
...
...
examples/pytorch/question-answering/requirements.txt
View file @
e43e1126
datasets >= 1.
4
.0
datasets >= 1.
8
.0
torch >= 1.3.0
torch >= 1.3.0
examples/pytorch/question-answering/run_qa.py
View file @
e43e1126
...
@@ -42,11 +42,13 @@ from transformers import (
...
@@ -42,11 +42,13 @@ from transformers import (
)
)
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
from
utils_qa
import
postprocess_qa_predictions
from
utils_qa
import
postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/question-answering/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -417,6 +419,7 @@ def main():
...
@@ -417,6 +419,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
data_args
.
max_train_samples
is
not
None
:
if
data_args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, We select only specified max samples
# Number of samples might increase during Feature Creation, We select only specified max samples
...
@@ -478,6 +481,7 @@ def main():
...
@@ -478,6 +481,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
data_args
.
max_eval_samples
is
not
None
:
if
data_args
.
max_eval_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
@@ -497,6 +501,7 @@ def main():
...
@@ -497,6 +501,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
if
data_args
.
max_predict_samples
is
not
None
:
if
data_args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
...
examples/pytorch/question-answering/run_qa_beam_search.py
View file @
e43e1126
...
@@ -41,11 +41,13 @@ from transformers import (
...
@@ -41,11 +41,13 @@ from transformers import (
)
)
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
from
utils_qa
import
postprocess_qa_predictions_with_beam_search
from
utils_qa
import
postprocess_qa_predictions_with_beam_search
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/question-answering/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -429,6 +431,7 @@ def main():
...
@@ -429,6 +431,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
data_args
.
max_train_samples
is
not
None
:
if
data_args
.
max_train_samples
is
not
None
:
# Select samples from dataset again since Feature Creation might increase number of features
# Select samples from dataset again since Feature Creation might increase number of features
...
@@ -514,6 +517,7 @@ def main():
...
@@ -514,6 +517,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
data_args
.
max_eval_samples
is
not
None
:
if
data_args
.
max_eval_samples
is
not
None
:
# Selecting Samples from Dataset again since Feature Creation might increase samples size
# Selecting Samples from Dataset again since Feature Creation might increase samples size
...
@@ -533,6 +537,7 @@ def main():
...
@@ -533,6 +537,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
if
data_args
.
max_predict_samples
is
not
None
:
if
data_args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
...
examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
View file @
e43e1126
...
@@ -46,11 +46,13 @@ from transformers import (
...
@@ -46,11 +46,13 @@ from transformers import (
set_seed
,
set_seed
,
)
)
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
from
utils_qa
import
postprocess_qa_predictions_with_beam_search
from
utils_qa
import
postprocess_qa_predictions_with_beam_search
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/question-answering/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -419,6 +421,7 @@ def main():
...
@@ -419,6 +421,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
args
.
max_train_samples
is
not
None
:
if
args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, We select only specified max samples
# Number of samples might increase during Feature Creation, We select only specified max samples
...
@@ -503,6 +506,7 @@ def main():
...
@@ -503,6 +506,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
args
.
max_eval_samples
is
not
None
:
if
args
.
max_eval_samples
is
not
None
:
...
@@ -523,6 +527,7 @@ def main():
...
@@ -523,6 +527,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
if
args
.
max_predict_samples
is
not
None
:
if
args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
...
examples/pytorch/question-answering/run_qa_no_trainer.py
View file @
e43e1126
...
@@ -48,11 +48,13 @@ from transformers import (
...
@@ -48,11 +48,13 @@ from transformers import (
set_seed
,
set_seed
,
)
)
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
from
utils_qa
import
postprocess_qa_predictions
from
utils_qa
import
postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/question-answering/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
# You should update this to your particular problem to have better documentation of `model_type`
# You should update this to your particular problem to have better documentation of `model_type`
...
@@ -448,6 +450,7 @@ def main():
...
@@ -448,6 +450,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
args
.
max_train_samples
is
not
None
:
if
args
.
max_train_samples
is
not
None
:
# Number of samples might increase during Feature Creation, We select only specified max samples
# Number of samples might increase during Feature Creation, We select only specified max samples
...
@@ -508,6 +511,7 @@ def main():
...
@@ -508,6 +511,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
args
.
max_eval_samples
is
not
None
:
if
args
.
max_eval_samples
is
not
None
:
...
@@ -528,6 +532,7 @@ def main():
...
@@ -528,6 +532,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
if
args
.
max_predict_samples
is
not
None
:
if
args
.
max_predict_samples
is
not
None
:
# During Feature creation dataset samples might increase, we will select required samples again
# During Feature creation dataset samples might increase, we will select required samples again
...
...
examples/pytorch/summarization/requirements.txt
View file @
e43e1126
datasets >= 1.
1.3
datasets >= 1.
8.0
sentencepiece != 0.1.92
sentencepiece != 0.1.92
protobuf
protobuf
rouge-score
rouge-score
...
...
examples/pytorch/summarization/run_summarization.py
View file @
e43e1126
...
@@ -43,10 +43,12 @@ from transformers import (
...
@@ -43,10 +43,12 @@ from transformers import (
from
transformers.file_utils
import
is_offline_mode
from
transformers.file_utils
import
is_offline_mode
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/summarization/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -433,6 +435,7 @@ def main():
...
@@ -433,6 +435,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
training_args
.
do_eval
:
if
training_args
.
do_eval
:
...
@@ -448,6 +451,7 @@ def main():
...
@@ -448,6 +451,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
training_args
.
do_predict
:
if
training_args
.
do_predict
:
...
@@ -463,6 +467,7 @@ def main():
...
@@ -463,6 +467,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
# Data collator
# Data collator
...
...
examples/pytorch/summarization/run_summarization_no_trainer.py
View file @
e43e1126
...
@@ -48,9 +48,12 @@ from transformers import (
...
@@ -48,9 +48,12 @@ from transformers import (
set_seed
,
set_seed
,
)
)
from
transformers.file_utils
import
is_offline_mode
from
transformers.file_utils
import
is_offline_mode
from
transformers.utils.versions
import
require_version
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/summarization/requirements.txt"
)
# You should update this to your particular problem to have better documentation of `model_type`
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
...
@@ -419,7 +422,11 @@ def main():
...
@@ -419,7 +422,11 @@ def main():
return
model_inputs
return
model_inputs
processed_datasets
=
raw_datasets
.
map
(
processed_datasets
=
raw_datasets
.
map
(
preprocess_function
,
batched
=
True
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
preprocess_function
,
batched
=
True
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
)
train_dataset
=
processed_datasets
[
"train"
]
train_dataset
=
processed_datasets
[
"train"
]
...
...
examples/pytorch/token-classification/requirements.txt
View file @
e43e1126
seqeval
seqeval
datasets >= 1.
1.3
datasets >= 1.
8.0
torch >= 1.3
torch >= 1.3
examples/pytorch/token-classification/run_ner.py
View file @
e43e1126
...
@@ -42,10 +42,12 @@ from transformers import (
...
@@ -42,10 +42,12 @@ from transformers import (
)
)
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/token-classification/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -388,6 +390,7 @@ def main():
...
@@ -388,6 +390,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
training_args
.
do_eval
:
if
training_args
.
do_eval
:
...
@@ -401,6 +404,7 @@ def main():
...
@@ -401,6 +404,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
training_args
.
do_predict
:
if
training_args
.
do_predict
:
...
@@ -414,6 +418,7 @@ def main():
...
@@ -414,6 +418,7 @@ def main():
batched
=
True
,
batched
=
True
,
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
# Data collator
# Data collator
...
...
examples/pytorch/token-classification/run_ner_no_trainer.py
View file @
e43e1126
...
@@ -45,9 +45,12 @@ from transformers import (
...
@@ -45,9 +45,12 @@ from transformers import (
get_scheduler
,
get_scheduler
,
set_seed
,
set_seed
,
)
)
from
transformers.utils.versions
import
require_version
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/token-classification/requirements.txt"
)
# You should update this to your particular problem to have better documentation of `model_type`
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
...
@@ -381,7 +384,10 @@ def main():
...
@@ -381,7 +384,10 @@ def main():
return
tokenized_inputs
return
tokenized_inputs
processed_raw_datasets
=
raw_datasets
.
map
(
processed_raw_datasets
=
raw_datasets
.
map
(
tokenize_and_align_labels
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
tokenize_and_align_labels
,
batched
=
True
,
remove_columns
=
raw_datasets
[
"train"
].
column_names
,
desc
=
"Running tokenizer on dataset"
,
)
)
train_dataset
=
processed_raw_datasets
[
"train"
]
train_dataset
=
processed_raw_datasets
[
"train"
]
...
...
examples/pytorch/translation/requirements.txt
View file @
e43e1126
datasets >= 1.
1.3
datasets >= 1.
8.0
sentencepiece != 0.1.92
sentencepiece != 0.1.92
protobuf
protobuf
sacrebleu >= 1.4.12
sacrebleu >= 1.4.12
...
...
examples/pytorch/translation/run_translation.py
View file @
e43e1126
...
@@ -46,10 +46,12 @@ from transformers import (
...
@@ -46,10 +46,12 @@ from transformers import (
)
)
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.trainer_utils
import
get_last_checkpoint
from
transformers.utils
import
check_min_version
from
transformers.utils
import
check_min_version
from
transformers.utils.versions
import
require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version
(
"4.8.0.dev0"
)
check_min_version
(
"4.8.0.dev0"
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/translation/requirements.txt"
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -427,6 +429,7 @@ def main():
...
@@ -427,6 +429,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on train dataset"
,
)
)
if
training_args
.
do_eval
:
if
training_args
.
do_eval
:
...
@@ -442,6 +445,7 @@ def main():
...
@@ -442,6 +445,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on validation dataset"
,
)
)
if
training_args
.
do_predict
:
if
training_args
.
do_predict
:
...
@@ -457,6 +461,7 @@ def main():
...
@@ -457,6 +461,7 @@ def main():
num_proc
=
data_args
.
preprocessing_num_workers
,
num_proc
=
data_args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
desc
=
"Running tokenizer on prediction dataset"
,
)
)
# Data collator
# Data collator
...
...
examples/pytorch/translation/run_translation_no_trainer.py
View file @
e43e1126
...
@@ -48,9 +48,12 @@ from transformers import (
...
@@ -48,9 +48,12 @@ from transformers import (
get_scheduler
,
get_scheduler
,
set_seed
,
set_seed
,
)
)
from
transformers.utils.versions
import
require_version
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
require_version
(
"datasets>=1.8.0"
,
"To fix: pip install -r examples/pytorch/translation/requirements.txt"
)
# You should update this to your particular problem to have better documentation of `model_type`
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_CONFIG_CLASSES
=
list
(
MODEL_MAPPING
.
keys
())
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
MODEL_TYPES
=
tuple
(
conf
.
model_type
for
conf
in
MODEL_CONFIG_CLASSES
)
...
@@ -401,6 +404,7 @@ def main():
...
@@ -401,6 +404,7 @@ def main():
num_proc
=
args
.
preprocessing_num_workers
,
num_proc
=
args
.
preprocessing_num_workers
,
remove_columns
=
column_names
,
remove_columns
=
column_names
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
load_from_cache_file
=
not
args
.
overwrite_cache
,
desc
=
"Running tokenizer on dataset"
,
)
)
train_dataset
=
processed_datasets
[
"train"
]
train_dataset
=
processed_datasets
[
"train"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment