Unverified Commit e43e1126 authored by Bhavitvya Malik, committed by GitHub

update desc for map in all examples (#12226)

* update desc for map in all examples

* added plm

* suggestions
parent adb70eda
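The change is the same in every touched script: each `datasets.Dataset.map(...)` call gains a `desc` argument, and each requirements file and script now requires datasets >= 1.8.0, the version the `desc` argument needs. Below is a minimal, self-contained sketch of what the new argument does; it is not part of the diff, and the toy data and `tokenize` function are invented for illustration. `desc` simply sets the text shown on the tqdm progress bar while preprocessing runs.

# Illustrative sketch only (not from the diff): how the added `desc` argument is used.
from datasets import Dataset

# Tiny in-memory dataset, stand-in for the real corpora loaded by the examples.
raw = Dataset.from_dict({"text": ["hello world", "update desc for map", "in all examples"]})

def tokenize(batch):
    # Stand-in for a real tokenizer: split on whitespace.
    return {"tokens": [t.split() for t in batch["text"]]}

tokenized = raw.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",  # shown next to the progress bar while map runs
)
print(tokenized[0])

The diffs below apply exactly this pattern to the Trainer-based and no_trainer variants of the language-modeling, question-answering, summarization, token-classification, and translation examples.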
torch >= 1.3
-datasets >= 1.1.3
+datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
@@ -46,10 +46,12 @@ from transformers import (
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
logger = logging.getLogger(__name__)
@@ -355,6 +357,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on dataset",
)
if data_args.block_size is None:
@@ -401,6 +404,7 @@ def main():
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
+desc=f"Grouping texts in chunks of {block_size}",
)
if training_args.do_train:
...
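The other repeated addition is the `require_version` guard placed right after `check_min_version`. A hedged sketch of how that guard behaves at import time follows; the call itself is exactly what the diff adds, but the surrounding try/except and the assumption about the raised exception type are illustrative, not part of the change.

# Illustrative only: effect of the require_version guard added above.
# Assumption (hedged): an unsatisfied requirement raises an exception carrying the
# hint string; the exact exception class is not guaranteed here.
from transformers.utils.versions import require_version

try:
    require_version(
        "datasets>=1.8.0",
        "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt",
    )
    print("datasets is recent enough; the desc argument to map() will be accepted")
except Exception as err:  # hedged: exact exception type may vary across versions
    print(f"dependency check failed: {err}")

In the example scripts the call is left unguarded so that a too-old datasets install fails immediately with the pip hint rather than later inside `map()`.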
@@ -48,9 +48,13 @@ from transformers import (
get_scheduler,
set_seed,
)
+from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -300,6 +304,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on dataset",
)
if args.block_size is None:
@@ -346,6 +351,7 @@ def main():
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
+desc=f"Grouping texts in chunks of {block_size}",
)
train_dataset = lm_datasets["train"]
...
@@ -45,10 +45,12 @@ from transformers import (
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
@@ -380,6 +382,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on dataset line_by_line",
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
@@ -394,6 +397,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on every text in dataset",
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
@@ -424,6 +428,7 @@ def main():
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
+desc=f"Grouping texts in chunks of {max_seq_length}",
)
if training_args.do_train:
...
@@ -48,9 +48,11 @@ from transformers import (
get_scheduler,
set_seed,
)
+from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -346,6 +348,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on dataset line_by_line",
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
@@ -360,6 +363,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on every text in dataset",
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
@@ -390,6 +394,7 @@ def main():
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
+desc=f"Grouping texts in chunks of {max_seq_length}",
)
train_dataset = tokenized_datasets["train"]
...
@@ -41,10 +41,12 @@ from transformers import (
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
logger = logging.getLogger(__name__)
@@ -358,6 +360,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on dataset line_by_line",
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
@@ -370,6 +373,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on every text in dataset",
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
@@ -400,6 +404,7 @@ def main():
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
+desc=f"Grouping texts in chunks of {max_seq_length}",
)
if training_args.do_train:
...
-datasets >= 1.4.0
+datasets >= 1.8.0
torch >= 1.3.0
@@ -42,11 +42,13 @@ from transformers import (
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
logger = logging.getLogger(__name__)
@@ -417,6 +419,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if data_args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
@@ -478,6 +481,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if data_args.max_eval_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
@@ -497,6 +501,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
if data_args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
...
@@ -41,11 +41,13 @@ from transformers import (
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions_with_beam_search
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
logger = logging.getLogger(__name__)
@@ -429,6 +431,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if data_args.max_train_samples is not None:
# Select samples from dataset again since Feature Creation might increase number of features
@@ -514,6 +517,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if data_args.max_eval_samples is not None:
# Selecting Samples from Dataset again since Feature Creation might increase samples size
@@ -533,6 +537,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
if data_args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
...
@@ -46,11 +46,13 @@ from transformers import (
set_seed,
)
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions_with_beam_search
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
logger = logging.getLogger(__name__)
@@ -419,6 +421,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
@@ -503,6 +506,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if args.max_eval_samples is not None:
@@ -523,6 +527,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
if args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
...
@@ -48,11 +48,13 @@ from transformers import (
set_seed,
)
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
logger = logging.getLogger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`
@@ -448,6 +450,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
@@ -508,6 +511,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if args.max_eval_samples is not None:
@@ -528,6 +532,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
if args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
...
-datasets >= 1.1.3
+datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
rouge-score
...
@@ -43,10 +43,12 @@ from transformers import (
from transformers.file_utils import is_offline_mode
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
logger = logging.getLogger(__name__)
@@ -433,6 +435,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if training_args.do_eval:
@@ -448,6 +451,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if training_args.do_predict:
@@ -463,6 +467,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
# Data collator
...
@@ -48,9 +48,12 @@ from transformers import (
set_seed,
)
from transformers.file_utils import is_offline_mode
+from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -419,7 +422,11 @@ def main():
return model_inputs
processed_datasets = raw_datasets.map(
-preprocess_function, batched=True, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache
+preprocess_function,
+batched=True,
+remove_columns=column_names,
+load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
...
seqeval
-datasets >= 1.1.3
+datasets >= 1.8.0
torch >= 1.3
@@ -42,10 +42,12 @@ from transformers import (
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
logger = logging.getLogger(__name__)
@@ -388,6 +390,7 @@ def main():
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if training_args.do_eval:
@@ -401,6 +404,7 @@ def main():
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if training_args.do_predict:
@@ -414,6 +418,7 @@ def main():
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
# Data collator
...
@@ -45,9 +45,12 @@ from transformers import (
get_scheduler,
set_seed,
)
+from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -381,7 +384,10 @@ def main():
return tokenized_inputs
processed_raw_datasets = raw_datasets.map(
-tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names
+tokenize_and_align_labels,
+batched=True,
+remove_columns=raw_datasets["train"].column_names,
+desc="Running tokenizer on dataset",
)
train_dataset = processed_raw_datasets["train"]
...
-datasets >= 1.1.3
+datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
sacrebleu >= 1.4.12
...
@@ -46,10 +46,12 @@ from transformers import (
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.8.0.dev0")
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
logger = logging.getLogger(__name__)
@@ -427,6 +429,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on train dataset",
)
if training_args.do_eval:
@@ -442,6 +445,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on validation dataset",
)
if training_args.do_predict:
@@ -457,6 +461,7 @@ def main():
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
+desc="Running tokenizer on prediction dataset",
)
# Data collator
...
@@ -48,9 +48,12 @@ from transformers import (
get_scheduler,
set_seed,
)
+from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -401,6 +404,7 @@ def main():
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
+desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
...