"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "63caa370e6c618dbe7d3fd4cbf545cc32eca1a15"
Unverified Commit 539ee456, authored by Bhadresh Savani, committed by GitHub

[Examples] Replicates the new --log_level feature to all trainer-based pytorch (#12359)

* added log_level

* fix comment

* fixed log_level

* Trigger CI

* Unified logging

* simplified args for log_level
parent 64e60980
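
In short, every patched script swaps its hard-coded `logging.INFO`/`logging.WARN` choice for a single level derived from the training arguments, and pushes that level into the `datasets` and `transformers` loggers as well. A minimal standalone sketch of the resulting pattern (the argument-parsing boilerplate is assumed from the example scripts, not part of this diff):

import logging
import sys

import datasets
import transformers
from transformers import HfArgumentParser, TrainingArguments

# Parse the training flags the same way the example scripts do;
# TrainingArguments requires --output_dir on the command line.
parser = HfArgumentParser(TrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses()

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# get_process_log_level() resolves --log_level for this process: with the
# "passive" default it yields INFO on the main process and WARNING on
# replicas; an explicit --log_level (debug/info/warning/error/critical)
# overrides the main-process level.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

logger.info("visible unless --log_level is above info")

Running a patched script with, e.g., `--log_level warning` therefore quiets the script's own logger and both libraries at once, which is the point of the unification.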
@@ -28,6 +28,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -203,18 +204,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -246,15 +248,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
             )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
@@ -273,7 +277,7 @@ def main():
         )
         if extension == "txt":
             extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)

     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -334,9 +338,9 @@ def main():
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
@@ -352,7 +356,7 @@ def main():
             )
         return output

-    tokenized_datasets = datasets.map(
+    tokenized_datasets = raw_datasets.map(
         tokenize_function,
         batched=True,
         num_proc=data_args.preprocessing_num_workers,
...
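A side note on the `datasets` to `raw_datasets` rename threaded through the hunks above and below: each script now calls `datasets.utils.logging.set_verbosity(log_level)` inside `main()`, while the old code also assigned `datasets = load_dataset(...)` there, which would make `datasets` a function-local name and break the module reference. A hypothetical minimal reproduction (not from the commit):

import logging

import datasets  # the Hugging Face `datasets` library the scripts now import


def main():
    # Because `datasets` is assigned later in this function, Python treats it
    # as a local name throughout main(), so this line raises UnboundLocalError
    # before load_dataset() is ever reached -- hence the rename.
    datasets.utils.logging.set_verbosity(logging.WARNING)
    datasets = {"train": [0, 1, 2]}  # stand-in for `datasets = load_dataset(...)`


try:
    main()
except UnboundLocalError as err:
    print(err)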
@@ -28,6 +28,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -212,7 +213,13 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
@@ -220,10 +227,6 @@ def main():
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -255,15 +258,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
             )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
@@ -278,7 +283,7 @@ def main():
         extension = data_args.train_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)

     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -337,9 +342,9 @@ def main():
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

     if data_args.max_seq_length is None:
@@ -377,7 +382,7 @@ def main():
                 return_special_tokens_mask=True,
             )

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -392,7 +397,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
...
@@ -25,6 +25,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -209,18 +210,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -252,15 +254,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
             )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
@@ -275,7 +279,7 @@ def main():
         extension = data_args.train_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)

     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -334,9 +338,9 @@ def main():
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

     if data_args.max_seq_length > tokenizer.model_max_length:
@@ -355,7 +359,7 @@ def main():
             examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
             return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
@@ -368,7 +372,7 @@ def main():
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name])

-        tokenized_datasets = datasets.map(
+        tokenized_datasets = raw_datasets.map(
             tokenize_function,
             batched=True,
             num_proc=data_args.preprocessing_num_workers,
...
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional, Union

+import datasets
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -220,18 +221,18 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -268,10 +269,10 @@ def main():
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
         extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     else:
         # Downloading and loading the swag dataset from the hub.
-        datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)

     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -347,9 +348,9 @@ def main():
         return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
@@ -360,9 +361,9 @@ def main():
         )

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
...
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset, load_metric

 import transformers
@@ -216,18 +217,19 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -259,7 +261,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -272,7 +276,7 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -314,11 +318,11 @@ def main():
     # Preprocessing the datasets.
     # Preprocessing is slighlty different for training and evaluation.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     else:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
     question_column_name = "question" if "question" in column_names else column_names[0]
     context_column_name = "context" if "context" in column_names else column_names[1]
     answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -407,9 +411,9 @@ def main():
         return tokenized_examples

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if agument is specified
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -469,9 +473,9 @@ def main():
         return tokenized_examples

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = datasets["validation"]
+        eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
@@ -489,9 +493,9 @@ def main():
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

     if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_examples = datasets["test"]
+        predict_examples = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
@@ -529,7 +533,7 @@ def main():
             max_answer_length=data_args.max_answer_length,
             null_score_diff_threshold=data_args.null_score_diff_threshold,
             output_dir=training_args.output_dir,
-            is_world_process_zero=trainer.is_world_process_zero(),
+            log_level=log_level,
             prefix=stage,
         )
         # Format the result to the format the metric expects.
...
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset, load_metric

 import transformers
@@ -215,18 +216,18 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
     logger.info(f"Training/evaluation parameters {training_args}")

     # Detecting last checkpoint.
@@ -258,7 +259,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -270,7 +273,7 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -303,11 +306,11 @@ def main():
     # Preprocessing the datasets.
     # Preprocessing is slighlty different for training and evaluation.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     else:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
     question_column_name = "question" if "question" in column_names else column_names[0]
     context_column_name = "context" if "context" in column_names else column_names[1]
     answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -419,9 +422,9 @@ def main():
         return tokenized_examples

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # Select samples from Dataset, This will help to decrease processing time
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -505,9 +508,9 @@ def main():
         return tokenized_examples

     if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = datasets["validation"]
+        eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # Selecting Eval Samples from Dataset
             eval_examples = eval_examples.select(range(data_args.max_eval_samples))
@@ -525,9 +528,9 @@ def main():
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

     if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_examples = datasets["test"]
+        predict_examples = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             # We will select sample from whole data
             predict_examples = predict_examples.select(range(data_args.max_predict_samples))
@@ -566,7 +569,7 @@ def main():
             start_n_top=model.config.start_n_top,
             end_n_top=model.config.end_n_top,
             output_dir=training_args.output_dir,
-            is_world_process_zero=trainer.is_world_process_zero(),
+            log_level=log_level,
             prefix=stage,
         )
         # Format the result to the format the metric expects.
...
@@ -38,7 +38,7 @@ def postprocess_qa_predictions(
     null_score_diff_threshold: float = 0.0,
     output_dir: Optional[str] = None,
     prefix: Optional[str] = None,
-    is_world_process_zero: bool = True,
+    log_level: Optional[int] = logging.WARNING,
 ):
     """
     Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
@@ -70,8 +70,8 @@ def postprocess_qa_predictions(
             answers, are saved in `output_dir`.
         prefix (:obj:`str`, `optional`):
             If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
     """
     assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
     all_start_logits, all_end_logits = predictions
@@ -91,7 +91,7 @@ def postprocess_qa_predictions(
     scores_diff_json = collections.OrderedDict()

     # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
+    logger.setLevel(log_level)
     logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

     # Let's loop over all the examples!
@@ -250,7 +250,7 @@ def postprocess_qa_predictions_with_beam_search(
     end_n_top: int = 5,
     output_dir: Optional[str] = None,
     prefix: Optional[str] = None,
-    is_world_process_zero: bool = True,
+    log_level: Optional[int] = logging.WARNING,
 ):
     """
     Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
@@ -280,8 +280,8 @@ def postprocess_qa_predictions_with_beam_search(
             answers, are saved in `output_dir`.
         prefix (:obj:`str`, `optional`):
             If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
     """
     assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
     start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
@@ -302,7 +302,7 @@ def postprocess_qa_predictions_with_beam_search(
     scores_diff_json = collections.OrderedDict() if version_2_with_negative else None

     # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
+    logger.setLevel(log_level)
     logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

     # Let's loop over all the examples!
...
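The helper change above replaces the `is_world_process_zero` boolean with an explicit `logging` level, so the QA postprocessing inherits whatever `--log_level` resolved to (the question-answering hunks earlier pass `log_level=log_level` at the call sites). A self-contained sketch of the behavioural difference, with illustrative values:

import logging
import sys

logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger("postprocess_demo")

# Old scheme: verbosity depended only on "am I the main process?".
is_world_process_zero = False
logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
logger.info("old: replicas never see INFO")  # suppressed

# New scheme: the caller forwards the already-resolved process level,
# e.g. log_level=training_args.get_process_log_level(), so --log_level
# controls these logs too.
log_level = logging.INFO  # what get_process_log_level() might return
logger.setLevel(log_level)
logger.info("new: level follows --log_level")  # printed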
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import nltk  # Here to have a nice missing dependency error message early on
 import numpy as np
 from datasets import load_dataset, load_metric
@@ -260,16 +261,18 @@ def main():
         datefmt="%m/%d/%Y %H:%M:%S",
         handlers=[logging.StreamHandler(sys.stdout)],
     )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

     # Log on each process the small summary:
     logger.warning(
         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
     )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
     logger.info(f"Training/evaluation parameters {training_args}")

     if data_args.source_prefix is None and model_args.model_name_or_path in [
@@ -313,7 +316,9 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
     else:
         data_files = {}
         if data_args.train_file is not None:
@@ -325,7 +330,7 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -366,11 +371,11 @@ def main():
     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.
     if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
     elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
     elif training_args.do_predict:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
     else:
         logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
         return
@@ -425,9 +430,9 @@ def main():
         return model_inputs

     if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
             raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
@@ -441,9 +446,9 @@ def main():
     if training_args.do_eval:
         max_target_length = data_args.val_max_target_length
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
@@ -457,9 +462,9 @@ def main():
     if training_args.do_predict:
         max_target_length = data_args.val_max_target_length
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test"]
+        predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
         predict_dataset = predict_dataset.map(
...
...@@ -23,6 +23,7 @@ import sys ...@@ -23,6 +23,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import numpy as np import numpy as np
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
...@@ -204,18 +205,19 @@ def main(): ...@@ -204,18 +205,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
...@@ -250,10 +252,12 @@ def main(): ...@@ -250,10 +252,12 @@ def main():
# download the dataset. # download the dataset.
if data_args.task_name is not None: if data_args.task_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
elif data_args.dataset_name is not None: elif data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
# Loading a dataset from your local files. # Loading a dataset from your local files.
# CSV/JSON training and evaluation files are needed. # CSV/JSON training and evaluation files are needed.
...@@ -277,10 +281,10 @@ def main(): ...@@ -277,10 +281,10 @@ def main():
if data_args.train_file.endswith(".csv"): if data_args.train_file.endswith(".csv"):
# Loading a dataset from local csv files # Loading a dataset from local csv files
datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
else: else:
# Loading a dataset from local json files # Loading a dataset from local json files
datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset at # See more about loading any type of standard or custom dataset at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
...@@ -288,19 +292,19 @@ def main(): ...@@ -288,19 +292,19 @@ def main():
if data_args.task_name is not None: if data_args.task_name is not None:
is_regression = data_args.task_name == "stsb" is_regression = data_args.task_name == "stsb"
if not is_regression: if not is_regression:
label_list = datasets["train"].features["label"].names label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list) num_labels = len(label_list)
else: else:
num_labels = 1 num_labels = 1
else: else:
# Trying to have good defaults here, don't hesitate to tweak to your needs. # Trying to have good defaults here, don't hesitate to tweak to your needs.
is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
if is_regression: if is_regression:
num_labels = 1 num_labels = 1
else: else:
# A useful fast method: # A useful fast method:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
label_list = datasets["train"].unique("label") label_list = raw_datasets["train"].unique("label")
label_list.sort() # Let's sort it for determinism label_list.sort() # Let's sort it for determinism
num_labels = len(label_list) num_labels = len(label_list)
...@@ -332,12 +336,12 @@ def main(): ...@@ -332,12 +336,12 @@ def main():
use_auth_token=True if model_args.use_auth_token else None, use_auth_token=True if model_args.use_auth_token else None,
) )
# Preprocessing the datasets # Preprocessing the raw_datasets
if data_args.task_name is not None: if data_args.task_name is not None:
sentence1_key, sentence2_key = task_to_keys[data_args.task_name] sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
else: else:
# Again, we try to have some nice defaults but don't hesitate to tweak to your use case. # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
sentence1_key, sentence2_key = "sentence1", "sentence2" sentence1_key, sentence2_key = "sentence1", "sentence2"
else: else:
...@@ -396,30 +400,30 @@ def main(): ...@@ -396,30 +400,30 @@ def main():
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
return result return result
datasets = datasets.map( raw_datasets = raw_datasets.map(
preprocess_function, preprocess_function,
batched=True, batched=True,
load_from_cache_file=not data_args.overwrite_cache, load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on dataset", desc="Running tokenizer on dataset",
) )
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets and "validation_matched" not in datasets: if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
if "test" not in datasets and "test_matched" not in datasets: if "test" not in raw_datasets and "test_matched" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
...@@ -497,7 +501,7 @@ def main(): ...@@ -497,7 +501,7 @@ def main():
eval_datasets = [eval_dataset] eval_datasets = [eval_dataset]
if data_args.task_name == "mnli": if data_args.task_name == "mnli":
tasks.append("mnli-mm") tasks.append("mnli-mm")
eval_datasets.append(datasets["validation_mismatched"]) eval_datasets.append(raw_datasets["validation_mismatched"])
for eval_dataset, task in zip(eval_datasets, tasks): for eval_dataset, task in zip(eval_datasets, tasks):
metrics = trainer.evaluate(eval_dataset=eval_dataset) metrics = trainer.evaluate(eval_dataset=eval_dataset)
...@@ -518,7 +522,7 @@ def main(): ...@@ -518,7 +522,7 @@ def main():
predict_datasets = [predict_dataset] predict_datasets = [predict_dataset]
if data_args.task_name == "mnli": if data_args.task_name == "mnli":
tasks.append("mnli-mm") tasks.append("mnli-mm")
predict_datasets.append(datasets["test_mismatched"]) predict_datasets.append(raw_datasets["test_mismatched"])
for predict_dataset, task in zip(predict_datasets, tasks): for predict_dataset, task in zip(predict_datasets, tasks):
# Removing the `label` columns because it contains -1 and Trainer won't like that. # Removing the `label` columns because it contains -1 and Trainer won't like that.
......
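Note on the variable rename running through this file: because `import datasets` is now at module level in these scripts, keeping the loaded DatasetDict in a variable also called `datasets` would shadow the library and break the new `datasets.utils.logging.set_verbosity(...)` call. A minimal sketch of that clash, not taken from the diff itself (the "glue"/"mrpc" dataset is only an arbitrary example):

import logging

import datasets                      # the Hugging Face datasets library
from datasets import load_dataset

datasets.utils.logging.set_verbosity(logging.INFO)    # works: `datasets` is the module

# If the loaded DatasetDict were itself named `datasets`:
#     datasets = load_dataset("glue", "mrpc")
# a later datasets.utils.logging.set_verbosity(...) call would raise an
# AttributeError, because the local variable now shadows the imported module.

raw_datasets = load_dataset("glue", "mrpc")            # the rename keeps both usable
print(raw_datasets["train"].column_names)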
...@@ -24,6 +24,7 @@ import sys ...@@ -24,6 +24,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import numpy as np import numpy as np
from datasets import load_dataset, load_metric from datasets import load_dataset, load_metric
...@@ -174,19 +175,19 @@ def main(): ...@@ -174,19 +175,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
......
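For reference, the unified logging block that this commit replicates into each trainer-based example reads as follows when pulled into a helper. This is a sketch only: the helper name `setup_logging` is ours, and `training_args` is assumed to be a parsed TrainingArguments instance.

import logging
import sys

import datasets
import transformers

logger = logging.getLogger(__name__)

def setup_logging(training_args):
    # Same pattern as the blocks added across the example scripts in this commit.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    # get_process_log_level() resolves --log_level (and --log_level_replica on
    # non-main processes), so every logging backend uses one consistent level.
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()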
...@@ -25,6 +25,7 @@ import sys ...@@ -25,6 +25,7 @@ import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional from typing import Optional
import datasets
import numpy as np import numpy as np
from datasets import ClassLabel, load_dataset, load_metric from datasets import ClassLabel, load_dataset, load_metric
...@@ -195,18 +196,19 @@ def main(): ...@@ -195,18 +196,19 @@ def main():
datefmt="%m/%d/%Y %H:%M:%S", datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
) )
# Set the verbosity to info of the Transformers logger (on main process only):
if training_args.should_log:
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}") logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint. # Detecting last checkpoint.
...@@ -238,7 +240,9 @@ def main(): ...@@ -238,7 +240,9 @@ def main():
# download the dataset. # download the dataset.
if data_args.dataset_name is not None: if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub. # Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else: else:
data_files = {} data_files = {}
if data_args.train_file is not None: if data_args.train_file is not None:
...@@ -248,16 +252,16 @@ def main(): ...@@ -248,16 +252,16 @@ def main():
if data_args.test_file is not None: if data_args.test_file is not None:
data_files["test"] = data_args.test_file data_files["test"] = data_args.test_file
extension = data_args.train_file.split(".")[-1] extension = data_args.train_file.split(".")[-1]
datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html. # https://huggingface.co/docs/datasets/loading_datasets.html.
if training_args.do_train: if training_args.do_train:
column_names = datasets["train"].column_names column_names = raw_datasets["train"].column_names
features = datasets["train"].features features = raw_datasets["train"].features
else: else:
column_names = datasets["validation"].column_names column_names = raw_datasets["validation"].column_names
features = datasets["validation"].features features = raw_datasets["validation"].features
if data_args.text_column_name is not None: if data_args.text_column_name is not None:
text_column_name = data_args.text_column_name text_column_name = data_args.text_column_name
...@@ -288,7 +292,7 @@ def main(): ...@@ -288,7 +292,7 @@ def main():
# No need to convert the labels since they are already ints. # No need to convert the labels since they are already ints.
label_to_id = {i: i for i in range(len(label_list))} label_to_id = {i: i for i in range(len(label_list))}
else: else:
label_list = get_label_list(datasets["train"][label_column_name]) label_list = get_label_list(raw_datasets["train"][label_column_name])
label_to_id = {l: i for i, l in enumerate(label_list)} label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels = len(label_list) num_labels = len(label_list)
...@@ -381,9 +385,9 @@ def main(): ...@@ -381,9 +385,9 @@ def main():
return tokenized_inputs return tokenized_inputs
if training_args.do_train: if training_args.do_train:
if "train" not in datasets: if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset") raise ValueError("--do_train requires a train dataset")
train_dataset = datasets["train"] train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.select(range(data_args.max_train_samples))
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
...@@ -395,9 +399,9 @@ def main(): ...@@ -395,9 +399,9 @@ def main():
) )
if training_args.do_eval: if training_args.do_eval:
if "validation" not in datasets: if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset") raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation"] eval_dataset = raw_datasets["validation"]
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
eval_dataset = eval_dataset.map( eval_dataset = eval_dataset.map(
...@@ -409,9 +413,9 @@ def main(): ...@@ -409,9 +413,9 @@ def main():
) )
if training_args.do_predict: if training_args.do_predict:
if "test" not in datasets: if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset") raise ValueError("--do_predict requires a test dataset")
predict_dataset = datasets["test"] predict_dataset = raw_datasets["test"]
if data_args.max_predict_samples is not None: if data_args.max_predict_samples is not None:
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
predict_dataset = predict_dataset.map( predict_dataset = predict_dataset.map(
......
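The train/validation/test handling above follows one pattern per split: check that the split exists in raw_datasets, optionally truncate it via the --max_*_samples flags, then tokenize it with .map(...). A condensed sketch of that pattern, assuming a preprocess_function and data_args as in the example scripts (the helper name build_split is ours):

def build_split(raw_datasets, split, max_samples, preprocess_function, data_args):
    if split not in raw_datasets:
        raise ValueError(f"the requested '{split}' split is missing from the dataset")
    dataset = raw_datasets[split]
    if max_samples is not None:
        # Truncate the split for quick debugging runs (--max_train_samples and friends).
        dataset = dataset.select(range(max_samples))
    return dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=not data_args.overwrite_cache,
        desc=f"Running tokenizer on {split} dataset",
    )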
...@@ -344,7 +344,7 @@ def main(): ...@@ -344,7 +344,7 @@ def main():
model.resize_token_embeddings(len(tokenizer)) model.resize_token_embeddings(len(tokenizer))
# Preprocessing the raw_datasets. # Preprocessing the datasets.
# First we tokenize all the texts. # First we tokenize all the texts.
padding = "max_length" if args.pad_to_max_length else False padding = "max_length" if args.pad_to_max_length else False
......
...@@ -250,6 +250,8 @@ def main(): ...@@ -250,6 +250,8 @@ def main():
logger.setLevel(log_level) logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level) datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary: # Log on each process the small summary:
logger.warning( logger.warning(
......