"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "fb560dcb075497f61880010245192e7e1fdbeca4"
Commit ae88eb88 authored by thomwolf

set encoding to 'utf-8' in calls to open

parent e1eab59a
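The motivation for the change: when open() is called without an explicit encoding, Python 3 falls back to the platform's preferred locale encoding (locale.getpreferredencoding(False)), which is often cp1252 on Windows rather than UTF-8. Data and config files that read fine on a UTF-8 Linux machine can then raise UnicodeDecodeError or silently mis-decode. A minimal, self-contained sketch of the pitfall (illustrative only, not code from this repository):

import locale
import tempfile

print("platform default encoding:", locale.getpreferredencoding(False))

# Write a small UTF-8 file containing a non-ASCII character.
with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt",
                                 delete=False) as tmp:
    tmp.write("café\n")
    path = tmp.name

# Relying on the platform default: may raise or mis-decode on non-UTF-8 locales.
try:
    with open(path, "r") as reader:  # encoding depends on the current locale
        print("default read:", reader.readline().rstrip())
except UnicodeDecodeError as exc:
    print("default read failed:", exc)

# Explicit encoding, as this commit adds everywhere: same behaviour on every platform.
with open(path, "r", encoding="utf-8") as reader:
    print("utf-8 read:", reader.readline().rstrip())

Passing encoding='utf-8' explicitly makes each open() call in the diff below deterministic across platforms and locales.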
@@ -168,7 +168,7 @@ def read_examples(input_file):
     """Read a list of `InputExample`s from an input file."""
     examples = []
     unique_id = 0
-    with open(input_file, "r") as reader:
+    with open(input_file, "r", encoding='utf-8') as reader:
         while True:
             line = reader.readline()
             if not line:
...
@@ -91,7 +91,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with open(input_file, "r", encoding='utf-8') as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
@@ -413,7 +413,8 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
...
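Besides adding args.fp16 to the message, this logger.info call switches from %-style arguments, which the logging module interpolates lazily only when the record is actually emitted, to a string built eagerly with str.format, matching the style already used by the other main() updated later in this diff. A small sketch contrasting the two styles with made-up values (not repository code):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

device, n_gpu, distributed, fp16 = "cuda", 1, False, False

# %-style arguments: formatting is deferred until the record is handled,
# so a message filtered out by the log level costs almost nothing.
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, distributed)

# str.format: the message string is built before logger.info() is entered;
# this is the style the updated call uses.
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
    device, n_gpu, distributed, fp16))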
@@ -108,7 +108,7 @@ class InputFeatures(object):
 
 def read_squad_examples(input_file, is_training):
     """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r") as reader:
+    with open(input_file, "r", encoding='utf-8') as reader:
         input_data = json.load(reader)["data"]
 
     def is_whitespace(c):
@@ -757,7 +757,7 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
         device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
...
@@ -100,7 +100,7 @@ class InputFeatures(object):
 
 def read_swag_examples(input_file, is_training):
-    with open(input_file, 'r') as f:
+    with open(input_file, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         lines = list(reader)
@@ -333,7 +333,8 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
...
@@ -227,7 +227,7 @@ def read_set_from_file(filename: str) -> Set[str]:
     Expected file format is one item per line.
     '''
     collection = set()
-    with open(filename, 'r') as file_:
+    with open(filename, 'r', encoding='utf-8') as file_:
         for line in file_:
             collection.add(line.rstrip())
     return collection
...
@@ -106,7 +106,7 @@ class BertConfig(object):
                 initializing all weight matrices.
         """
         if isinstance(vocab_size_or_config_json_file, str):
-            with open(vocab_size_or_config_json_file, "r") as reader:
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
                 self.__dict__[key] = value
@@ -137,7 +137,7 @@ class BertConfig(object):
     @classmethod
     def from_json_file(cls, json_file):
         """Constructs a `BertConfig` from a json file of parameters."""
-        with open(json_file, "r") as reader:
+        with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
         return cls.from_dict(json.loads(text))
...
@@ -41,7 +41,7 @@ setup(
     author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
-    long_description=open("README.md", "r").read(),
+    long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='BERT NLP deep learning google',
     license='Apache',
...