Unverified commit fd32ebed, authored by Thomas Wolf, committed by GitHub
Browse files

Merge pull request #42 from weiyumou/master

Fixed UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2
parents eed255a5 9ff2b7d8
...@@ -99,7 +99,7 @@ from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM ...@@ -99,7 +99,7 @@ from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenized input # Tokenized input
tokenized_text = "Who was Jim Henson ? Jim Henson was a puppeteer" text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text) tokenized_text = tokenizer.tokenize(text)
# Mask a token that we will try to predict back with `BertForMaskedLM` # Mask a token that we will try to predict back with `BertForMaskedLM`
......
...@@ -65,7 +65,7 @@ def load_vocab(vocab_file): ...@@ -65,7 +65,7 @@ def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary.""" """Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict() vocab = collections.OrderedDict()
index = 0 index = 0
with open(vocab_file, "r") as reader: with open(vocab_file, "r", encoding="utf8") as reader:
while True: while True:
token = convert_to_unicode(reader.readline()) token = convert_to_unicode(reader.readline())
if not token: if not token:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment