Unverified Commit 38dbbc26 authored by Sam Passaglia's avatar Sam Passaglia Committed by GitHub
Browse files

Fix bug leading to missing token in GPTSanJapaneseTokenizer (#23883)

* add \n

* removed copied from header
parent 03db5910
...@@ -55,7 +55,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { ...@@ -55,7 +55,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
} }
# Copied from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese.load_vocab_and_emoji
def load_vocab_and_emoji(vocab_file, emoji_file): def load_vocab_and_emoji(vocab_file, emoji_file):
"""Loads a vocabulary file and emoji file into a dictionary.""" """Loads a vocabulary file and emoji file into a dictionary."""
with open(emoji_file, "r", encoding="utf-8") as f: with open(emoji_file, "r", encoding="utf-8") as f:
...@@ -66,7 +65,7 @@ def load_vocab_and_emoji(vocab_file, emoji_file): ...@@ -66,7 +65,7 @@ def load_vocab_and_emoji(vocab_file, emoji_file):
ids_to_tokens = collections.OrderedDict() ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f: with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines() token = f.readlines()
token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token] token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token): for idx, b in enumerate(token):
ids_to_tokens[idx] = b ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx raw_vocab[",".join(b)] = idx
......
Markdown is supported
0% or drag and drop a file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment