"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "84b9579da70d2195774f072644dc1c4a2f1e2344"
Unverified Commit 38dbbc26 authored by Sam Passaglia's avatar Sam Passaglia Committed by GitHub
Browse files

Fix bug leading to missing token in GPTSanJapaneseTokenizer (#23883)

* add \n

* removed copied from header
parent 03db5910
......@@ -55,7 +55,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
}
# Copied from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese.load_vocab_and_emoji
def load_vocab_and_emoji(vocab_file, emoji_file):
"""Loads a vocabulary file and emoji file into a dictionary."""
with open(emoji_file, "r", encoding="utf-8") as f:
......@@ -66,7 +65,7 @@ def load_vocab_and_emoji(vocab_file, emoji_file):
ids_to_tokens = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
token = f.readlines()
token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token]
token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
for idx, b in enumerate(token):
ids_to_tokens[idx] = b
raw_vocab[",".join(b)] = idx
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment