Unverified Commit a3e607d1 authored by Nicolas Patry's avatar Nicolas Patry Committed by GitHub
Browse files

Supporting Merges.txt files than contain an endline. (#15782)

(`hf-internal-testing/tiny-clip` for instance)
parent 24588c67
...@@ -166,7 +166,7 @@ class CLIPTokenizer(PreTrainedTokenizer): ...@@ -166,7 +166,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
self.byte_encoder = bytes_to_unicode() self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle: with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1 : 49152 - 256 - 2 + 1] bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges] bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment