Commit d5faa74c authored by Julien Chaumond
Browse files

tokenizer white space: revert to previous behavior

parent 0b77d66a
...@@ -373,7 +373,7 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str]) -> List[List[ ...@@ -373,7 +373,7 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str]) -> List[List[
filepath = id_or_path filepath = id_or_path
with open(filepath, "r") as f: with open(filepath, "r") as f:
words = f.read().split("\n") words = f.read().split("\n")
bow_indices.append([TOKENIZER.encode(word) for word in words]) bow_indices.append([TOKENIZER.encode(word, add_prefix_space=True) for word in words])
return bow_indices return bow_indices
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment