Commit 10ad7495 authored by Paco Guzman, committed by Facebook Github Bot
Browse files

sort dictionary items lexicographically for consistency

Summary: Sorts dictionary items lexicographically before creating the Counter. This makes distributed preprocessing deterministic.

Reviewed By: myleott

Differential Revision: D14678214

fbshipit-source-id: 7a9e2f0cb367e8fb76da01e108dda4c6c5aab505
parent eef6663c
...@@ -116,7 +116,7 @@ class Dictionary(object): ...@@ -116,7 +116,7 @@ class Dictionary(object):
new_symbols = self.symbols[:self.nspecial] new_symbols = self.symbols[:self.nspecial]
new_count = self.count[:self.nspecial] new_count = self.count[:self.nspecial]
c = Counter(dict(zip(self.symbols[self.nspecial:], self.count[self.nspecial:]))) c = Counter(dict(sorted(zip(self.symbols[self.nspecial:], self.count[self.nspecial:]))))
for symbol, count in c.most_common(nwords - self.nspecial): for symbol, count in c.most_common(nwords - self.nspecial):
if count >= threshold: if count >= threshold:
new_indices[symbol] = len(new_symbols) new_indices[symbol] = len(new_symbols)
...@@ -261,7 +261,7 @@ class Dictionary(object): ...@@ -261,7 +261,7 @@ class Dictionary(object):
@staticmethod @staticmethod
def add_file_to_dictionary(filename, dict, tokenize, num_workers): def add_file_to_dictionary(filename, dict, tokenize, num_workers):
def merge_result(counter): def merge_result(counter):
for w, c in counter.items(): for w, c in sorted(counter.items()):
dict.add_symbol(w, c) dict.add_symbol(w, c)
if num_workers > 1: if num_workers > 1:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment