fix vocab size in binarized_data (distil): int16 vs int32

2ae98336 · VictorSanh · 0dbddba6 · 2ae98336
Commit 2ae98336 authored Feb 18, 2020 by VictorSanh
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 2 deletions

examples/distillation/scripts/binarized_data.py +6 -2

No files found.
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -75,13 +75,17 @@ def main():
        iter += 1
        if iter % interval == 0:
            end = time.time()
-            logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
+            logger.info(f"{iter} examples processed. - {(end-start):.2f}s/{interval}expl")
            start = time.time()
    logger.info("Finished binarization")
    logger.info(f"{len(data)} examples processed.")
    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
+    vocab_size = tokenizer.vocab_size
+    if vocab_size < (1 << 16):
        rslt_ = [np.uint16(d) for d in rslt]
+    else:
+        rslt_ = [np.int32(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f"Dump to {dp_file}")
    with open(dp_file, "wb") as handle: