Unverified commit db034660, authored by Thomas Wang, committed by GitHub
Browse files

Fix hashing for deduplication (#17048)

parent 39f8eafc
import gzip
import hashlib
import multiprocessing
import os
import shutil
......@@ -13,7 +14,7 @@ from transformers import HfArgumentParser
def get_hash(example):
    """Get a deterministic hash of the example's content field.

    The builtin ``hash()`` is deliberately not used: for ``str`` it is salted
    per interpreter process (unless ``PYTHONHASHSEED`` is fixed), so the same
    content could receive different values in different runs or worker
    processes. An MD5 hex digest depends only on the text itself.

    Args:
        example: Mapping with a ``"content"`` key holding a ``str``.

    Returns:
        A dict ``{"hash": <32-character hexadecimal MD5 digest>}`` computed
        over the content with leading/trailing whitespace stripped and
        encoded as UTF-8.
    """
    # Strip first so that contents differing only in surrounding whitespace
    # map to the same hash.
    content = example["content"].strip()
    return {"hash": hashlib.md5(content.encode("utf-8")).hexdigest()}
def line_stats(example):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment