Unverified commit db034660, authored by Thomas Wang and committed by GitHub
Browse files

Fix hashing for deduplication (#17048)

parent 39f8eafc
 import gzip
+import hashlib
 import multiprocessing
 import os
 import shutil
@@ -13,7 +14,7 @@ from transformers import HfArgumentParser
 def get_hash(example):
     """Get hash of content field."""
-    return {"hash": hash(example["content"])}
+    return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()}
 def line_stats(example):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment