"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "7fb2a8b3d93104fbe45667fffc6716cb26b9515e"
Unverified commit db034660, authored by Thomas Wang, committed by GitHub
Browse files

Fix hashing for deduplication (#17048)

parent 39f8eafc
 import gzip
+import hashlib
 import multiprocessing
 import os
 import shutil
@@ -13,7 +14,7 @@ from transformers import HfArgumentParser
 def get_hash(example):
     """Get hash of content field."""
-    return {"hash": hash(example["content"])}
+    return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()}
 def line_stats(example):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment