Unverified commit fd65aa98, authored by Lucain, committed by GitHub

Set `usedforsecurity=False` in hashlib methods (FIPS compliance) (#27483)

* Set usedforsecurity=False in hashlib methods (FIPS compliance)

* trigger ci

* tokenizers version

* deps

* bump hfh version

* let's try this
parent 5603fad2
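
For reference, `huggingface_hub.utils.insecure_hashlib` wraps the standard hashlib constructors with `usedforsecurity=False` pre-applied, so MD5/SHA digests that are only used for caching, fingerprinting, and checksum comparison keep working on FIPS-enabled Python builds. Below is a minimal sketch of the idea (simplified and assumed, not the exact huggingface_hub implementation; note that `usedforsecurity` is only accepted by the hashlib constructors on Python 3.9+):

# Sketch of an insecure_hashlib-style shim (assumed, simplified).
import functools
import hashlib
import sys

if sys.version_info >= (3, 9):
    # hashlib constructors accept usedforsecurity from Python 3.9 onwards;
    # passing False tells FIPS-enforcing builds the digest is not used for security purposes.
    md5 = functools.partial(hashlib.md5, usedforsecurity=False)
    sha1 = functools.partial(hashlib.sha1, usedforsecurity=False)
    sha256 = functools.partial(hashlib.sha256, usedforsecurity=False)
else:
    md5 = hashlib.md5
    sha1 = hashlib.sha1
    sha256 = hashlib.sha256

# Call sites then switch from hashlib.md5(...) to insecure_hashlib.md5(...), e.g.:
# from huggingface_hub.utils import insecure_hashlib
# fingerprint = insecure_hashlib.md5(b"cache key").hexdigest()
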
 import gzip
-import hashlib
 import json
 import multiprocessing
 import os
@@ -11,6 +10,7 @@ from pathlib import Path
 import numpy as np
 from arguments import PreprocessingArguments
 from datasets import load_dataset
+from huggingface_hub.utils import insecure_hashlib
 from minhash_deduplication import deduplicate_dataset
 from transformers import AutoTokenizer, HfArgumentParser
@@ -21,7 +21,7 @@ PATTERN = re.compile(r"\s+")
 def get_hash(example):
     """Get hash of content field."""
-    return {"hash": hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}
+    return {"hash": insecure_hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}
 def line_stats(example):
...
@@ -28,7 +28,6 @@ import tempfile
 from collections import OrderedDict
 from contextlib import contextmanager
 from functools import partial
-from hashlib import sha256
 from io import BytesIO
 from pathlib import Path
 from urllib.parse import urlparse
@@ -39,6 +38,7 @@ import numpy as np
 import requests
 import wget
 from filelock import FileLock
+from huggingface_hub.utils import insecure_hashlib
 from PIL import Image
 from tqdm.auto import tqdm
 from yaml import Loader, dump, load
@@ -402,12 +402,12 @@ def get_from_cache(
 def url_to_filename(url, etag=None):
     url_bytes = url.encode("utf-8")
-    url_hash = sha256(url_bytes)
+    url_hash = insecure_hashlib.sha256(url_bytes)
     filename = url_hash.hexdigest()
     if etag:
         etag_bytes = etag.encode("utf-8")
-        etag_hash = sha256(etag_bytes)
+        etag_hash = insecure_hashlib.sha256(etag_bytes)
         filename += "." + etag_hash.hexdigest()
     if url.endswith(".h5"):
...
@@ -28,7 +28,6 @@ import tempfile
 from collections import OrderedDict
 from contextlib import contextmanager
 from functools import partial
-from hashlib import sha256
 from io import BytesIO
 from pathlib import Path
 from urllib.parse import urlparse
@@ -39,6 +38,7 @@ import numpy as np
 import requests
 import wget
 from filelock import FileLock
+from huggingface_hub.utils import insecure_hashlib
 from PIL import Image
 from tqdm.auto import tqdm
 from yaml import Loader, dump, load
@@ -402,12 +402,12 @@ def get_from_cache(
 def url_to_filename(url, etag=None):
     url_bytes = url.encode("utf-8")
-    url_hash = sha256(url_bytes)
+    url_hash = insecure_hashlib.sha256(url_bytes)
     filename = url_hash.hexdigest()
     if etag:
         etag_bytes = etag.encode("utf-8")
-        etag_hash = sha256(etag_bytes)
+        etag_hash = insecure_hashlib.sha256(etag_bytes)
         filename += "." + etag_hash.hexdigest()
     if url.endswith(".h5"):
...
@@ -118,7 +118,7 @@ _deps = [
     "fugashi>=1.0",
     "GitPython<3.1.19",
     "hf-doc-builder>=0.3.0",
-    "huggingface-hub>=0.16.4,<1.0",
+    "huggingface-hub>=0.19.3,<1.0",
     "importlib_metadata",
     "ipadic>=1.0.0,<2.0",
     "isort>=5.5.4",
@@ -321,6 +321,7 @@ extras["testing"] = (
         "rjieba",
         "beautifulsoup4",
         "tensorboard",
+        "pydantic",
     )
     + extras["retrieval"]
     + extras["modelcreation"]
...
@@ -25,7 +25,7 @@ deps = {
     "fugashi": "fugashi>=1.0",
     "GitPython": "GitPython<3.1.19",
     "hf-doc-builder": "hf-doc-builder>=0.3.0",
-    "huggingface-hub": "huggingface-hub>=0.16.4,<1.0",
+    "huggingface-hub": "huggingface-hub>=0.19.3,<1.0",
     "importlib_metadata": "importlib_metadata",
     "ipadic": "ipadic>=1.0.0,<2.0",
     "isort": "isort>=5.5.4",
...
@@ -15,7 +15,6 @@
 # limitations under the License.
 import argparse
-import hashlib
 import io
 import json
 import os
@@ -24,6 +23,7 @@ import urllib
 import warnings
 import torch
+from huggingface_hub.utils import insecure_hashlib
 from torch import nn
 from tqdm import tqdm
@@ -114,7 +114,7 @@ def _download(url: str, root: str) -> io.BytesIO:
     if os.path.isfile(download_target):
         model_bytes = open(download_target, "rb").read()
-        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
+        if insecure_hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
             return torch.load(io.BytesIO(model_bytes))
         else:
             warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
@@ -132,7 +132,7 @@ def _download(url: str, root: str) -> io.BytesIO:
             loop.update(len(buffer))
     model_bytes = open(download_target, "rb").read()
-    if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
+    if insecure_hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
         raise RuntimeError(
             "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model."
         )
...
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import unittest
+from huggingface_hub.utils import insecure_hashlib
 from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
 from transformers.pipelines import DepthEstimationPipeline, pipeline
 from transformers.testing_utils import (
@@ -44,7 +45,7 @@ else:
 def hashimage(image: Image) -> str:
-    m = hashlib.md5(image.tobytes())
+    m = insecure_hashlib.md5(image.tobytes())
     return m.hexdigest()
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import tempfile
 import unittest
 from typing import Dict
@@ -21,6 +20,7 @@ import datasets
 import numpy as np
 import requests
 from datasets import load_dataset
+from huggingface_hub.utils import insecure_hashlib
 from transformers import (
     MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
@@ -59,7 +59,7 @@ else:
 def hashimage(image: Image) -> str:
-    m = hashlib.md5(image.tobytes())
+    m = insecure_hashlib.md5(image.tobytes())
     return m.hexdigest()[:10]
...
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import hashlib
 import unittest
 from typing import Dict
 import numpy as np
+from huggingface_hub.utils import insecure_hashlib
 from transformers import (
     MODEL_FOR_MASK_GENERATION_MAPPING,
@@ -46,7 +46,7 @@ else:
 def hashimage(image: Image) -> str:
-    m = hashlib.md5(image.tobytes())
+    m = insecure_hashlib.md5(image.tobytes())
     return m.hexdigest()[:10]