Unverified commit 3055232b, authored by Karan Bansal and committed by GitHub
Browse files

[Feature] Add FIPS 140-3 compliant hash algorithm option for multimodal hashing (#32386)


Signed-off-by: Karan Bansal <karanb192@gmail.com>
parent 965765ae
...@@ -73,6 +73,7 @@ if TYPE_CHECKING: ...@@ -73,6 +73,7 @@ if TYPE_CHECKING:
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MEDIA_CONNECTOR: str = "http"
VLLM_MM_HASHER_ALGORITHM: str = "blake3"
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_MAIN_CUDA_VERSION: str = "12.9"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
...@@ -806,6 +807,17 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -806,6 +807,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
# imported at runtime. # imported at runtime.
# If a non-existing backend is used, an AssertionError will be thrown. # If a non-existing backend is used, an AssertionError will be thrown.
"VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"),
# Hash algorithm for multimodal content hashing.
# - "blake3": Default, fast cryptographic hash (not FIPS 140-3 compliant)
# - "sha256": FIPS 140-3 compliant, widely supported
# - "sha512": FIPS 140-3 compliant, faster on 64-bit systems
# Use sha256 or sha512 for FIPS compliance in government/enterprise deployments
"VLLM_MM_HASHER_ALGORITHM": env_with_choices(
"VLLM_MM_HASHER_ALGORITHM",
"blake3",
["blake3", "sha256", "sha512"],
case_sensitive=False,
),
# Path to the XLA persistent cache directory. # Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs. # Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser( "VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
import hashlib
import pickle import pickle
import uuid import uuid
from collections.abc import Iterable from collections.abc import Callable, Iterable
import numpy as np import numpy as np
import torch import torch
from blake3 import blake3
from PIL import Image from PIL import Image
import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from .media import MediaWithBytes from .media import MediaWithBytes
...@@ -17,6 +19,34 @@ from .media import MediaWithBytes ...@@ -17,6 +19,34 @@ from .media import MediaWithBytes
logger = init_logger(__name__) logger = init_logger(__name__)
@functools.lru_cache(maxsize=3)
def _get_hasher_factory(algorithm: str) -> Callable[[], "hashlib._Hash"]:
"""
Get the hasher factory based on the configured algorithm.
Args:
algorithm: Hash algorithm name (blake3, sha256, or sha512)
Returns a callable that creates a new hasher instance.
Supports blake3 (default), sha256, and sha512 for FIPS compliance.
See: https://github.com/vllm-project/vllm/issues/18334
"""
algorithm = algorithm.lower()
if algorithm == "blake3":
from blake3 import blake3
return blake3
elif algorithm == "sha256":
return hashlib.sha256
elif algorithm == "sha512":
return hashlib.sha512
else:
# This should never happen due to env_with_choices validation
raise ValueError(f"Unsupported hash algorithm: {algorithm}")
class MultiModalHasher: class MultiModalHasher:
@classmethod @classmethod
def serialize_item(cls, obj: object) -> Iterable[bytes | memoryview]: def serialize_item(cls, obj: object) -> Iterable[bytes | memoryview]:
...@@ -114,7 +144,8 @@ class MultiModalHasher: ...@@ -114,7 +144,8 @@ class MultiModalHasher:
@classmethod @classmethod
def hash_kwargs(cls, **kwargs: object) -> str: def hash_kwargs(cls, **kwargs: object) -> str:
hasher = blake3() hasher_factory = _get_hasher_factory(envs.VLLM_MM_HASHER_ALGORITHM)
hasher = hasher_factory()
for k, v in kwargs.items(): for k, v in kwargs.items():
for bytes_ in cls.iter_item_to_bytes(k, v): for bytes_ in cls.iter_item_to_bytes(k, v):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment