hub_utils.py 20.7 KB
Newer Older
anton-l's avatar
anton-l committed
1
# coding=utf-8
2
# Copyright 2024 The HuggingFace Inc. team.
anton-l's avatar
anton-l committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


anton-l's avatar
anton-l committed
17
import os
18
import re
19
import sys
20
import tempfile
21
import traceback
22
import warnings
anton-l's avatar
anton-l committed
23
from pathlib import Path
24
from typing import Dict, List, Optional, Union
25
from uuid import uuid4
anton-l's avatar
anton-l committed
26

27
28
29
30
31
32
33
from huggingface_hub import (
    ModelCard,
    ModelCardData,
    create_repo,
    hf_hub_download,
    upload_folder,
)
34
from huggingface_hub.constants import HF_HUB_CACHE, HF_HUB_DISABLE_TELEMETRY, HF_HUB_OFFLINE
35
from huggingface_hub.file_download import REGEX_COMMIT_HASH
36
37
38
39
40
from huggingface_hub.utils import (
    EntryNotFoundError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
    is_jinja_available,
41
    validate_hf_hub_args,
42
43
44
)
from packaging import version
from requests import HTTPError
anton-l's avatar
anton-l committed
45

46
from .. import __version__
47
48
49
50
51
52
from .constants import (
    DEPRECATED_REVISION_ARGS,
    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
    SAFETENSORS_WEIGHTS_NAME,
    WEIGHTS_NAME,
)
53
54
from .import_utils import (
    ENV_VARS_TRUE_VALUES,
55
56
57
58
59
60
61
62
    _flax_version,
    _jax_version,
    _onnxruntime_version,
    _torch_version,
    is_flax_available,
    is_onnx_available,
    is_torch_available,
)
63
from .logging import get_logger
64
65


66
logger = get_logger(__name__)
anton-l's avatar
anton-l committed
67

68
MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md"
69
70
71
72
73
74
75
76
SESSION_ID = uuid4().hex


def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
    """
    Formats a user-agent string with basic info about a request.
    """
    ua = f"diffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
77
    if HF_HUB_DISABLE_TELEMETRY or HF_HUB_OFFLINE:
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
        return ua + "; telemetry/off"
    if is_torch_available():
        ua += f"; torch/{_torch_version}"
    if is_flax_available():
        ua += f"; jax/{_jax_version}"
        ua += f"; flax/{_flax_version}"
    if is_onnx_available():
        ua += f"; onnxruntime/{_onnxruntime_version}"
    # CI will set this value to True
    if os.environ.get("DIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
        ua += "; is_ci/true"
    if isinstance(user_agent, dict):
        ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
    elif isinstance(user_agent, str):
        ua += "; " + user_agent
    return ua
anton-l's avatar
anton-l committed
94
95


96
def load_or_create_model_card(
97
98
99
100
101
102
103
104
105
106
    repo_id_or_path: str = None,
    token: Optional[str] = None,
    is_pipeline: bool = False,
    from_training: bool = False,
    model_description: Optional[str] = None,
    base_model: str = None,
    prompt: Optional[str] = None,
    license: Optional[str] = None,
    widget: Optional[List[dict]] = None,
    inference: Optional[bool] = None,
107
108
109
110
111
) -> ModelCard:
    """
    Loads or creates a model card.

    Args:
112
113
        repo_id_or_path (`str`):
            The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card.
114
        token (`str`, *optional*):
115
116
            Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more
            details.
117
        is_pipeline (`bool`):
118
            Boolean to indicate if we're adding tag to a [`DiffusionPipeline`].
119
120
121
122
123
124
125
126
127
128
129
        from_training: (`bool`): Boolean flag to denote if the model card is being created from a training script.
        model_description (`str`, *optional*): Model description to add to the model card. Helpful when using
            `load_or_create_model_card` from a training script.
        base_model (`str`): Base model identifier (e.g., "stabilityai/stable-diffusion-xl-base-1.0"). Useful
            for DreamBooth-like training.
        prompt (`str`, *optional*): Prompt used for training. Useful for DreamBooth-like training.
        license: (`str`, *optional*): License of the output artifact. Helpful when using
            `load_or_create_model_card` from a training script.
        widget (`List[dict]`, *optional*): Widget to accompany a gallery template.
        inference: (`bool`, optional): Whether to turn on inference widget. Helpful when using
            `load_or_create_model_card` from a training script.
130
    """
Lucain's avatar
Lucain committed
131
    if not is_jinja_available():
132
        raise ValueError(
Lucain's avatar
Lucain committed
133
            "Modelcard rendering is based on Jinja templates."
134
            " Please make sure to have `jinja` installed before using `load_or_create_model_card`."
Lucain's avatar
Lucain committed
135
            " To install it, please run `pip install Jinja2`."
136
137
        )

138
139
140
    try:
        # Check if the model card is present on the remote repo
        model_card = ModelCard.load(repo_id_or_path, token=token)
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
    except (EntryNotFoundError, RepositoryNotFoundError):
        # Otherwise create a model card from template
        if from_training:
            model_card = ModelCard.from_template(
                card_data=ModelCardData(  # Card metadata object that will be converted to YAML block
                    license=license,
                    library_name="diffusers",
                    inference=inference,
                    base_model=base_model,
                    instance_prompt=prompt,
                    widget=widget,
                ),
                template_path=MODEL_CARD_TEMPLATE_PATH,
                model_description=model_description,
            )
        else:
            card_data = ModelCardData()
            component = "pipeline" if is_pipeline else "model"
            if model_description is None:
                model_description = f"This is the model card of a 🧨 diffusers {component} that has been pushed on the Hub. This model card has been automatically generated."
            model_card = ModelCard.from_template(card_data, model_description=model_description)
162
163
164
165

    return model_card


166
167
def populate_model_card(model_card: ModelCard, tags: Union[str, List[str]] = None) -> ModelCard:
    """Populates the `model_card` with library name and optional tags."""
168
169
    if model_card.data.library_name is None:
        model_card.data.library_name = "diffusers"
170
171
172
173
174
175
176
177
178

    if tags is not None:
        if isinstance(tags, str):
            tags = [tags]
        if model_card.data.tags is None:
            model_card.data.tags = []
        for tag in tags:
            model_card.data.tags.append(tag)

179
    return model_card
180
181


182
183
184
185
186
187
188
189
190
191
192
193
194
195
def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None):
    """
    Extracts the commit hash from a resolved filename toward a cache file.
    """
    if resolved_file is None or commit_hash is not None:
        return commit_hash
    resolved_file = str(Path(resolved_file).as_posix())
    search = re.search(r"snapshots/([^/]+)/", resolved_file)
    if search is None:
        return None
    commit_hash = search.groups()[0]
    return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None


196
197
198
199
200
201
202
203
204
205
206
207
# Old default cache path, potentially to be migrated.
# This logic was more or less taken from `transformers`, with the following differences:
# - Diffusers doesn't use custom environment variables to specify the cache path.
# - There is no need to migrate the cache format, just move the files to the new location.
hf_cache_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
old_diffusers_cache = os.path.join(hf_cache_home, "diffusers")


def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None:
    if new_cache_dir is None:
208
        new_cache_dir = HF_HUB_CACHE
209
210
211
212
213
    if old_cache_dir is None:
        old_cache_dir = old_diffusers_cache

    old_cache_dir = Path(old_cache_dir).expanduser()
    new_cache_dir = Path(new_cache_dir).expanduser()
214
    for old_blob_path in old_cache_dir.glob("**/blobs/*"):
215
216
217
218
219
220
221
222
223
224
225
226
227
        if old_blob_path.is_file() and not old_blob_path.is_symlink():
            new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir)
            new_blob_path.parent.mkdir(parents=True, exist_ok=True)
            os.replace(old_blob_path, new_blob_path)
            try:
                os.symlink(new_blob_path, old_blob_path)
            except OSError:
                logger.warning(
                    "Could not create symlink between old cache and new cache. If you use an older version of diffusers again, files will be re-downloaded."
                )
    # At this point, old_cache_dir contains symlinks to the new cache (it can still be used).


228
cache_version_file = os.path.join(HF_HUB_CACHE, "version_diffusers_cache.txt")
229
230
231
232
if not os.path.isfile(cache_version_file):
    cache_version = 0
else:
    with open(cache_version_file) as f:
233
234
235
236
        try:
            cache_version = int(f.read())
        except ValueError:
            cache_version = 0
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257

if cache_version < 1:
    old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0
    if old_cache_is_not_empty:
        logger.warning(
            "The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your "
            "existing cached models. This is a one-time operation, you can interrupt it or run it "
            "later by calling `diffusers.utils.hub_utils.move_cache()`."
        )
        try:
            move_cache()
        except Exception as e:
            trace = "\n".join(traceback.format_tb(e.__traceback__))
            logger.error(
                f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease "
                "file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole "
                "message and we will do our best to help."
            )

if cache_version < 1:
    try:
258
        os.makedirs(HF_HUB_CACHE, exist_ok=True)
259
260
261
262
        with open(cache_version_file, "w") as f:
            f.write("1")
    except Exception:
        logger.warning(
263
            f"There was a problem when trying to write in your cache folder ({HF_HUB_CACHE}). Please, ensure "
264
265
            "the directory exists and can be written to."
        )
266
267
268
269
270
271
272
273
274
275
276


def _add_variant(weights_name: str, variant: Optional[str] = None) -> str:
    if variant is not None:
        splits = weights_name.split(".")
        splits = splits[:-1] + [variant] + splits[-1:]
        weights_name = ".".join(splits)

    return weights_name


277
@validate_hf_hub_args
278
def _get_model_file(
279
    pretrained_model_name_or_path: Union[str, Path],
280
    *,
281
    weights_name: str,
282
283
284
285
286
287
288
289
290
    subfolder: Optional[str] = None,
    cache_dir: Optional[str] = None,
    force_download: bool = False,
    proxies: Optional[Dict] = None,
    resume_download: bool = False,
    local_files_only: bool = False,
    token: Optional[str] = None,
    user_agent: Optional[Union[Dict, str]] = None,
    revision: Optional[str] = None,
291
    commit_hash: Optional[str] = None,
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
):
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    if os.path.isfile(pretrained_model_name_or_path):
        return pretrained_model_name_or_path
    elif os.path.isdir(pretrained_model_name_or_path):
        if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
            # Load from a PyTorch checkpoint
            model_file = os.path.join(pretrained_model_name_or_path, weights_name)
            return model_file
        elif subfolder is not None and os.path.isfile(
            os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
        ):
            model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
            return model_file
        else:
            raise EnvironmentError(
                f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}."
            )
    else:
        # 1. First check if deprecated way of loading from branches is used
        if (
            revision in DEPRECATED_REVISION_ARGS
            and (weights_name == WEIGHTS_NAME or weights_name == SAFETENSORS_WEIGHTS_NAME)
Patrick von Platen's avatar
Patrick von Platen committed
315
            and version.parse(version.parse(__version__).base_version) >= version.parse("0.22.0")
316
317
318
319
320
321
322
323
324
325
        ):
            try:
                model_file = hf_hub_download(
                    pretrained_model_name_or_path,
                    filename=_add_variant(weights_name, revision),
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
326
                    token=token,
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
                    user_agent=user_agent,
                    subfolder=subfolder,
                    revision=revision or commit_hash,
                )
                warnings.warn(
                    f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.",
                    FutureWarning,
                )
                return model_file
            except:  # noqa: E722
                warnings.warn(
                    f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(weights_name, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(weights_name, revision)}' so that the correct variant file can be added.",
                    FutureWarning,
                )
        try:
            # 2. Load model file as usual
            model_file = hf_hub_download(
                pretrained_model_name_or_path,
                filename=weights_name,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
351
                token=token,
352
353
354
355
356
357
358
359
360
361
                user_agent=user_agent,
                subfolder=subfolder,
                revision=revision or commit_hash,
            )
            return model_file

        except RepositoryNotFoundError:
            raise EnvironmentError(
                f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
                "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
362
                "token having permission to this repo with `token` or log in with `huggingface-cli "
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
                "login`."
            )
        except RevisionNotFoundError:
            raise EnvironmentError(
                f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
                "this model name. Check the model page at "
                f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
            )
        except EntryNotFoundError:
            raise EnvironmentError(
                f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}."
            )
        except HTTPError as err:
            raise EnvironmentError(
                f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}"
            )
        except ValueError:
            raise EnvironmentError(
                f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
                f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
                f" directory containing a file named {weights_name} or"
                " \nCheckout your internet connection or see how to run the library in"
                " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
            )
        except EnvironmentError:
            raise EnvironmentError(
                f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
                "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
                f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
                f"containing a file named {weights_name}"
            )
394
395
396
397


class PushToHubMixin:
    """
398
    A Mixin to push a model, scheduler, or pipeline to the Hugging Face Hub.
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
    """

    def _upload_folder(
        self,
        working_dir: Union[str, os.PathLike],
        repo_id: str,
        token: Optional[str] = None,
        commit_message: Optional[str] = None,
        create_pr: bool = False,
    ):
        """
        Uploads all files in `working_dir` to `repo_id`.
        """
        if commit_message is None:
            if "Model" in self.__class__.__name__:
                commit_message = "Upload model"
            elif "Scheduler" in self.__class__.__name__:
                commit_message = "Upload scheduler"
            else:
                commit_message = f"Upload {self.__class__.__name__}"

        logger.info(f"Uploading the files of {working_dir} to {repo_id}.")
        return upload_folder(
            repo_id=repo_id, folder_path=working_dir, token=token, commit_message=commit_message, create_pr=create_pr
        )

    def push_to_hub(
        self,
        repo_id: str,
        commit_message: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        create_pr: bool = False,
        safe_serialization: bool = True,
        variant: Optional[str] = None,
    ) -> str:
        """
Steven Liu's avatar
Steven Liu committed
436
        Upload model, scheduler, or pipeline files to the 🤗 Hugging Face Hub.
437
438
439

        Parameters:
            repo_id (`str`):
Steven Liu's avatar
Steven Liu committed
440
441
442
                The name of the repository you want to push your model, scheduler, or pipeline files to. It should
                contain your organization name when pushing to an organization. `repo_id` can also be a path to a local
                directory.
443
            commit_message (`str`, *optional*):
Steven Liu's avatar
Steven Liu committed
444
                Message to commit while pushing. Default to `"Upload {object}"`.
445
446
447
448
449
450
451
            private (`bool`, *optional*):
                Whether or not the repository created should be private.
            token (`str`, *optional*):
                The token to use as HTTP bearer authorization for remote files. The token generated when running
                `huggingface-cli login` (stored in `~/.huggingface`).
            create_pr (`bool`, *optional*, defaults to `False`):
                Whether or not to create a PR with the uploaded files or directly commit.
Steven Liu's avatar
Steven Liu committed
452
453
            safe_serialization (`bool`, *optional*, defaults to `True`):
                Whether or not to convert the model weights to the `safetensors` format.
454
455
456
457
458
459
460
461
462
463
464
465
466
            variant (`str`, *optional*):
                If specified, weights are saved in the format `pytorch_model.<variant>.bin`.

        Examples:

        ```python
        from diffusers import UNet2DConditionModel

        unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet")

        # Push the `unet` to your namespace with the name "my-finetuned-unet".
        unet.push_to_hub("my-finetuned-unet")

467
        # Push the `unet` to an organization with the name "my-finetuned-unet".
468
469
470
471
472
        unet.push_to_hub("your-org/my-finetuned-unet")
        ```
        """
        repo_id = create_repo(repo_id, private=private, token=token, exist_ok=True).repo_id

473
474
475
476
        # Create a new empty model card and eventually tag it
        model_card = load_or_create_model_card(repo_id, token=token)
        model_card = populate_model_card(model_card)

477
478
479
480
481
482
483
484
        # Save all files.
        save_kwargs = {"safe_serialization": safe_serialization}
        if "Scheduler" not in self.__class__.__name__:
            save_kwargs.update({"variant": variant})

        with tempfile.TemporaryDirectory() as tmpdir:
            self.save_pretrained(tmpdir, **save_kwargs)

485
486
487
            # Update model card if needed:
            model_card.save(os.path.join(tmpdir, "README.md"))

488
489
490
491
492
493
494
            return self._upload_folder(
                tmpdir,
                repo_id,
                token=token,
                commit_message=commit_message,
                create_pr=create_pr,
            )