Unverified Commit 45171174 authored by w4ffl35, committed by GitHub

Prevent online access, when desired, when using download_from_original_stable_diffusion_ckpt (#4271)

Prevent online access when desired:

- Adds a config_files option to download_from_original_stable_diffusion_ckpt so config files can be supplied locally, bypassing network requests for them
- Passes a local_files_only flag through to all from_pretrained calls
parent 4c4fe042
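
For orientation, a minimal usage sketch (not part of the commit) of the converter with the offline-friendly behaviour this change enables. The file paths below are hypothetical placeholders, and local_files_only=True assumes the Hub repos the converter touches (CLIP text encoder, tokenizers, safety checker, and so on) are already present in the local cache.

# Sketch only: convert a locally downloaded checkpoint without network access.
# Paths are hypothetical placeholders.
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)

pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="./models/v1-5-pruned-emaonly.safetensors",  # hypothetical local checkpoint
    original_config_file="./configs/v1-inference.yaml",          # hypothetical local YAML
    from_safetensors=True,
    load_safety_checker=False,
    local_files_only=True,  # with this commit, forwarded to every from_pretrained call
)
pipe.save_pretrained("./converted-pipeline")
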
@@ -778,7 +778,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
     if text_encoder is None:
         config_name = "openai/clip-vit-large-patch14"
-        config = CLIPTextConfig.from_pretrained(config_name)
+        config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
 
         ctx = init_empty_weights if is_accelerate_available() else nullcontext
         with ctx():
@@ -832,8 +832,8 @@ protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
 textenc_pattern = re.compile("|".join(protected.keys()))
 
 
-def convert_paint_by_example_checkpoint(checkpoint):
-    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
+def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
     model = PaintByExampleImageEncoder(config)
 
     keys = list(checkpoint.keys())
@@ -900,13 +900,13 @@ def convert_paint_by_example_checkpoint(checkpoint):
 def convert_open_clip_checkpoint(
-    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
+    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, local_files_only=False, **config_kwargs
 ):
     # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
     # text_model = CLIPTextModelWithProjection.from_pretrained(
     #     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
     # )
-    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
+    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)
 
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
@@ -971,7 +971,7 @@ def convert_open_clip_checkpoint(
     return text_model
 
 
-def stable_unclip_image_encoder(original_config):
+def stable_unclip_image_encoder(original_config, local_files_only=False):
     """
     Returns the image processor and clip image encoder for the img2img unclip pipeline.
@@ -989,13 +989,13 @@ def stable_unclip_image_encoder(original_config):
         if clip_model_name == "ViT-L/14":
             feature_extractor = CLIPImageProcessor()
-            image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+            image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
         else:
             raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
 
     elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
         feature_extractor = CLIPImageProcessor()
-        image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only)
     else:
         raise NotImplementedError(
             f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
@@ -1116,6 +1116,7 @@ def download_from_original_stable_diffusion_ckpt(
     vae=None,
     text_encoder=None,
     tokenizer=None,
+    config_files=None,
 ) -> DiffusionPipeline:
     """
     Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
@@ -1175,6 +1176,14 @@ def download_from_original_stable_diffusion_ckpt(
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
             to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
             needed.
+        config_files (`Dict[str, str]`, *optional*, defaults to `None`):
+            A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
+            will load the config files by itself, if needed.
+            Valid keys are:
+                - `v1`: Config file for Stable Diffusion v1
+                - `v2`: Config file for Stable Diffusion v2
+                - `xl`: Config file for Stable Diffusion XL
+                - `xl_refiner`: Config file for Stable Diffusion XL Refiner
     return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
     """
@@ -1396,14 +1405,14 @@ def download_from_original_stable_diffusion_ckpt(
         else:
             vae.load_state_dict(converted_vae_checkpoint)
     elif vae is None:
-        vae = AutoencoderKL.from_pretrained(vae_path)
+        vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)
 
     if model_type == "FrozenOpenCLIPEmbedder":
         config_name = "stabilityai/stable-diffusion-2"
         config_kwargs = {"subfolder": "text_encoder"}
 
         text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
+        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only)
 
         if stable_unclip is None:
             if controlnet:
@@ -1455,12 +1464,12 @@ def download_from_original_stable_diffusion_ckpt(
         elif stable_unclip == "txt2img":
             if stable_unclip_prior is None or stable_unclip_prior == "karlo":
                 karlo_model = "kakaobrain/karlo-v1-alpha"
-                prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior")
+                prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior", local_files_only=local_files_only)
 
-                prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-                prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+                prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+                prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
 
-                prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler")
+                prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only)
                 prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
             else:
                 raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
@@ -1486,8 +1495,8 @@ def download_from_original_stable_diffusion_ckpt(
             raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
     elif model_type == "PaintByExample":
         vision_model = convert_paint_by_example_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only)
         pipe = PaintByExamplePipeline(
             vae=vae,
             image_encoder=vision_model,
@@ -1500,11 +1509,11 @@ def download_from_original_stable_diffusion_ckpt(
         text_model = convert_ldm_clip_checkpoint(
             checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
         )
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if tokenizer is None else tokenizer
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) if tokenizer is None else tokenizer
 
         if load_safety_checker:
-            safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-            feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only)
+            feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only)
         else:
             safety_checker = None
             feature_extractor = None
@@ -1532,9 +1541,9 @@ def download_from_original_stable_diffusion_ckpt(
         )
     elif model_type in ["SDXL", "SDXL-Refiner"]:
         if model_type == "SDXL":
-            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
             text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
-            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
+            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only)
 
             config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
             config_kwargs = {"projection_dim": 1280}
@@ -1555,7 +1564,7 @@ def download_from_original_stable_diffusion_ckpt(
         else:
             tokenizer = None
            text_encoder = None
-            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
+            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only)
 
             config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
             config_kwargs = {"projection_dim": 1280}
@@ -1577,7 +1586,7 @@ def download_from_original_stable_diffusion_ckpt(
     else:
         text_config = create_ldm_bert_config(original_config)
         text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
 
     return pipe
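
The config_files argument added in this diff can be combined with local_files_only to avoid fetching the inference YAMLs as well. A hedged sketch follows; the paths are hypothetical placeholders, and the exact value format expected for the dictionary is governed by the config_files docstring above.

# Sketch only (not part of the commit): supply the SD v1 config locally so the
# converter does not request it over the network. Valid keys: v1, v2, xl, xl_refiner.
pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="./models/v1-5-pruned-emaonly.safetensors",  # hypothetical local checkpoint
    config_files={"v1": "./configs/v1-inference.yaml"},          # hypothetical; see docstring above
    from_safetensors=True,
    local_files_only=True,
)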