Unverified Commit 45171174 authored by w4ffl35, committed by GitHub

Prevent online access, when desired, when using download_from_original_stable_diffusion_ckpt (#4271)

Prevent online access when desired:

- Adds a config_files option to download_from_original_stable_diffusion_ckpt so config files can be supplied locally, bypassing network requests for them
- Passes a local_files_only flag through to all from_pretrained calls
parent 4c4fe042
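
For orientation, a minimal usage sketch (not part of the commit) of the converter with the offline-friendly behaviour this change enables. The file paths below are hypothetical placeholders, and local_files_only=True assumes the Hub repos the converter touches (CLIP text encoder, tokenizers, safety checker, and so on) are already present in the local cache.

# Sketch only: convert a locally downloaded checkpoint without network access.
# Paths are hypothetical placeholders.
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)

pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="./models/v1-5-pruned-emaonly.safetensors",  # hypothetical local checkpoint
    original_config_file="./configs/v1-inference.yaml",          # hypothetical local YAML
    from_safetensors=True,
    load_safety_checker=False,
    local_files_only=True,  # with this commit, forwarded to every from_pretrained call
)
pipe.save_pretrained("./converted-pipeline")
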
@@ -778,7 +778,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
     if text_encoder is None:
         config_name = "openai/clip-vit-large-patch14"
-        config = CLIPTextConfig.from_pretrained(config_name)
+        config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
 
         ctx = init_empty_weights if is_accelerate_available() else nullcontext
         with ctx():
@@ -832,8 +832,8 @@ protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
 textenc_pattern = re.compile("|".join(protected.keys()))
 
 
-def convert_paint_by_example_checkpoint(checkpoint):
-    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
+def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False):
+    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
     model = PaintByExampleImageEncoder(config)
 
     keys = list(checkpoint.keys())
@@ -900,13 +900,13 @@ def convert_paint_by_example_checkpoint(checkpoint):
 def convert_open_clip_checkpoint(
-    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
+    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, local_files_only=False, **config_kwargs
 ):
     # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
     # text_model = CLIPTextModelWithProjection.from_pretrained(
     #     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
     # )
-    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
+    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only)
 
     ctx = init_empty_weights if is_accelerate_available() else nullcontext
     with ctx():
@@ -971,7 +971,7 @@ def convert_open_clip_checkpoint(
     return text_model
 
 
-def stable_unclip_image_encoder(original_config):
+def stable_unclip_image_encoder(original_config, local_files_only=False):
     """
     Returns the image processor and clip image encoder for the img2img unclip pipeline.
@@ -989,13 +989,13 @@ def stable_unclip_image_encoder(original_config):
         if clip_model_name == "ViT-L/14":
             feature_extractor = CLIPImageProcessor()
-            image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+            image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
         else:
             raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
 
     elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
         feature_extractor = CLIPImageProcessor()
-        image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only)
     else:
         raise NotImplementedError(
             f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
@@ -1116,6 +1116,7 @@ def download_from_original_stable_diffusion_ckpt(
     vae=None,
     text_encoder=None,
     tokenizer=None,
+    config_files=None,
 ) -> DiffusionPipeline:
     """
     Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml`
@@ -1175,6 +1176,14 @@ def download_from_original_stable_diffusion_ckpt(
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
             to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if
             needed.
+        config_files (`Dict[str, str]`, *optional*, defaults to `None`):
+            A dictionary mapping from config file names to their contents. If this parameter is `None`, the function
+            will load the config files by itself, if needed.
+            Valid keys are:
+                - `v1`: Config file for Stable Diffusion v1
+                - `v2`: Config file for Stable Diffusion v2
+                - `xl`: Config file for Stable Diffusion XL
+                - `xl_refiner`: Config file for Stable Diffusion XL Refiner
     return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
     """
@@ -1396,14 +1405,14 @@ def download_from_original_stable_diffusion_ckpt(
         else:
             vae.load_state_dict(converted_vae_checkpoint)
     elif vae is None:
-        vae = AutoencoderKL.from_pretrained(vae_path)
+        vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only)
 
     if model_type == "FrozenOpenCLIPEmbedder":
         config_name = "stabilityai/stable-diffusion-2"
         config_kwargs = {"subfolder": "text_encoder"}
 
         text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
-        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
+        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only)
 
         if stable_unclip is None:
             if controlnet:
@@ -1455,12 +1464,12 @@ def download_from_original_stable_diffusion_ckpt(
         elif stable_unclip == "txt2img":
             if stable_unclip_prior is None or stable_unclip_prior == "karlo":
                 karlo_model = "kakaobrain/karlo-v1-alpha"
-                prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior")
+                prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior", local_files_only=local_files_only)
 
-                prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-                prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+                prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+                prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
 
-                prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler")
+                prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only)
                 prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config)
             else:
                 raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}")
@@ -1486,8 +1495,8 @@ def download_from_original_stable_diffusion_ckpt(
             raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}")
     elif model_type == "PaintByExample":
         vision_model = convert_paint_by_example_checkpoint(checkpoint)
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only)
         pipe = PaintByExamplePipeline(
             vae=vae,
             image_encoder=vision_model,
@@ -1500,11 +1509,11 @@ def download_from_original_stable_diffusion_ckpt(
         text_model = convert_ldm_clip_checkpoint(
             checkpoint, local_files_only=local_files_only, text_encoder=text_encoder
         )
-        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if tokenizer is None else tokenizer
+        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) if tokenizer is None else tokenizer
 
         if load_safety_checker:
-            safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-            feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+            safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only)
+            feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only)
         else:
             safety_checker = None
             feature_extractor = None
@@ -1532,9 +1541,9 @@ def download_from_original_stable_diffusion_ckpt(
         )
     elif model_type in ["SDXL", "SDXL-Refiner"]:
         if model_type == "SDXL":
-            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
             text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
-            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
+            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only)
 
             config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
             config_kwargs = {"projection_dim": 1280}
@@ -1555,7 +1564,7 @@ def download_from_original_stable_diffusion_ckpt(
         else:
             tokenizer = None
            text_encoder = None
-            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
+            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only)
 
             config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
             config_kwargs = {"projection_dim": 1280}
@@ -1577,7 +1586,7 @@ def download_from_original_stable_diffusion_ckpt(
     else:
         text_config = create_ldm_bert_config(original_config)
         text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only)
         pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
 
     return pipe
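
The config_files argument added in this diff can be combined with local_files_only to avoid fetching the inference YAMLs as well. A hedged sketch follows; the paths are hypothetical placeholders, and the exact value format expected for the dictionary is governed by the config_files docstring above.

# Sketch only (not part of the commit): supply the SD v1 config locally so the
# converter does not request it over the network. Valid keys: v1, v2, xl, xl_refiner.
pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="./models/v1-5-pruned-emaonly.safetensors",  # hypothetical local checkpoint
    config_files={"v1": "./configs/v1-inference.yaml"},          # hypothetical; see docstring above
    from_safetensors=True,
    local_files_only=True,
)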