Unverified Commit 922c5f5c authored by Parag Ekbote, committed by GitHub

Fixed Nits in Evaluation Docs (#10063)

Minor fixes and script improvements in the evaluation docs.
parent 8d386f79
@@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/
```python
model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
+sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")
images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
```
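For reference, here is a minimal sketch of how the v1-5 generations produced above could then be scored with CLIP score, assuming the same `prompts` list used earlier in the guide; the `clip_score`-based helper below mirrors the one defined earlier in the doc, and its exact form here is an assumption:

```python
from functools import partial

import torch
from torchmetrics.functional.multimodal import clip_score

# Assumed helper: CLIP score over a batch of images in [0, 1] with shape (N, H, W, C),
# as returned by the pipeline with output_type="np".
clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")

def calculate_clip_score(images, prompts):
    images_int = (images * 255).astype("uint8")
    score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()
    return round(float(score), 4)

sd_clip_score_1_5 = calculate_clip_score(images_1_5, prompts)
print(f"CLIP score (v1-5): {sd_clip_score_1_5}")
```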
@@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline
instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
-).to(device)
+).to("cuda")
```

Now, we perform the edits:
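As a reference point, a minimal sketch of one such edit with the pipeline loaded above; the input file name, instruction, and generation arguments below are illustrative assumptions, not taken from the original guide:

```python
from PIL import Image

# Hypothetical example: a single (image, edit instruction) pair; the guide itself
# iterates over a small dataset of such pairs.
input_image = Image.open("input.png").convert("RGB").resize((512, 512))
edit_instruction = "make the sky stormy"

edited_image = instruct_pix2pix_pipeline(
    prompt=edit_instruction,
    image=input_image,
    num_inference_steps=20,
    image_guidance_scale=1.5,
    guidance_scale=7.0,
).images[0]
```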
@@ -326,9 +326,9 @@ from transformers import (
clip_id = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
-text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
+text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")
```

Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
@@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module):
    def preprocess_image(self, image):
        image = self.image_processor(image, return_tensors="pt")["pixel_values"]
-        return {"pixel_values": image.to(device)}
+        return {"pixel_values": image.to("cuda")}

    def tokenize_text(self, text):
        inputs = self.tokenizer(
@@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module):
            truncation=True,
            return_tensors="pt",
        )
-        return {"input_ids": inputs.input_ids.to(device)}
+        return {"input_ids": inputs.input_ids.to("cuda")}

    def encode_image(self, image):
        preprocessed_image = self.preprocess_image(image)
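The heart of this metric is the cosine similarity between the change in image features and the change in text features; a minimal sketch of that step follows (the function body is an assumption consistent with the class shown above):

```python
import torch.nn.functional as nn_F

# Directional similarity: how well the edit direction in CLIP image space
# (edited image minus original image) aligns with the edit direction in
# CLIP text space (edited caption minus original caption).
def compute_directional_similarity(img_feat_one, img_feat_two, text_feat_one, text_feat_two):
    return nn_F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
```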
@@ -459,6 +459,7 @@ with ZipFile(local_filepath, "r") as zipper:
```python
from PIL import Image
import os
+import numpy as np

dataset_path = "sample-imagenet-images"
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
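The newly added `numpy` import comes into play right after this, when the downloaded samples are read into arrays; a minimal sketch of that step (the `real_images` name is an assumption consistent with the rest of the guide):

```python
# Read each downloaded sample as an RGB numpy array; `image_paths` comes from the snippet above.
real_images = [np.array(Image.open(path).convert("RGB")) for path in image_paths]
```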
@@ -477,6 +478,7 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t
```python
from torchvision.transforms import functional as F
+import torch


def preprocess_image(image):
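Likewise, the newly added `torch` import is what the preprocessing body relies on; a minimal sketch of what `preprocess_image` might do, assuming an HWC-to-NCHW conversion and a 256x256 center crop:

```python
def preprocess_image(image):
    # image: HWC uint8 numpy array -> NCHW float tensor in [0, 1], center-cropped to 256x256.
    image = torch.tensor(image).unsqueeze(0)
    image = image.permute(0, 3, 1, 2) / 255.0
    return F.center_crop(image, (256, 256))

real_images = torch.cat([preprocess_image(image) for image in real_images])
```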
@@ -498,6 +500,10 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=
dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
dit_pipeline = dit_pipeline.to("cuda")

+seed = 0
+generator = torch.manual_seed(seed)
+
words = [
    "cassette player",
    "chainsaw",