demo.ipynb 7.36 KB
Newer Older
Hzzone's avatar
Hzzone committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
29
    "from diffusers import StableDiffusionGLIGENPipeline"
Hzzone's avatar
Hzzone committed
30
31
32
33
34
35
36
37
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
38
39
    "from transformers import CLIPTextModel, CLIPTokenizer\n",
    "\n",
Hzzone's avatar
Hzzone committed
40
41
42
43
44
    "import diffusers\n",
    "from diffusers import (\n",
    "    AutoencoderKL,\n",
    "    DDPMScheduler,\n",
    "    EulerDiscreteScheduler,\n",
45
    "    UNet2DConditionModel,\n",
Hzzone's avatar
Hzzone committed
46
    ")\n",
47
48
    "\n",
    "\n",
Hzzone's avatar
Hzzone committed
49
50
    "# pretrained_model_name_or_path = 'masterful/gligen-1-4-generation-text-box'\n",
    "\n",
51
    "pretrained_model_name_or_path = \"/root/data/zhizhonghuang/checkpoints/models--masterful--gligen-1-4-generation-text-box/snapshots/d2820dc1e9ba6ca082051ce79cfd3eb468ae2c83\"\n",
Hzzone's avatar
Hzzone committed
52
53
54
    "\n",
    "tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder=\"tokenizer\")\n",
    "noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder=\"scheduler\")\n",
55
56
    "text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder=\"text_encoder\")\n",
    "vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder=\"vae\")\n",
Hzzone's avatar
Hzzone committed
57
58
59
60
61
62
63
64
65
66
67
68
69
    "# The checkpoint's own UNet is left disabled in this cell; `unet` is not assigned here:\n",
    "# unet = UNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder=\"unet\")\n",
    "\n",
    "# Rebuild the scheduler as an EulerDiscreteScheduler, reusing the config of the\n",
    "# DDPMScheduler that was loaded from the checkpoint earlier in this cell.\n",
    "ddpm_scheduler_config = noise_scheduler.config\n",
    "noise_scheduler = EulerDiscreteScheduler.from_config(ddpm_scheduler_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
70
    "# Local directory the UNet weights are loaded from (machine-specific absolute path).\n",
    "unet_checkpoint_dir = \"/root/data/zhizhonghuang/ckpt/GLIGEN_Text_Retrain_COCO\"\n",
    "unet = UNet2DConditionModel.from_pretrained(unet_checkpoint_dir)"
Hzzone's avatar
Hzzone committed
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion_gligen.pipeline_stable_diffusion_gligen.StableDiffusionGLIGENPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .\n"
     ]
    }
   ],
   "source": [
    "# Assemble the GLIGEN pipeline from the individually loaded components and move it to the GPU.\n",
    "# No safety checker or feature extractor is attached.\n",
    "pipe = StableDiffusionGLIGENPipeline(\n",
    "    vae=vae,\n",
    "    text_encoder=text_encoder,\n",
    "    tokenizer=tokenizer,\n",
    "    unet=unet,\n",
    "    scheduler=noise_scheduler,\n",
    "    safety_checker=None,\n",
    "    feature_extractor=None,\n",
    ").to(\"cuda\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
105
106
107
    "import numpy as np\n",
    "\n",
    "\n",
Hzzone's avatar
Hzzone committed
108
109
110
111
112
113
114
115
116
    "# prompt = 'A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky'\n",
    "# gen_boxes = [('a green car', [21, 281, 211, 159]), ('a blue truck', [269, 283, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]\n",
    "\n",
    "# prompt = 'A realistic top-down view of a wooden table with two apples on it'\n",
    "# gen_boxes = [('a wooden table', [20, 148, 472, 216]), ('an apple', [150, 226, 100, 100]), ('an apple', [280, 226, 100, 100])]\n",
    "\n",
    "# prompt = 'A realistic scene of three skiers standing in a line on the snow near a palm tree'\n",
    "# gen_boxes = [('a skier', [5, 152, 139, 168]), ('a skier', [278, 192, 121, 158]), ('a skier', [148, 173, 124, 155]), ('a palm tree', [404, 105, 103, 251])]\n",
    "\n",
117
118
    "prompt = \"An oil painting of a pink dolphin jumping on the left of a steam boat on the sea\"\n",
    "gen_boxes = [(\"a steam boat\", [232, 225, 257, 149]), (\"a jumping pink dolphin\", [21, 249, 189, 123])]\n",
Hzzone's avatar
Hzzone committed
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
    "\n",
    "# Each gen_boxes entry is (phrase, [c0, c1, c2, c3]). The box values are scaled by 1/512 and\n",
    "# the last two columns become (c0 + c2, c1 + c3), i.e. origin + size -> opposite corner.\n",
    "# NOTE(review): assumes raw boxes are [x, y, width, height] on a 512 px canvas - confirm.\n",
    "scaled_xywh = np.array([box for _, box in gen_boxes]) / 512\n",
    "corner_min = scaled_xywh[:, :2]\n",
    "corner_max = corner_min + scaled_xywh[:, 2:]\n",
    "boxes = np.hstack([corner_min, corner_max]).tolist()\n",
    "\n",
    "# Phrases are kept in the same order as their boxes.\n",
    "gligen_phrases = [phrase for phrase, _ in gen_boxes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py:683: FutureWarning: Accessing config attribute `in_channels` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'in_channels' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.in_channels'.\n",
      "  num_channels_latents = self.unet.in_channels\n",
      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py:716: FutureWarning: Accessing config attribute `cross_attention_dim` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'cross_attention_dim' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.cross_attention_dim'.\n",
      "  max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype\n",
      "100%|██████████| 50/50 [01:21<00:00,  1.64s/it]\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# Fixed seed so that a fresh Restart-and-Run-All reproduces the same batch of samples;\n",
    "# change SEED to draw a different batch.\n",
    "SEED = 0\n",
    "generator = torch.Generator(device=\"cuda\").manual_seed(SEED)\n",
    "\n",
    "# gligen_phrases and boxes are both derived from gen_boxes, in the same order.\n",
    "# The call returns an output object whose `.images` holds the generated PIL images.\n",
    "images = pipe(\n",
    "    prompt=prompt,\n",
    "    gligen_phrases=gligen_phrases,\n",
    "    gligen_boxes=boxes,\n",
    "    gligen_scheduled_sampling_beta=1.0,\n",
    "    output_type=\"pil\",\n",
    "    num_inference_steps=50,\n",
    "    negative_prompt=\"artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate\",\n",
    "    num_images_per_prompt=16,\n",
    "    generator=generator,\n",
    ").images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
164
    "# make_image_grid requires rows * cols == len(images). Use the largest row count <= 4 that\n",
    "# divides the batch, so the grid still renders when num_images_per_prompt is changed\n",
    "# (16 images -> 4 x 4, same as before).\n",
    "grid_rows = next(r for r in (4, 3, 2, 1) if len(images) % r == 0)\n",
    "diffusers.utils.make_image_grid(images, grid_rows, len(images) // grid_rows)"
Hzzone's avatar
Hzzone committed
165
166
167
168
169
170
171
172
173
174
175
176
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
177
   "display_name": "Python 3 (ipykernel)",
Hzzone's avatar
Hzzone committed
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
195
 "nbformat_minor": 4
Hzzone's avatar
Hzzone committed
196
}