sd21.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#  The MIT License (MIT)\n",
    "#\n",
    "#  Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.\n",
    "#\n",
    "#  Permission is hereby granted, free of charge, to any person obtaining a copy\n",
    "#  of this software and associated documentation files (the 'Software'), to deal\n",
    "#  in the Software without restriction, including without limitation the rights\n",
    "#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n",
    "#  copies of the Software, and to permit persons to whom the Software is\n",
    "#  furnished to do so, subject to the following conditions:\n",
    "#\n",
    "#  The above copyright notice and this permission notice shall be included in\n",
    "#  all copies or substantial portions of the Software.\n",
    "#\n",
    "#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
    "#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
    "#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE\n",
    "#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
    "#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n",
    "#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n",
    "#  THE SOFTWARE."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stable Diffusion 2.1\n",
    "\n",
    "The following example will show how to run `Stable Diffusion 2.1` with `MIGraphX`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the required dependencies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies into the environment of the running kernel.\n",
    "# `%pip` (unlike `!pip`) always targets the kernel's own interpreter; the quotes\n",
    "# keep shells such as zsh from treating the `[onnxruntime]` extra as a glob.\n",
    "%pip install \"optimum[onnxruntime]\" transformers diffusers accelerate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use Optimum to generate the ONNX files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the stabilityai/stable-diffusion-2-1 model to ONNX with the optimum CLI.\n",
    "# The output directory is models/sd21-onnx.\n",
    "!optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now it is time to load these models with python.\n",
    "\n",
    "First, we make sure that the MIGraphX module is found in the Python path."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "# Directory that is added to the module search path so `import migraphx` succeeds.\n",
    "mgx_lib_path = \"/opt/rocm/lib/\" # or \"/code/AMDMIGraphX/build/lib/\"\n",
    "# Append only when missing, so re-running this cell does not add duplicates.\n",
    "if mgx_lib_path not in sys.path:\n",
    "    sys.path.append(mgx_lib_path)\n",
    "import migraphx as mgx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, a helper method to load and cache the models.\n",
    "\n",
    "This will use the `models/sd21-onnx` path. If you changed it, make sure to update it here as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "# helper for model loading\n",
    "def load_mgx_model(name, shapes):\n",
    "    \"\"\"Load the MIGraphX program for one Stable Diffusion sub-model.\n",
    "\n",
    "    A previously saved `model.mxr` is loaded directly when present. Otherwise\n",
    "    `model.onnx` is parsed with the given input shapes, compiled for the \"gpu\"\n",
    "    target and saved as `model.mxr` next to it so later calls can reuse it.\n",
    "\n",
    "    Args:\n",
    "        name: Sub-directory of `models/sd21-onnx` holding the model,\n",
    "            e.g. \"unet\".\n",
    "        shapes: Mapping of input name to dimensions, passed to\n",
    "            `mgx.parse_onnx` as `map_input_dims`.\n",
    "\n",
    "    Returns:\n",
    "        The loaded (or freshly compiled) MIGraphX program.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: If neither `model.mxr` nor `model.onnx` exists.\n",
    "    \"\"\"\n",
    "    file = f\"models/sd21-onnx/{name}/model\"\n",
    "    print(f\"Loading {name} model from {file}\")\n",
    "    if os.path.isfile(f\"{file}.mxr\"):\n",
    "        print(\"Found mxr, loading it...\")\n",
    "        model = mgx.load(f\"{file}.mxr\", format=\"msgpack\")\n",
    "    elif os.path.isfile(f\"{file}.onnx\"):\n",
    "        print(\"Parsing from onnx file...\")\n",
    "        model = mgx.parse_onnx(f\"{file}.onnx\", map_input_dims=shapes)\n",
    "        model.compile(mgx.get_target(\"gpu\"))\n",
    "        print(f\"Saving {name} model to mxr file...\")\n",
    "        mgx.save(model, f\"{file}.mxr\", format=\"msgpack\")\n",
    "    else:\n",
    "        # The `os` module has no `exit` function, so the previous `os.exit(1)`\n",
    "        # raised AttributeError. Raise a descriptive error instead; it stops the\n",
    "        # cell and shows the message in the traceback.\n",
    "        raise FileNotFoundError(\n",
    "            f\"No {name} model found. Please verify the path is correct and re-try, or re-download model.\"\n",
    "        )\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With that, we can load the models. This could take several minutes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the text encoder with its `input_ids` input fixed to shape (1, 77).\n",
    "text_encoder = load_mgx_model(\"text_encoder\", {\"input_ids\": [1, 77]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the UNet with fixed input shapes: a (1, 4, 64, 64) sample, a (1, 77, 1024)\n",
    "# `encoder_hidden_states` tensor and a single-element timestep.\n",
    "unet = load_mgx_model(\n",
    "        \"unet\", {\n",
    "            \"sample\": [1, 4, 64, 64],\n",
    "            \"encoder_hidden_states\": [1, 77, 1024],\n",
    "            \"timestep\": [1],\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the VAE decoder with its `latent_sample` input fixed to shape (1, 4, 64, 64).\n",
    "vae = load_mgx_model(\"vae_decoder\", {\"latent_sample\": [1, 4, 64, 64]})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import the remaining packages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from diffusers import EulerDiscreteScheduler\n",
    "from transformers import CLIPTokenizer\n",
    "import torch\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Time to load the scheduler and tokenizer from the original source."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identifier of the original model; the scheduler and tokenizer are loaded from\n",
    "# its `scheduler` and `tokenizer` subfolders.\n",
    "model_id = \"stabilityai/stable-diffusion-2-1\"\n",
    "scheduler = EulerDiscreteScheduler.from_pretrained(model_id,\n",
    "                                                   subfolder=\"scheduler\")\n",
    "tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder=\"tokenizer\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we will define all the steps one by one, to make the last step short and simple.\n",
    "\n",
    "The first step will be to tokenize the user prompt. It will make a `(1, 77)` shaped `input_ids`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(input):\n",
    "    \"\"\"Tokenize a single prompt, padded/truncated to `tokenizer.model_max_length`.\n",
    "\n",
    "    The prompt is wrapped in a one-element list and the tokenizer is asked for\n",
    "    NumPy tensors (`return_tensors=\"np\"`).\n",
    "    \"\"\"\n",
    "    tokenizer_options = dict(\n",
    "        padding=\"max_length\",\n",
    "        max_length=tokenizer.model_max_length,\n",
    "        truncation=True,\n",
    "        return_tensors=\"np\",\n",
    "    )\n",
    "    return tokenizer([input], **tokenizer_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: tokenize a sample prompt and display the shape of its `input_ids`.\n",
    "test_tk = tokenize(\"test tokenizer to see the tokens\")\n",
    "test_tk.input_ids.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We run the tokenized prompt through the `Text Encoder` model. It expects the `(1, 77)` data as `int32`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: display the parameter shapes reported by the text encoder program.\n",
    "text_encoder.get_parameter_shapes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_embeddings(input):\n",
    "    \"\"\"Run the text encoder on tokenized input and return float32 embeddings.\n",
    "\n",
    "    `input.input_ids` is cast to int32 before the run; the first output of the\n",
    "    text encoder is converted to a float32 NumPy array.\n",
    "    \"\"\"\n",
    "    token_ids = input.input_ids.astype(np.int32)\n",
    "    encoder_outputs = text_encoder.run({\"input_ids\": token_ids})\n",
    "    return np.array(encoder_outputs[0]).astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: embed a sample prompt and display the shape of the embeddings.\n",
    "test_emb = get_embeddings(tokenize(\"test tokenizer to see the tokens\"))\n",
    "test_emb.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The other input of the model is the latent representation (pure noise). It will be transformed into a 512x512 image later.\n",
    "The last input will be the timestep."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_latents(seed):\n",
    "    \"\"\"Draw a (1, 4, 64, 64) tensor of standard-normal noise for `seed`.\n",
    "\n",
    "    The generator comes from `torch.manual_seed(seed)`, so equal seeds give\n",
    "    equal latents.\n",
    "    \"\"\"\n",
    "    latent_shape = (1, 4, 64, 64)\n",
    "    seeded_generator = torch.manual_seed(seed)\n",
    "    return torch.randn(latent_shape, generator=seeded_generator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: generate sample latents and display their shape.\n",
    "# Inspect `test_latents` here; `latents` is only assigned in the final pipeline\n",
    "# cell, so referencing it at this point fails on a fresh kernel.\n",
    "test_latents = generate_latents(42)\n",
    "test_latents.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we add two helpers to access and convert from torch to numpy with the proper datatype."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_scaled_sample(latents, t):\n",
    "    \"\"\"Scale `latents` for timestep `t` with the scheduler; return a float32 NumPy array.\"\"\"\n",
    "    scaled_latents = scheduler.scale_model_input(latents, t)\n",
    "    return scaled_latents.numpy().astype(np.float32)\n",
    "\n",
    "\n",
    "def get_timestep(t):\n",
    "    \"\"\"Convert the timestep tensor `t` into an int64 NumPy array with at least one dimension.\"\"\"\n",
    "    timestep_value = t.numpy().astype(np.int64)\n",
    "    return np.atleast_1d(timestep_value)  # convert 0D -> 1D"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The UNet model will be run in a loop. It will predict the noise residual."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: display the parameter shapes reported by the UNet program.\n",
    "unet.get_parameter_shapes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def denoise(sample, embeddings, timestep):\n",
    "    \"\"\"Run the UNet once and return its first output as a NumPy array.\"\"\"\n",
    "    unet_inputs = {\n",
    "        \"sample\": sample,\n",
    "        \"encoder_hidden_states\": embeddings,\n",
    "        \"timestep\": timestep,\n",
    "    }\n",
    "    first_output = unet.run(unet_inputs)[0]\n",
    "    return np.array(first_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Helpers to perform the classifier-free guidance and to compute the previous noisy sample."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def perform_guidance(noise_pred_uncond, noise_pred_text, scale):\n",
    "    \"\"\"Move the unconditional prediction towards the text prediction by `scale`.\n",
    "\n",
    "    Returns `noise_pred_uncond + scale * (noise_pred_text - noise_pred_uncond)`.\n",
    "    \"\"\"\n",
    "    guidance_direction = noise_pred_text - noise_pred_uncond\n",
    "    return noise_pred_uncond + scale * guidance_direction\n",
    "\n",
    "def compute_previous(noise_pred, t, latents):\n",
    "    \"\"\"Take one scheduler step (x_t -> x_t-1) and return its `prev_sample`.\"\"\"\n",
    "    step_output = scheduler.step(noise_pred, t, latents)\n",
    "    return step_output.prev_sample\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Scale and decode the image latents with VAE."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scale_denoised(latents):\n",
    "    \"\"\"Multiply `latents` by `1 / 0.18215` (applied before VAE decoding).\"\"\"\n",
    "    inverse_scaling_factor = 1 / 0.18215\n",
    "    return inverse_scaling_factor * latents\n",
    "\n",
    "\n",
    "def decode(latents):\n",
    "    \"\"\"Run the VAE on `latents` (cast to float32) and return its first output as a NumPy array.\"\"\"\n",
    "    vae_inputs = {\"latent_sample\": latents.numpy().astype(np.float32)}\n",
    "    first_output = vae.run(vae_inputs)[0]\n",
    "    return np.array(first_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And lastly, we need to convert it to an image to display or save."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_to_rgb_image(image):\n",
    "    \"\"\"Turn the decoded array into a PIL image.\n",
    "\n",
    "    Computes `image / 2 + 0.5` clipped to [0, 1], moves axis 1 to the end,\n",
    "    scales to 0-255 as uint8 and builds the image from the first entry.\n",
    "    \"\"\"\n",
    "    normalized = np.clip(image / 2 + 0.5, 0, 1)\n",
    "    channels_last = np.transpose(normalized, (0, 2, 3, 1))\n",
    "    pixels = (channels_last * 255).round().astype(\"uint8\")\n",
    "    first_image = pixels[0]\n",
    "    return Image.fromarray(first_image)\n",
    "\n",
    "def save_image(pil_image, filename=\"output.png\"):\n",
    "    \"\"\"Save `pil_image` to `filename` using the PNG format.\"\"\"\n",
    "    pil_image.save(filename, format=\"png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feel free to play around with these params."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text the image is generated from.\n",
    "prompt = \"a photograph of an astronaut riding a horse\"\n",
    "# Embedded as the unconditional input of the guidance step.\n",
    "negative_prompt = \"\"\n",
    "# Number of timesteps passed to the scheduler.\n",
    "steps = 20\n",
    "# Seed for the initial latent noise.\n",
    "seed = 13\n",
    "# Guidance scale used when combining the two noise predictions.\n",
    "scale = 7.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And now, to put everything together and run the whole pipeline:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the scheduler for the requested number of steps.\n",
    "scheduler.set_timesteps(steps)\n",
    "\n",
    "# Tokenize and embed both the prompt and the negative prompt.\n",
    "text_input, uncond_input = tokenize(prompt), tokenize(negative_prompt)\n",
    "text_embeddings, uncond_embeddings = get_embeddings(\n",
    "    text_input), get_embeddings(uncond_input)\n",
    "# Start from seeded noise, scaled by the scheduler's initial noise sigma.\n",
    "latents = generate_latents(seed) * scheduler.init_noise_sigma\n",
    "\n",
    "for t in tqdm(scheduler.timesteps):\n",
    "    sample = get_scaled_sample(latents, t)\n",
    "    timestep = get_timestep(t)\n",
    "\n",
    "    # Two UNet runs per step: one per embedding.\n",
    "    noise_pred_uncond = denoise(sample, uncond_embeddings, timestep)\n",
    "    noise_pred_text = denoise(sample, text_embeddings, timestep)\n",
    "\n",
    "    # Combine the predictions and step the scheduler to get the next latents.\n",
    "    noise_pred = perform_guidance(noise_pred_uncond, noise_pred_text, scale)\n",
    "    latents = compute_previous(torch.from_numpy(noise_pred), t, latents)\n",
    "\n",
    "# Rescale the final latents, decode them with the VAE and convert to a PIL image.\n",
    "latents = scale_denoised(latents)\n",
    "result = decode(latents)\n",
    "image = convert_to_rgb_image(result)\n",
    "\n",
    "# show the image\n",
    "image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you like the generated image, save it with the following:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the generated image to \"output.png\".\n",
    "save_image(image, \"output.png\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "sd_venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}