Commit e0292b6e authored by wangwf

support flux.1-dev inference

parent 97f17a79
[
{
"theme": "Nature scenery",
"theme_zh": "自然风景",
"examples": [
{
"prompt": "Epic rainbow waterfall cascading through emerald jungle, vibrant mist refracting sunlight, hyperrealistic 8K, cinematic lighting, National Geographic photography, depth of field",
"negative_prompt": "blurry, cartoon, drawing, anime, text, people, buildings, artificial colors, overexposed"
},
{
"prompt": "Tranquil cherry blossom river in moonlight, petals floating on water, soft bokeh, Studio Ghibli style, dreamy pastel colors, matte painting",
"negative_prompt": "sharp lines, photorealistic, modern city, ugliness, blood, fire, broken trees, daytime"
},
{
"prompt": "Aurora borealis over glacial icebergs, mirror-like water reflection, cosmic color palette, long exposure photography, 16K resolution",
"negative_prompt": "daylight, desert, buildings, people, cartoon style, text, blurry edges"
},
{
"prompt": "Volcanic eruption at twilight, lava rivers through obsidian fields, ash clouds illuminated by lightning, dramatic chiaroscuro, concept art",
"negative_prompt": "peaceful scene, snow, vegetation, cartoonish, bright colors, humans, watercolor"
},
{
"prompt": "Surreal desert with floating crystal formations, double suns setting, heat haze distortion, Roger Dean landscape style",
"negative_prompt": "rainforest, ocean, modern structures, animals, night scene, sketch lines"
}
]
},
{
"theme": "Human portrait",
"theme_zh": "人物与肖像",
"examples": [
{
"prompt": "Close-up portrait of a cyborg girl with neon circuit tattoos, holographic blue dreadlocks, reflective rain-soaked skin, cyberpunk aesthetic, cinematic volumetric lighting, art by WLOP",
"negative_prompt": "deformed hands, extra fingers, makeup, traditional clothing, natural hair, smile, hat, watermark, signature"
},
{
"prompt": "Ancient warrior queen in obsidian armor, scar across cheek, dramatic sunset backlight, oil painting style, detailed armor texture, fierce gaze",
"negative_prompt": "cute, modern, glasses, jewelry, high heels, cartoon, blurry background, flowers"
},
{
"prompt": "Renaissance alchemist in candlelit laboratory, holding glowing flask, intricate velvet robes, Caravaggio lighting style",
"negative_prompt": "casual clothes, technology, smile, clean background, sunglasses, photograph"
},
{
"prompt": "Tribal elder with weathered face and feather headdress, golden hour lighting, environmental portrait, National Geographic style",
"negative_prompt": "youth, modern clothing, studio lighting, cybernetics, weapons, cartoon eyes"
},
{
"prompt": "Steampunk inventor with mechanical arm, goggles on forehead, surrounded by brass gadgets, detailed etching illustration",
"negative_prompt": "casual wear, natural limbs, minimalism, futuristic tech, smile, anime style"
}
]
},
{
"theme": "Science fiction and Fantasy",
"theme_zh": "科幻与奇幻",
"examples": [
{
"prompt": "Floating neon city above clouds, holographic billboards, flying cars in rain, Blade Runner 2099, cinematic ultra-wide shot, hyperdetailed, cyberpunk color palette",
"negative_prompt": "medieval, trees, daylight, desert, historical, sketch, low quality, text"
},
{
"prompt": "Crystal cave with bioluminescent fungi, alien plants pulsing light, mysterious portal glowing purple, fantasy concept art, James Gurney style",
"negative_prompt": "technology, robots, humans, sharp edges, modern, cartoon, sunshine"
},
{
"prompt": "Dyson sphere construction around red giant star, swarm robots assembling panels, cosmic scale, sci-fi book cover art",
"negative_prompt": "earth landscape, organic shapes, primitive technology, daytime, hand drawn"
},
{
"prompt": "Elven tree city at dawn, wooden bridges between giant mushrooms, glowing runes, Alan Lee illustration style",
"negative_prompt": "machines, pollution, concrete, modern clothes, guns, desert"
},
{
"prompt": "Time traveler's pocket watch portal, gears floating in spacetime rift, steampunk metaphysics, detailed 3D render",
"negative_prompt": "simple design, natural landscape, cartoon, blood, modern electronics"
}
]
},
{
"theme": "Creature",
"theme_zh": "生物",
"examples": [
{
"prompt": "T-rex with exposed hydraulic muscles and steel plating, roaring in volcanic wasteland, dieselpunk aesthetic, highly detailed scales, dramatic smoke, Simon Stålenhag style",
"negative_prompt": "feathers, natural skin, jungle, peaceful, cute, baby animal, watercolor, blurry"
},
{
"prompt": "Phoenix rising from molten lava, iridescent gold feathers, particle effects, epic fantasy illustration, vibrant fire glow, high contrast",
"negative_prompt": "robot, mechanical, wings, cartoon, pixel art, blood, realistic eagle"
},
{
"prompt": "Mutated deep-sea anglerfish with crystalline teeth, bioluminescent lure, abyssal trench environment, cinematic underwater shot",
"negative_prompt": "land animal, daylight, fur, cute, cartoon eyes, jungle, flying"
},
{
"prompt": "Mechanical hummingbird with clockwork wings, hovering over steam flowers, intricate gear details, steampunk macro photography",
"negative_prompt": "natural feathers, dull colors, stationary, blood, destruction, human scale"
},
{
"prompt": "Forest guardian spirit made of living wood and glowing moss, ancient tree face, fantasy creature design",
"negative_prompt": "robotic parts, metal, urban setting, aggressive pose, fire, sharp teeth"
}
]
},
{
"theme": "Architecture and Space",
"theme_zh": "建筑与空间",
"examples": [
{
"prompt": "Infinite library floating among clouds, spiral bookshelves under glass dome, sunbeams through stained glass, Baroque architecture, magical realism, unreal engine render",
"negative_prompt": "modern design, concrete, people, decay, darkness, Asian style, wood, minimalist"
},
{
"prompt": "Abandoned overgrown subway station, vines swallowing pillars, sunlight through broken ceiling, moss-covered tracks, photorealistic, haunting atmosphere",
"negative_prompt": "clean, futuristic, crowds, bright colors, fantasy creatures, neon lights"
},
{
"prompt": "Bamboo skyscraper with vertical gardens, sustainable futuristic city, daylight aerial view, eco-architecture concept",
"negative_prompt": "ruins, traditional buildings, desert, night, pollution, Gothic style"
},
{
"prompt": "Interdimensional train station with floating platforms, Art Deco design, travelers in vintage suits, Moebius art style",
"negative_prompt": "modern clothing, cars, daylight, destruction, wood material, medieval"
},
{
"prompt": "Submerged Atlantis ruins with coral-covered columns, sunken treasure glow, underwater volumetric rays, fantasy archaeology",
"negative_prompt": "modern structures, people, dry land, bright lighting, technology, cartoon fish"
}
]
},
{
"theme": "Abstraction and Art",
"theme_zh": "抽象与艺术",
"examples": [
{
"prompt": "Abstract explosion of liquid gold and deep blue, metallic fluid dynamics, emotional turbulence, 3D render, luxury aesthetic, motion blur background",
"negative_prompt": "objects, human forms, text, sharp edges, cartoon, flowers, faces, landscape"
},
{
"prompt": "Geometric fractals in iridescent colors, quantum foam texture, glowing dark matter, digital art, 8K wallpaper, trippy",
"negative_prompt": "realistic, photo, animals, people, buildings, simple shapes, dull colors"
},
{
"prompt": "Van Gogh starry night reinterpreted with neural network patterns, swirling digital brushstrokes, post-impressionist algorithm art",
"negative_prompt": "photorealism, sharp lines, solid colors, text, human figures, architecture"
},
{
"prompt": "Kinetic sculpture of floating chrome spheres, light refraction patterns, minimalist abstract, museum installation",
"negative_prompt": "organic shapes, textures, landscapes, people, bright colors, cartoon shading"
},
{
"prompt": "Symphony visualized as interwoven color ribbons, musical notes transforming into light particles, synesthesia art",
"negative_prompt": "recognizable objects, faces, buildings, dark palette, text, photorealistic"
}
]
},
{
"theme": "Daily life",
"theme_zh": "日常生活",
"examples": [
{
"prompt": "Cozy rainy Paris café interior, warm lamp light on books, steaming latte, blurred raindrops on window, vintage filter, atmospheric perspective",
"negative_prompt": "sunny, crowded, modern design, bright colors, empty cups, dirty, anime style"
},
{
"prompt": "Macro shot of matcha cake with red bean filling, powdered sugar dusting, food photography, shallow depth of field, natural light",
"negative_prompt": "burnt, sliced, fork, human hands, ugly plate, synthetic colors, text"
},
{
"prompt": "Vintage record store at twilight, neon sign reflection on wet pavement, vinyl records glowing softly, cinematic ambiance",
"negative_prompt": "daylight, empty shelves, modern streaming devices, people fighting, cartoon style"
},
{
"prompt": "Minimalist Japanese breakfast arrangement, miso soup steam rising, morning light through shoji screen, film photography grain",
"negative_prompt": "messy table, Western food, dinner scene, artificial lighting, people, text"
},
{
"prompt": "Antique typewriter on oak desk with scattered paper, dust motes in sunbeam, nostalgic still life, shallow depth of field",
"negative_prompt": "laptop, modern office, bright colors, digital screens, human hands, damage"
}
]
},
{
"theme": "History and Retro",
"theme_zh": "历史与复古",
"examples": [
{
"prompt": "Steampunk laboratory with brass microscopes, glowing vials, Tesla coils sparking, intricate gear mechanisms, gas lamp lighting, detailed etching style",
"negative_prompt": "modern electronics, plastic, clean room, sunlight, minimalism, people, damage"
},
{
"prompt": "Samurai standing in bamboo forest, cherry blossoms falling, traditional Japanese ink wash painting, minimalistic monochrome",
"negative_prompt": "color, gun, Western armor, smile, crowd, modern clothes"
},
{
"prompt": "Egyptian pharaoh's tomb discovery moment, torchlight revealing golden artifacts, sandstone hieroglyphs, dramatic Indiana Jones style",
"negative_prompt": "modern tools, electric lighting, tourists, damage, futuristic elements, cartoon"
},
{
"prompt": "1920s speakeasy jazz club, smoke-filled atmosphere, flapper dancers, sepia tone photograph with film grain",
"negative_prompt": "modern clothing, daylight, digital devices, bright colors, empty room, destruction"
},
{
"prompt": "Viking longship sailing through glacial fjord, aurora reflecting on water, historical accuracy, epic cinematic shot",
"negative_prompt": "motorboats, tropical water, modern ships, pollution, cartoon characters, desert"
}
]
},
{
"theme": "Dark and Grotesque",
"theme_zh": "暗黑与怪诞",
"examples": [
{
"prompt": "Surreal heart made of cracked porcelain, wrapped in black thorns, blood dripping onto white roses, dark academia aesthetic, Greg Rutkowski style",
"negative_prompt": "cute, intact, healing, jewelry, cartoon, happy, bright background"
},
{
"prompt": "Ghostly figure in abandoned asylum, long exposure motion blur, green phosphorescent mist, horror film still, grainy film texture",
"negative_prompt": "colorful, daylight, beautiful face, modern clothes, sharp focus, flowers"
},
{
"prompt": "Living tapestry of screaming faces emerging from medieval castle wall, Gothic horror, Zdzisław Beksiński influence",
"negative_prompt": "bright colors, peaceful scene, modern art, intact surface, cute animals"
},
{
"prompt": "Alchemical ritual circle with inverted symbols, floating obsidian shards, candle smoke forming skulls, dark fantasy illustration",
"negative_prompt": "happy ceremony, daylight, children, healing magic, modern setting, cartoon style"
},
{
"prompt": "Bone cathedral interior with flesh pipe organs, stained glass depicting nightmares, HR Giger biomechanical style",
"negative_prompt": "normal church, sunlight, clean surfaces, people praying, bright colors"
}
]
},
{
"theme": "Technology and Digital",
"theme_zh": "科技与数码",
"examples": [
{
"prompt": "Holographic neural network in deep space, glowing data streams connecting nodes, cybernetic tree, sci-fi UI overlay, neon blue and purple, 3D render",
"negative_prompt": "organic, hand-drawn, paper, medieval, earth, animals, text, blur"
},
{
"prompt": "Quantum computer core with floating crystal processors, laser light refraction, clean futuristic lab, cinematic sci-fi, volumetric fog",
"negative_prompt": "wires, vintage, wood, people, mess, low tech, explosion"
},
{
"prompt": "Augmented reality city overlay visible through smart glasses, digital information layers floating over streets, cyberpunk UI design",
"negative_prompt": "natural landscape, historical setting, low tech, blurry interface, destruction"
},
{
"prompt": "Nanobot swarm reconstructing broken antique vase in timelapse, technology meets tradition, macro photography",
"negative_prompt": "human hands, organic growth, magic, blurry, cartoon robots, fire"
},
{
"prompt": "Singularity event visualized as fractal energy convergence, quantum foam eruption, abstract technology art, 8K resolution",
"negative_prompt": "mechanical parts, people, buildings, earth landscape, simple shapes, text"
}
]
}
]
@@ -7,5 +7,5 @@ onnxruntime>=1.22.1
pillow
prettytable
tokenizers<0.22,>=0.21
torch>=2.4.1
torch>=2.5.1
transformers>=4.54.1
python tools/run_pipe.py -m ./Flux.1-dev-new-onnx/ -p "A cat holding a sign that says hello world" --num-images-per-prompt 1 --img-size 512 --save-prefix flux_bs1_512
python tools/run_pipe.py -m ./Flux.1-dev-new-onnx/ -p "A cat holding a sign that says hello world" --num-images-per-prompt 1 --img-size 1024 --save-prefix flux_bs1_1024
python tools/run_pipe.py -m ./Flux.1-dev-new-onnx/ -p "A cat holding a sign that says hello world" --num-images-per-prompt 2 --img-size 512 --save-prefix flux_bs2_512
python tools/run_pipe.py -m ./Flux.1-dev-new-onnx/ -p "A cat holding a sign that says hello world" --num-images-per-prompt 2 --img-size 1024 --save-prefix flux_bs2_1024
# rocblas_lib_path=$1
# export MIGRAPHX_ENABLE_MIOPEN_GROUPNORM=1
# export MIGRAPHX_ENABLE_NHWC=1
# export MIGRAPHX_ENABLE_MIOPEN_CONCAT=1
# export MIGRAPHX_STABLEDIFFUSION_OPT=1
# export MIGRAPHX_ENABLE_MIOPEN_GN_LN=1
# export MIGRAPHX_ENABLE_LAYERNORM_FUSION=1
# export PADDING_MALLOC=0 # run on KME
# export HIP_VISIBLE_DEVICES=6
# export LD_LIBRARY_PATH=/public/home/zhuww/wangwf/pkgs/rocblas-install-0626/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=${rocblas_lib_path}:$LD_LIBRARY_PATH
export MIGRAPHX_TRANSPOSE_CONTIGUOUS_GEMM=1
## Loading and running a pipeline the generic way
```bash
python tools/run_pipe.py -m /path/to/models -p "the ocean in dream"
```
Script arguments:
| Parameter | Description | Type | Default |
| --- | --- | --- | --- |
| `-m` / `--model-dir` | **Required.** Path to the pipeline model directory | str | None |
| `--force-compile` | Optional. Force recompilation of the models | bool | False |
| `--num-images-per-prompt` | Optional. Number of images generated per prompt in one run | int | 1 |
| `--img-size` | Optional. Output image size; if unset, each pipeline's default image size is used | int | None |
| `-p` / `--prompt` | **Required.** Prompt describing the image content, style, generation requirements, etc. | str | None |
| `-n` / `--negative-prompt` | Optional. Negative prompt, e.g. "ugly" | str | None |
| `-t` / `--num-inference-steps` | Optional. Number of denoising steps when generating an image | int | 50 |
| `--seed` | Optional. Random seed | int | 42 |
| `--save-prefix` | Optional. Filename prefix for the saved images | str | None |
## Loading and running a pipeline with custom components
> reference: [https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview#community-components](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview#community-components)
> Community components allow users to build pipelines that may have customized components that are not a part of Diffusers. If your pipeline has custom components that Diffusers doesn’t already support, you need to provide their implementations as Python modules. These customized components could be a VAE, UNet, and scheduler. In most cases, the text encoder is imported from the Transformers library. The pipeline code itself can also be customized.
In this project we reimplemented the text_encoder, unet, vae_decoder and other components with MIGraphX as the inference backend. Besides the generic loading method `DiffusionPipeline.from_pretrained`, we can also load each custom component first and then create the pipeline instance; a conceptual sketch follows the command below. Taking sdxl as an example:
```bash
# run sdxl
python tools/run_sdxl_with_custom_components.py -m /path/to/sdxl_models -p "the ocean in dream"
```
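Conceptually, the custom-component path looks like the following minimal sketch. It uses the stock diffusers/transformers classes for clarity; in this project the MIGraphX-backed implementations from `migraphx_diffusers` would take the place of the UNet, VAE and text encoders (the model path below is illustrative):
```python
import torch
from diffusers import (AutoencoderKL, EulerDiscreteScheduler,
                       StableDiffusionXLPipeline, UNet2DConditionModel)
from transformers import (CLIPTextModel, CLIPTextModelWithProjection,
                          CLIPTokenizer)

model_dir = "/path/to/sdxl_models"  # illustrative path

# Load each component separately...
unet = UNet2DConditionModel.from_pretrained(model_dir, subfolder="unet")
vae = AutoencoderKL.from_pretrained(model_dir, subfolder="vae")
text_encoder = CLIPTextModel.from_pretrained(model_dir, subfolder="text_encoder")
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    model_dir, subfolder="text_encoder_2")
tokenizer = CLIPTokenizer.from_pretrained(model_dir, subfolder="tokenizer")
tokenizer_2 = CLIPTokenizer.from_pretrained(model_dir, subfolder="tokenizer_2")
scheduler = EulerDiscreteScheduler.from_pretrained(model_dir, subfolder="scheduler")

# ...then assemble the pipeline from them.
pipe = StableDiffusionXLPipeline(
    vae=vae,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    tokenizer=tokenizer,
    tokenizer_2=tokenizer_2,
    unet=unet,
    scheduler=scheduler,
)
images = pipe(prompt="the ocean in dream").images
```
Assembling the pipeline yourself is what makes it possible to swap any single component for a custom implementation while reusing the rest of the stock pipeline logic.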
Script arguments:
| Parameter | Description | Type | Default |
| --- | --- | --- | --- |
| `-m` / `--model-dir` | **Required.** Path to the sdxl model directory | str | None |
| `--force-compile` | Optional. Force recompilation of the models | bool | False |
| `--num-images-per-prompt` | Optional. Number of images generated per prompt in one run | int | 1 |
| `--img-size` | Optional. Output image size | int | 1024 |
| `-p` / `--prompt` | **Required.** Prompt describing the image content, style, generation requirements, etc. | str | None |
| `-n` / `--negative-prompt` | Optional. Negative prompt, e.g. "ugly" | str | None |
| `-t` / `--num-inference-steps` | Optional. Number of denoising steps when generating an image | int | 50 |
| `--seed` | Optional. Random seed | int | 42 |
| `--save-prefix` | Optional. Filename prefix for the saved images | str | None |
## Batch image generation
Prompts and negative prompts covering multiple themes can be collected in a single JSON file, and `tools/run_examples.py` will then generate images for all of them in batch.
```bash
# run sdxl
python tools/run_examples.py \
-m /path/to/sdxl_models \
--examples-json examples/prompts_and_negative_prompts.json \
--output-dir examples/sdxl-images-1024
```
Script arguments:
| Parameter | Description | Type | Default |
| --- | --- | --- | --- |
| `-m` / `--model-dir` | **Required.** Path to the sdxl model directory | str | None |
| `--force-compile` | Optional. Force recompilation of the models | bool | False |
| `--num-images-per-prompt` | Optional. Number of images generated per prompt in one run | int | 1 |
| `--img-size` | Optional. Output image size; if unset, each pipeline's default image size is used | int | None |
| `-t` / `--num-inference-steps` | Optional. Number of denoising steps when generating an image | int | 50 |
| `--seed` | Optional. Random seed | int | 42 |
| `--examples-json` | Optional. Path to the prompts and negative prompts file | str | examples/prompts_and_negative_prompts.json |
| `--output-dir` | Optional. Directory for saving the generated images | str | None |
The prompts and negative prompts file has the following format:
```json
[
{
"theme": "theme0 name",
"examples": [
{
"prompt": "promt0 text here",
"negative_prompt": "negative_prompt0 text here"
},
{
"prompt": "promt1 text here",
"negative_prompt": "negative_prompt1 text here"
},
...
]
},
{
"theme": "theme1 name",
"examples": [
{
"prompt": "promt0 text here",
"negative_prompt": "negative_prompt0 text here"
},
{
"prompt": "promt1 text here",
"negative_prompt": "negative_prompt1 text here"
},
...
]
},
...
]
```
Example: [../examples/prompts_and_negative_prompts.json](../examples/prompts_and_negative_prompts.json)
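For reference, a short sketch of loading and iterating over such a file (this is plain JSON handling, no project-specific API):
```python
import json

# Load the themed prompt / negative-prompt pairs.
with open("examples/prompts_and_negative_prompts.json", "r") as f:
    themes = json.load(f)

for theme in themes:
    print(f"Theme: {theme['theme']}")
    for example in theme["examples"]:
        # Each pair is passed to the pipeline as prompt / negative_prompt.
        print(f"  prompt: {example['prompt']}")
        print(f"  negative_prompt: {example['negative_prompt']}")
```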
## Profiling per-module latency
This project supports measuring the latency of each module via non-intrusive instrumentation. The general steps are:
1. Create a timer;
2. Register the functions or methods to be timed with the timer;
3. Start the timer;
4. Run the functions or methods to be timed;
5. Print the statistics.
A minimal usage example:
```python
import random
import time

from migraphx_diffusers import AutoTimer


def sleep_func(sleep_seconds=1):
    time.sleep(sleep_seconds)


class SleepClass:
    def __init__(self):
        self.min_seconds = 1
        self.max_seconds = 5

    def random_sleep(self):
        time.sleep(random.randint(self.min_seconds, self.max_seconds))

    def __call__(self, sleep_seconds=1):
        time.sleep(sleep_seconds)


obj = SleepClass()
t = AutoTimer()  # step1

# step2
t.add_target(sleep_func, key="sleep_func")
t.add_target(obj.random_sleep, key="random_sleep")
t.add_target(obj, key="__call__")

t.start_work()  # step3

# step4
for i in range(10):
    sleep_func()
    obj()
    if i % 3 == 0:
        obj.random_sleep()

t.summary()  # step5
```
The output looks like this:
```
+--------------------------------------------------------------------+
|                            Test Latency                            |
+--------------+------+----------+----------+-----------+------------+
|    Module    | Runs | Max (ms) | Min (ms) | Mean (ms) | Mean (fps) |
+--------------+------+----------+----------+-----------+------------+
|  sleep_func  |  10  | 1001.06  | 1001.02  |  1001.04  |    1.0     |
|   __call__   |  10  | 1001.07  | 1000.06  |  1000.94  |    1.0     |
| random_sleep |  4   |  4004.1  | 1001.05  |  2252.33  |    0.44    |
+--------------+------+----------+----------+-----------+------------+
```
To collect end-to-end and per-component performance data for sdxl or sd2.1:
```bash
python tools/time_count.py -m /path/to/sdxl_models
```
Script arguments:
| Parameter | Description | Type | Default |
| --- | --- | --- | --- |
| `-m` / `--model-dir` | **Required.** Path to the sdxl model directory | str | None |
| `--force-compile` | Optional. Force recompilation of the models | bool | False |
| `--num-images-per-prompt` | Optional. Number of images generated per prompt in one run | int | 1 |
| `--img-size` | Optional. Output image size; if unset, each pipeline's default image size is used | int | None |
| `-t` / `--num-inference-steps` | Optional. Number of denoising steps when generating an image | int | 50 |
| `--num-warmup-loops` | Optional. Number of warmup iterations | int | 1 |
| `--num-count-loops` | Optional. Number of timed iterations | int | 100 |
| `--out-csv-file` | Optional. Path of the CSV file for saving performance data | str | ./perf-{date}-{time}.csv |
## SD2.1 end-to-end performance test
```bash
python tools/run_sd2_1.py /path/to/sd2.1_models
```
Script arguments:
| Parameter | Description | Type | Default |
| --- | --- | --- | --- |
| `model-dir` | **Positional.** Path to the sd2.1 model directory | str | None |
| `--result-dir` | Optional. Directory for storing the generated images | str | ./results |
The test scenarios are as follows; a sketch of the test loop appears after the list:
+ batchsize: 1, 2, 4, 8
+ image_size: 512
+ num_inference_steps: 20
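A minimal sketch of that test loop, mirroring the loading pattern used by the other tools in this repository (the model path is illustrative, and reloading the pipeline per batch size is an assumption based on how the compiled batch is configured):
```python
import torch
from diffusers import DiffusionPipeline
import migraphx_diffusers  # registers the MIGraphX-backed components
from migraphx_diffusers import get_name_and_migraphx_config

model_dir = "/path/to/sd2.1_models"  # illustrative path
pipe_name, migraphx_config = get_name_and_migraphx_config(model_dir)
migraphx_config['common_args']['img_size'] = 512

for batch in (1, 2, 4, 8):
    # Assumption: the MIGraphX programs are compiled per batch size,
    # so the pipeline is rebuilt for each scenario.
    migraphx_config['common_args']['batch'] = batch
    pipe = DiffusionPipeline.from_pretrained(
        model_dir, torch_dtype=torch.float16, migraphx_config=migraphx_config)
    pipe.to("cuda")
    images = pipe(prompt="a photo of an astronaut riding a horse on mars",
                  num_images_per_prompt=batch,
                  num_inference_steps=20).images
```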
## Model accuracy evaluation
Text-to-image models are usually evaluated with the CLIP score. First, prepare the dataset and the multimodal model:
```bash
# download the dataset
wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate
# download the model
mkdir ./openai
huggingface-cli download openai/clip-vit-base-patch16 --local-dir ./openai/clip-vit-base-patch16 --local-dir-use-symlinks False
```
Generate images from the prompts in the dataset:
```bash
python tools/gen_p2_images.py -m /path/to/models --num-images-per-prompt 4 -p ./PartiPrompts.tsv --save-dir ./p2_images
```
Evaluate the generated results:
```bash
python tools/evaluate.py -m ./openai/clip-vit-base-patch16 -d ./p2_images
```
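Under the hood, `tools/evaluate.py` computes the CLIP score with torchmetrics. A minimal sketch of the metric itself (the random tensor is a stand-in for real generated images):
```python
import torch
from torchmetrics.multimodal import CLIPScore

metric = CLIPScore(model_name_or_path="./openai/clip-vit-base-patch16")

# Random uint8 images in (N, C, H, W) layout stand in for generated outputs.
images = torch.randint(0, 255, (2, 3, 512, 512), dtype=torch.uint8)
prompts = ["a cat holding a sign that says hello world", "the ocean in dream"]

score = metric(images, prompts)  # mean CLIP score over the batch
print(f"mean CLIP score: {score.item():.4f}")
```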
from collections import defaultdict
import json
import os
import os.path as osp
import cv2
import numpy as np
from prettytable import PrettyTable
import torch
import tqdm
from torchmetrics.multimodal import CLIPScore
from torchmetrics.functional.multimodal.clip_score import _clip_score_update
class P2CLIPScore(CLIPScore):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.category2scores = defaultdict(list)
self.category2nprompts = defaultdict(int)
self.category2nimages = defaultdict(int)
def process(self, p2_images_dir):
prompt_dirs = []
for cat_dir_name in os.listdir(p2_images_dir):
for prompt_dir_name in os.listdir(osp.join(p2_images_dir, cat_dir_name)):
prompt_dir = osp.join(p2_images_dir, cat_dir_name, prompt_dir_name)
prompt_dirs.append(prompt_dir)
print("Processing...")
for prompt_dir in tqdm.tqdm(prompt_dirs):
prompt_json = osp.join(prompt_dir, "prompt_info.json")
with open(prompt_json, "r") as f:
prompt_info = json.load(f)
category = prompt_info["category"]
cat_dir_name = prompt_dir.split("/")[-2]
assert cat_dir_name == category.replace(" ", "").replace("&", "_")
imgs = []
for file_name in os.listdir(prompt_dir):
if not file_name.endswith(".png"):
continue
image_path = osp.join(prompt_dir, file_name)
img = cv2.imread(image_path)[None, ...]
imgs.append(img)
assert len(imgs) >= 1
scores, _ = _clip_score_update(
[prompt_info["prompt_text"]] * len(imgs),
torch.from_numpy(np.concatenate(imgs, 0).transpose(0, 3, 1, 2)),
self.model,
self.processor
)
# Keep only the best-scoring image per prompt (max over its generated images).
self.category2scores["All"].append(scores.max().item())
self.category2scores[category].append(scores.max().item())
self.category2nprompts["All"] += 1
self.category2nprompts[category] += 1
self.category2nimages["All"] += len(imgs)
self.category2nimages[category] += len(imgs)
def compute(self, output_json=None):
pt = PrettyTable()
pt.title = "Evaluation Results of PartiPrompts Dataset"
pt.field_names = ["Category", "Num Prompts", "Num Images", "Mean CLIP Score"]
for category, scores in self.category2scores.items():
num_prompts = self.category2nprompts[category]
num_images = self.category2nimages[category]
mean_score = sum(scores) / len(scores)
pt.add_row([category, num_prompts, num_images, round(mean_score, 4)])
print(pt)
if output_json is not None:
with open(output_json, "w") as f:
f.write(pt.get_json_string())
def main():
import argparse
parser = argparse.ArgumentParser(
"Evaluate text2image results of PartiPrompts dataset")
parser.add_argument("-m", "--model-dir",
type=str,
required=True,
help="The path to the model directory.")
parser.add_argument("-d", "--data-dir",
type=str,
required=True,
help="The path to the evaluation data directory.")
parser.add_argument("-o", "--output-json",
type=str,
default=None,
help="Output json file path.")
args = parser.parse_args()
p2_clip_score = P2CLIPScore(args.model_dir)
p2_clip_score.process(args.data_dir)
p2_clip_score.compute(args.output_json)
if __name__ == "__main__":
main()
import argparse
import os
import os.path as osp
import shutil
import onnx
import torch
from transformers import (CLIPTextModel, T5EncoderModel)
from diffusers import FluxTransformer2DModel, AutoencoderKL
def get_local_path(local_dir, model_dir):
model_local_dir = os.path.join(local_dir, model_dir)
if not os.path.exists(model_local_dir):
os.makedirs(model_local_dir)
return model_local_dir
def gather_weights_to_one_file(onnx_path):
onnx_model = onnx.load(onnx_path)
onnx_model_without_data = onnx.load(onnx_path, load_external_data=False)
os.remove(onnx_path) # remove old model file
# remove external data file
dir_path = osp.dirname(onnx_path)
for ini in onnx_model_without_data.graph.initializer:
for ed in ini.external_data:
external_data_path = osp.join(dir_path, ed.value)
if osp.isfile(external_data_path):
os.remove(external_data_path)
for node in onnx_model_without_data.graph.node:
if node.op_type != "Constant":
continue
for attr in node.attribute:
external_data_path = osp.join(
dir_path, attr.t.name.replace('/', '_').replace(':', '_'))
if osp.isfile(external_data_path):
os.remove(external_data_path)
onnx.save(onnx_model,
onnx_path,
save_as_external_data=True,
all_tensors_to_one_file=True,
location="model.onnx.data")
def copy_files(local_dir, save_dir, overwrite=True):
if overwrite or not osp.is_exist(osp.join(save_dir, "scheduler")):
shutil.copytree(osp.join(local_dir, "scheduler"),
osp.join(save_dir, "scheduler"),
dirs_exist_ok=True)
if overwrite or not osp.is_exist(osp.join(save_dir, "tokenizer")):
shutil.copytree(osp.join(local_dir, "tokenizer"),
osp.join(save_dir, "tokenizer"),
dirs_exist_ok=True)
if overwrite or not osp.is_exist(osp.join(save_dir, "tokenizer_2")):
shutil.copytree(osp.join(local_dir, "tokenizer_2"),
osp.join(save_dir, "tokenizer_2"),
dirs_exist_ok=True)
if overwrite or not osp.exists(osp.join(save_dir, 'model_index.json')):
shutil.copy(osp.join(local_dir, 'model_index.json'),
osp.join(save_dir, 'model_index.json'))
for sub_dir in ['text_encoder', 'text_encoder_2', 'transformer', 'vae']:
if overwrite or not osp.exists(
osp.join(save_dir, sub_dir, 'config.json')):
shutil.copy(osp.join(local_dir, sub_dir, 'config.json'),
osp.join(save_dir, sub_dir, 'config.json'))
def export_clip(local_dir,
model_dir="text_encoder",
save_dir=None,
torch_dtype=torch.float32):
save_dir = save_dir or local_dir
clip_save_dir = get_local_path(save_dir, model_dir)
onnx_path = os.path.join(clip_save_dir, "model.onnx")
bs = 1
max_len = 77
sample_inputs = (torch.zeros(bs, max_len, dtype=torch.int32), )
input_names = ["input_ids"]
model = CLIPTextModel.from_pretrained(local_dir,
subfolder=model_dir,
torch_dtype=torch_dtype)
output_names = ["text_embeddings"]
dynamic_axes = {"input_ids": {0: 'B'}, "text_embeddings": {0: 'B'}}
# CLIP export requires nightly pytorch due to bug in onnx parser
with torch.inference_mode():
torch.onnx.export(model,
sample_inputs,
onnx_path,
export_params=True,
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes)
assert os.path.isfile(onnx_path)
gather_weights_to_one_file(onnx_path)
print(f"Success export clip model: {onnx_path}")
return onnx_path
def export_t5(local_dir,
model_dir="text_encoder_2",
save_dir=None,
torch_dtype=torch.float32):
save_dir = save_dir or local_dir
t5_save_dir = get_local_path(save_dir, model_dir)
onnx_path = os.path.join(t5_save_dir, "model.onnx")
bs = 1
max_len = 512
sample_inputs = (torch.zeros(bs, max_len, dtype=torch.int32), )
input_names = ["input_ids"]
model = T5EncoderModel.from_pretrained(local_dir,
subfolder=model_dir,
torch_dtype=torch_dtype)
output_names = ["text_embeddings"]
dynamic_axes = {"input_ids": {0: 'B'}, "text_embeddings": {0: 'B'}}
with torch.inference_mode():
torch.onnx.export(model,
sample_inputs,
onnx_path,
export_params=True,
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes)
assert os.path.isfile(onnx_path)
gather_weights_to_one_file(onnx_path)
print(f"Success export t5 model: {onnx_path}")
return onnx_path
# The following decorators apply an fp16 inference patch to the transformer
# blocks. Note that we do not export fp16 weights directly to ONNX; this
# allows migraphx to perform optimizations before quantizing down to fp16,
# which gives better accuracy than exporting fp16 directly to ONNX.
def transformer_block_clip_wrapper(fn):
def new_forward(*args, **kwargs):
encoder_hidden_states, hidden_states = fn(*args, **kwargs)
return encoder_hidden_states.clip(-65504, 65504), hidden_states
return new_forward
def single_transformer_block_clip_wrapper(fn):
def new_forward(*args, **kwargs):
hidden_states = fn(*args, **kwargs)
return hidden_states.clip(-65504, 65504)
return new_forward
def add_output_clippings_for_fp16(model):
for b in model.transformer_blocks:
b.forward = transformer_block_clip_wrapper(b.forward)
for b in model.single_transformer_blocks:
b.forward = single_transformer_block_clip_wrapper(b.forward)
def export_transformer(local_dir,
model_dir="transformer",
save_dir=None,
torch_dtype=torch.float32,
fp16=True):
save_dir = save_dir or local_dir
transformer_save_dir = get_local_path(save_dir, model_dir)
onnx_path = os.path.join(transformer_save_dir, "model.onnx")
bs = 1
img_height = 1024
img_width = 1024
compression_factor = 8
latent_h = img_height // compression_factor
latent_w = img_width // compression_factor
max_len = 512
config = FluxTransformer2DModel.load_config(local_dir,
subfolder=model_dir)
sample_inputs = (
torch.randn(bs, (latent_h // 2) * (latent_w // 2),
config["in_channels"],
dtype=torch_dtype),
torch.randn(bs,
max_len,
config['joint_attention_dim'],
dtype=torch_dtype),
torch.randn(bs, config['pooled_projection_dim'], dtype=torch_dtype),
torch.tensor([1.] * bs, dtype=torch_dtype),
torch.randn((latent_h // 2) * (latent_w // 2), 3, dtype=torch_dtype),
torch.randn(max_len, 3, dtype=torch_dtype),
torch.tensor([1.] * bs, dtype=torch_dtype),
)
input_names = [
'hidden_states', 'encoder_hidden_states', 'pooled_projections',
'timestep', 'img_ids', 'txt_ids', 'guidance'
]
model = FluxTransformer2DModel.from_pretrained(local_dir,
subfolder=model_dir,
torch_dtype=torch_dtype)
if fp16:
print("applying fp16 clip workarounds to transformer")
add_output_clippings_for_fp16(model)
output_names = ["latent"]
dynamic_axes = {
'hidden_states': {
0: 'B',
1: 'latent_dim'
},
'encoder_hidden_states': {
0: 'B',
1: 'L'
},
'pooled_projections': {
0: 'B'
},
'timestep': {
0: 'B'
},
'img_ids': {
0: 'latent_dim'
},
'txt_ids': {
0: 'L'
},
'guidance': {
0: 'B'
},
}
with torch.inference_mode():
torch.onnx.export(model,
sample_inputs,
onnx_path,
export_params=True,
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes)
assert os.path.isfile(onnx_path)
gather_weights_to_one_file(onnx_path)
print(f"Success export transformer model: {onnx_path}")
return onnx_path
def export_vae(local_dir,
model_dir="vae",
save_dir=None,
torch_dtype=torch.float32):
save_dir = save_dir or local_dir
vae_save_dir = get_local_path(save_dir, model_dir)
onnx_path = os.path.join(vae_save_dir, "model.onnx")
config = AutoencoderKL.load_config(local_dir, subfolder=model_dir)
bs=1
latent_channels = config['latent_channels']
img_height = 1024
img_width = 1024
compression_factor = 8
latent_h = img_height // compression_factor
latent_w = img_width // compression_factor
sample_inputs = (torch.randn(bs,
latent_channels,
latent_h,
latent_w,
dtype=torch_dtype), )
input_names = ["latent"]
model = AutoencoderKL.from_pretrained(local_dir,
subfolder=model_dir,
torch_dtype=torch_dtype)
model.forward = model.decode
output_names = ["images"]
dynamic_axes = {
'latent': {
0: 'B',
2: 'H',
3: 'W'
},
'images': {
0: 'B',
2: '8H',
3: '8W'
}
}
with torch.inference_mode():
torch.onnx.export(model,
sample_inputs,
onnx_path,
export_params=True,
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes)
assert os.path.isfile(onnx_path)
gather_weights_to_one_file(onnx_path)
print(f"Success export vae_decoder model: {onnx_path}")
return onnx_path
def parse_args():
parser = argparse.ArgumentParser(description="export ONNX models")
parser.add_argument("--local-dir",
type=str,
required=True,
help="local directory containing the model")
parser.add_argument("--save-dir",
type=str,
default=None,
help="the directory for saving ONNX models")
args = parser.parse_args()
if args.save_dir is None:
args.save_dir = args.local_dir
return args
def main():
args = parse_args()
local_dir = args.local_dir
save_dir = args.save_dir
os.makedirs(save_dir, exist_ok=True)
export_clip(local_dir, save_dir=save_dir)
export_t5(local_dir, save_dir=save_dir)
export_transformer(local_dir, save_dir=save_dir)
export_vae(local_dir, save_dir=save_dir)
if save_dir != local_dir:
copy_files(local_dir, save_dir, overwrite=True)
if __name__ == "__main__":
main()
from collections import namedtuple
import csv
import json
import os
import os.path as osp
from diffusers import DiffusionPipeline
import migraphx_diffusers
from migraphx_diffusers import get_name_and_migraphx_config
import torch
def parse_args():
from argparse import ArgumentParser
parser = ArgumentParser(description="SDXL inference with migraphx backend")
#=========================== mdoel load and compile ========================
parser.add_argument(
"-m",
"--model-dir",
type=str,
required=True,
help="Path to local model directory.",
)
parser.add_argument(
"--force-compile",
action="store_true",
default=False,
help="Ignore existing .mxr files and override them",
)
parser.add_argument(
"--img-size",
type=int,
default=None,
help="output image size",
)
parser.add_argument(
"--num-images-per-prompt",
type=int,
default=1,
help="The number of images to generate per prompt."
)
# --------------------------------------------------------------------------
# =============================== generation ===============================
parser.add_argument(
"-t",
"--num-inference-steps",
type=int,
default=None,
help="Number of iteration steps",
)
parser.add_argument(
"--true-cfg-scale",
default=None,
type=float,
help="Olny for flux pipeline. When > 1.0 and a provided `negative_prompt`, " \
"enables true classifier-free guidance."
)
parser.add_argument(
"--guidance-scale",
default=None,
type=float,
help="Guidance scale is enabled by setting `guidance_scale > 1`. Higher " \
"guidance scale encourages to generate images that are closely linked to " \
"the text `prompt`, usually at the expense of lower image quality."
)
parser.add_argument(
"-s",
"--seed",
type=int,
default=42,
help="Random seed",
)
# --------------------------------------------------------------------------
# ================================ control =================================
parser.add_argument(
"-p",
"--parti-prompts-file",
type=str,
required=True,
help="Number of iteration steps",
)
parser.add_argument(
"--count-submodels",
action="store_true",
help="count running time for each submodel",
)
parser.add_argument(
"--save-dir",
type=str,
default=None,
help="Path to save images",
)
parser.add_argument(
"--resume",
action="store_true",
help="resume image generation",
)
# --------------------------------------------------------------------------
args = parser.parse_args()
return args
def parse_prompts(parti_prompts_file):
Prompt = namedtuple("Prompt",
["prompt_text", "category", "challenge", "note"])
prompt_list = []
with open(parti_prompts_file, "r") as f:
csv_reader = csv.reader(f, delimiter="\t")
for i, row in enumerate(csv_reader):
if i == 0:
continue
prompt_list.append(Prompt(*row))
return prompt_list
def main():
args = parse_args()
pipe_name, migraphx_config = get_name_and_migraphx_config(args.model_dir)
if args.img_size is not None:
migraphx_config['common_args']['img_size'] = args.img_size
migraphx_config['common_args'].update(dict(
batch=args.num_images_per_prompt,
force_compile=args.force_compile,
))
pipe = DiffusionPipeline.from_pretrained(
args.model_dir,
torch_dtype=torch.float16,
migraphx_config=migraphx_config
)
pipe.to("cuda")
call_kwargs = {}
if args.num_inference_steps is not None:
call_kwargs['num_inference_steps'] = args.num_inference_steps
if args.guidance_scale is not None:
call_kwargs['guidance_scale'] = args.guidance_scale
if args.true_cfg_scale is not None:
assert pipe_name == 'flux.1-dev', \
"`true_cfg_scale` is only valid for flux.1-dev pipeline!"
call_kwargs['true_cfg_scale'] = args.true_cfg_scale
if args.seed is not None:
call_kwargs['generator'] = torch.Generator("cuda").manual_seed(args.seed)
os.makedirs(args.save_dir, exist_ok=True)
print("Generating image...")
for i, prompt in enumerate(parse_prompts(args.parti_prompts_file)):
sub_dir = osp.join(args.save_dir,
prompt.category.replace(" ", "").replace("&", "_"),
f"prompt_{i:0>4d}")
prompt_json = osp.join(sub_dir, "prompt_info.json")
# =========================== resume =========================
if args.resume:
check_file_list = [osp.join(sub_dir, f"image_{j:0>2d}.png")
for j in range(args.num_images_per_prompt)]
check_file_list.append(prompt_json)
if all([osp.exists(f) for f in check_file_list]):
print(f"Skipping prompt {i}: \"{prompt.prompt_text}\"")
continue
# =========================== generate image =========================
print(f"Processing prompt {i}: \"{prompt.prompt_text}\"")
if not osp.isdir(sub_dir):
os.makedirs(sub_dir, exist_ok=True)
with open(prompt_json, "w") as f:
json.dump(prompt._asdict(), f)
images = pipe(
prompt=prompt.prompt_text,
**call_kwargs
).images
for j, image in enumerate(images):
save_path = osp.join(sub_dir, f"{j:0>2d}.png")
image.save(save_path)
print(f"Generated image: {save_path}")
if __name__ == "__main__":
main()
import json
import os
import os.path as osp
from diffusers import DiffusionPipeline
import migraphx_diffusers
from migraphx_diffusers import get_name_and_migraphx_config
import torch
def parse_args():
from argparse import ArgumentParser
parser = ArgumentParser(description="SDXL inference with migraphx backend")
#=========================== mdoel load and compile ========================
parser.add_argument(
"-m",
"--model-dir",
type=str,
required=True,
help="Path to local model directory.",
)
parser.add_argument(
"--force-compile",
action="store_true",
default=False,
help="Ignore existing .mxr files and override them",
)
parser.add_argument(
"--num-images-per-prompt",
type=int,
default=1,
help="The number of images to generate per prompt."
)
parser.add_argument(
"--img-size",
type=int,
default=None,
help="output image size",
)
# --------------------------------------------------------------------------
# =============================== generation ===============================
parser.add_argument(
"-t",
"--num-inference-steps",
type=int,
default=None,
help="Number of iteration steps",
)
parser.add_argument(
"--true-cfg-scale",
default=None,
type=float,
help="Olny for flux pipeline. When > 1.0 and a provided `negative_prompt`, " \
"enables true classifier-free guidance."
)
parser.add_argument(
"--guidance-scale",
default=None,
type=float,
help="Guidance scale is enabled by setting `guidance_scale > 1`. Higher " \
"guidance scale encourages to generate images that are closely linked to " \
"the text `prompt`, usually at the expense of lower image quality."
)
parser.add_argument(
"-s",
"--seed",
type=int,
default=42,
help="Random seed",
)
# --------------------------------------------------------------------------
parser.add_argument(
"--examples-json",
type=str,
default="./examples/prompts_and_negative_prompts.json",
help="Prompts and negative prompts data path",
)
parser.add_argument(
"--output-dir",
type=str,
default=None,
help="Path to save images",
)
args = parser.parse_args()
return args
def parse_prompts(examples_json):
with open(examples_json, 'r') as f:
prompt_data = json.load(f)
return prompt_data
def main():
args = parse_args()
pipe_name, migraphx_config = get_name_and_migraphx_config(args.model_dir)
if args.output_dir is None:
args.output_dir = f"./examples/{pipe_name}-images-{args.img_size}"
if args.img_size is not None:
migraphx_config['common_args']['img_size'] = args.img_size
migraphx_config['common_args'].update(dict(
batch=args.num_images_per_prompt,
force_compile=args.force_compile,
))
pipe = DiffusionPipeline.from_pretrained(
args.model_dir,
torch_dtype=torch.float16,
migraphx_config=migraphx_config
)
pipe.to("cuda")
call_kwargs = {}
if args.num_inference_steps is not None:
call_kwargs['num_inference_steps'] = args.num_inference_steps
if args.guidance_scale is not None:
call_kwargs['guidance_scale'] = args.guidance_scale
if args.true_cfg_scale is not None:
assert pipe_name == 'flux.1-dev', \
"`true_cfg_scale` is only valid for flux.1-dev pipeline!"
call_kwargs['true_cfg_scale'] = args.true_cfg_scale
if args.seed is not None:
call_kwargs['generator'] = torch.Generator("cuda").manual_seed(args.seed)
prompt_data = parse_prompts(args.examples_json)
cnt = 0
for i, d in enumerate(prompt_data):
theme = d["theme"]
pairs = d["examples"]
sub_dir = osp.join(args.output_dir,
f"{i}-{theme.title().replace(' ', '')}")
os.makedirs(sub_dir, exist_ok=True)
for j, pair in enumerate(pairs):
print(f"Generating image {cnt}...")
prompt = pair["prompt"]
negative_prompt = pair["negative_prompt"]
print(f"Prompt: {prompt}")
print(f"negative Prompt: {negative_prompt}")
images = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
**call_kwargs
).images
for k, image in enumerate(images):
save_path = osp.join(
sub_dir, f"theme_{i}_example_{j}_image_{k}.png")
image.save(save_path)
print(f"Image saved: {save_path}")
cnt += 1
print(f"Total {cnt} images Generated!")
if __name__ == "__main__":
main()
import json
import os.path as osp
from diffusers import DiffusionPipeline
import migraphx_diffusers
from migraphx_diffusers import get_name_and_migraphx_config
import torch
def parse_args():
from argparse import ArgumentParser
parser = ArgumentParser(description="SDXL inference with migraphx backend")
#=========================== mdoel load and compile ========================
parser.add_argument(
"-m",
"--model-dir",
type=str,
required=True,
help="Path to local model directory.",
)
parser.add_argument(
"--force-compile",
action="store_true",
default=False,
help="Ignore existing .mxr files and override them",
)
parser.add_argument(
"--img-size",
type=int,
default=None,
help="output image size",
)
parser.add_argument(
"--num-images-per-prompt",
type=int,
default=1,
help="The number of images to generate per prompt."
)
# --------------------------------------------------------------------------
# =============================== generation ===============================
parser.add_argument(
"-p",
"--prompt",
type=str,
required=True,
help="Prompt for describe image content, style and so on."
)
parser.add_argument(
"-n",
"--negative-prompt",
type=str,
default=None,
help="Negative prompt",
)
parser.add_argument(
"-t",
"--num-inference-steps",
type=int,
default=None,
help="Number of iteration steps",
)
parser.add_argument(
"--true-cfg-scale",
default=None,
type=float,
help="Olny for flux pipeline. When > 1.0 and a provided `negative_prompt`, " \
"enables true classifier-free guidance."
)
parser.add_argument(
"--guidance-scale",
default=None,
type=float,
help="Guidance scale is enabled by setting `guidance_scale > 1`. Higher " \
"guidance scale encourages to generate images that are closely linked to " \
"the text `prompt`, usually at the expense of lower image quality."
)
parser.add_argument(
"-s",
"--seed",
type=int,
default=42,
help="Random seed",
)
parser.add_argument(
"--save-prefix",
type=str,
default=None,
help="Prefix of path for saving results",
)
# --------------------------------------------------------------------------
args = parser.parse_args()
return args
def main():
args = parse_args()
pipe_name, migraphx_config = get_name_and_migraphx_config(args.model_dir)
if args.save_prefix is None:
args.save_prefix = f"./{pipe_name}_output"
if args.img_size is not None:
migraphx_config['common_args']['img_size'] = args.img_size
migraphx_config['common_args'].update(dict(
batch=args.num_images_per_prompt,
force_compile=args.force_compile,
))
pipe = DiffusionPipeline.from_pretrained(
args.model_dir,
torch_dtype=torch.float16,
migraphx_config=migraphx_config
)
pipe.to("cuda")
call_kwargs = {}
if args.num_inference_steps is not None:
call_kwargs['num_inference_steps'] = args.num_inference_steps
if args.guidance_scale is not None:
call_kwargs['guidance_scale'] = args.guidance_scale
if args.true_cfg_scale is not None:
assert pipe_name == 'flux.1-dev', \
"`true_cfg_scale` is only valid for flux.1-dev pipeline!"
call_kwargs['true_cfg_scale'] = args.true_cfg_scale
if args.seed is not None:
call_kwargs['generator'] = torch.Generator("cuda").manual_seed(args.seed)
print("Generating image...")
images = pipe(
prompt=args.prompt,
negative_prompt=args.negative_prompt,
**call_kwargs
).images
for i, image in enumerate(images):
save_path = f"{args.save_prefix}_{i}.png"
image.save(save_path)
print(f"Generated image: {save_path}")
if __name__ == "__main__":
main()
import json
import os.path as osp
import time
from diffusers import DiffusionPipeline
import migraphx_diffusers
from migraphx_diffusers import AutoTimer, get_name_and_migraphx_config
import torch
def parse_args():
date_str = time.strftime("%Y%m%d-%H%M%S", time.localtime())
from argparse import ArgumentParser
parser = ArgumentParser(description="SDXL inference with migraphx backend")
#=========================== mdoel load and compile ========================
parser.add_argument(
"-m",
"--model-dir",
type=str,
required=True,
help="Path to local model directory.",
)
parser.add_argument(
"--force-compile",
action="store_true",
default=False,
help="Ignore existing .mxr files and override them",
)
parser.add_argument(
"--img-size",
type=int,
default=None,
help="output image size",
)
parser.add_argument(
"--num-images-per-prompt",
type=int,
default=1,
help="The number of images to generate per prompt."
)
# --------------------------------------------------------------------------
# =============================== generation ===============================
parser.add_argument(
"-t",
"--num-inference-steps",
type=int,
default=50,
help="Number of iteration steps",
)
parser.add_argument(
"--out-csv-file",
type=str,
default=f"./perf-{date_str}.csv",
help="Prefix of path for saving results",
)
# --------------------------------------------------------------------------
# =============================== time count ===============================
parser.add_argument(
"--count-submodels",
action="store_true",
help="count running time for each submodel",
)
parser.add_argument(
"--num-warmup-loops",
type=int,
default=1,
help="warmup loops",
)
parser.add_argument(
"--num-count-loops",
type=int,
default=100,
help="time count loops",
)
# --------------------------------------------------------------------------
args = parser.parse_args()
return args
def get_default_prompt(pipe_name):
negative_prompt = "ugly"
if pipe_name == 'sd2.1':
prompt = "a photo of an astronaut riding a horse on mars"
elif pipe_name == 'sdxl':
prompt = "An astronaut riding a green horse", None
elif pipe_name == 'flux.1-dev':
prompt = "A cat holding a sign that says hello world"
else:
raise ValueError(f"{pipe_name} is not supported!")
return prompt, negative_prompt
def set_timer(timer, pipe, pipe_name, count_submodels=False):
timer.add_target(pipe, key="end2end")
if not count_submodels:
return
if pipe_name == 'sd2.1':
timer.add_targets([
(pipe.text_encoder, "text_encoder"),
(pipe.unet, "unet"),
(pipe.vae.decode, "vae_decoder")
])
elif pipe_name == 'sdxl':
timer.add_targets([
(pipe.text_encoder, "text_encoder"),
(pipe.text_encoder_2, "text_encoder_2"),
(pipe.unet, "unet"),
(pipe.vae.decode, "vae_decoder")
])
elif pipe_name == 'flux.1-dev':
timer.add_targets([
(pipe.text_encoder, "text_encoder"),
(pipe.text_encoder_2, "text_encoder_2"),
(pipe.transformer, "transformer"),
(pipe.vae.decode, "vae_decoder")
])
else:
raise ValueError(f"{pipe_name} is not supported!")
def test_latency(pipe, timer, prompt, negative_prompt=None, batch=1,
num_inference_steps=50, num_warmup_loops=1,
num_count_loops=100, title=None, out_csv_file=None,
**call_kwargs):
date_str = time.strftime("%Y%m%d-%H%M%S", time.localtime())
if not out_csv_file:
out_csv_file = f"./perf-{date_str}.csv"
for i in range(num_warmup_loops + num_count_loops):
if i == num_warmup_loops:
timer.start_work()
pipe(prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=num_inference_steps,
**call_kwargs)
table = timer.summary(batchsize=batch, title=title)
with open(out_csv_file, 'a') as f:
f.write(table.get_csv_string())
timer.clear()
timer.finish_work()
def main():
args = parse_args()
pipe_name, migraphx_config = get_name_and_migraphx_config(args.model_dir)
assert pipe_name in ['sdxl', 'sd2.1', 'flux.1-dev'], \
"Only support (1)SDXL (2)SD2.1 (3)Flux.1-dev!"
if args.img_size is not None:
migraphx_config['common_args']['img_size'] = args.img_size
migraphx_config['common_args'].update(dict(
batch=args.num_images_per_prompt,
force_compile=args.force_compile,
))
pipe = DiffusionPipeline.from_pretrained(
args.model_dir,
torch_dtype=torch.float16,
migraphx_config=migraphx_config
)
pipe.to("cuda")
t = AutoTimer()
set_timer(t, pipe, pipe_name, count_submodels=args.count_submodels)
prompt, negative_prompt = get_default_prompt(pipe_name)
test_latency(pipe, t, prompt,
batch=args.num_images_per_prompt,
num_inference_steps=args.num_inference_steps,
num_warmup_loops=args.num_warmup_loops,
num_count_loops=args.num_count_loops,
title=f"{pipe_name} Latency (Only Prompt)",
out_csv_file=args.out_csv_file)
if pipe_name == 'flux.1-dev':
test_latency(pipe, t, prompt,
negative_prompt=negative_prompt,
batch=args.num_images_per_prompt,
num_inference_steps=args.num_inference_steps,
num_warmup_loops=args.num_warmup_loops,
num_count_loops=args.num_count_loops,
title=f"{pipe_name} Latency (Prompt + NegativePrompt)",
out_csv_file=args.out_csv_file,
true_cfg_scale=2.0)
if __name__ == "__main__":
main()