Commit 0c0b0c34 authored by jerrrrry's avatar jerrrrry
Browse files

feat: Add complete SDXL benchmark CI/CD pipeline with caching

parents
# Docker image used by every job in the pipeline (DCU/vLLM base image with PyTorch + DTK).
image: image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724

# Pipeline stages, executed in order.
stages:
  - prepare
  - install
  - test

# === Stage 1: prepare — download all large assets (models + native library tarballs) ===
download_assets:
  stage: prepare
  tags:
    - demos
  script:
    - echo "--- 1. Preparing assets ---"
    # Install the ModelScope CLI used to fetch the models.
    - pip install modelscope
    # Download the models.
    - modelscope download --model AI-ModelScope/sdxl-vae-fp16-fix --local_dir ./sdxl-vae-fp16-fix
    - modelscope download --model stabilityai/stable-diffusion-xl-base-1.0 --local_dir ./stable-diffusion-xl-base-1.0
    # Download the native library tarballs (-f fails on HTTP errors, -C - resumes partial downloads).
    - curl -f -C - -o rocblas-install-0626-bug.tar.gz https://wuzh01.hpccube.com:65015/efile/s/d/amVycnJycnk=/a6a7342d017b1748
    - curl -f -C - -o package-miopen-dev-0801-ubuntu20.tar.gz https://ksefile.hpccube.com:65241/efile/s/d/amVycnJycnk=/9c2334cb9cdb8b39
    - echo "--- Asset download finished ---"
  # Cache the downloaded assets, keyed by branch name, so later pipelines skip the downloads.
  cache:
    key: ${CI_COMMIT_REF_SLUG}
    paths:
      - sdxl-vae-fp16-fix/
      - stable-diffusion-xl-base-1.0/
      - rocblas-install-0626-bug.tar.gz
      - package-miopen-dev-0801-ubuntu20.tar.gz

# === Stage 2: install — extract libraries, swap VAE weights, install Python packages ===
setup_environment:
  stage: install
  tags:
    - demos
  script:
    - echo "--- 2. Setting up environment ---"
    # Extract the native libraries downloaded in stage 1.
    - tar -xzvf rocblas-install-0626-bug.tar.gz
    - tar -xzvf package-miopen-dev-0801-ubuntu20.tar.gz
    # Replace the base model's VAE weights with the fp16-fix variant.
    - cd stable-diffusion-xl-base-1.0
    - mv vae vae_bak || true  # ignore the error if vae_bak already exists
    - mkdir -p vae
    - cp ../sdxl-vae-fp16-fix/c* ./vae/
    - cp ../sdxl-vae-fp16-fix/s* ./vae/
    - cp ../sdxl-vae-fp16-fix/d* ./vae/
    - cd ..
    # Download and install the required Python wheels.
    - curl -f -C - -o diffusers-0.33.1-py3-none-any.whl https://ksefile.hpccube.com:65241/efile/s/d/amVycnJycnk=/63ec0d10ce960f90
    - curl -f -C - -o lightop-0.5.0+das.dtk25041-cp310-cp310-linux_x86_64.whl https://ksefile.hpccube.com:65241/efile/s/d/amVycnJycnk=/da522f7e175bb092
    - curl -f -C - -o litserve-0.2.15-py3-none-any.whl https://ksefile.hpccube.com:65241/efile/s/d/amVycnJycnk=/36943fb1ca62ac3b
    - pip install diffusers-0.33.1-py3-none-any.whl lightop-0.5.0+das.dtk25041-cp310-cp310-linux_x86_64.whl litserve-0.2.15-py3-none-any.whl
    # Replace transformers with the patched source tree (put on PYTHONPATH in stage 3).
    - curl -f -C - -o transformers.tar.gz https://ksefile.hpccube.com:65241/efile/s/d/amVycnJycnk=/f587d939a37d1727
    - tar -xzvf transformers.tar.gz
    - echo "--- Environment setup finished ---"
  # Reuse the branch cache and additionally store the extracted directories.
  # NOTE(review): the tarball entries cached by stage 1 are not listed here, so a
  # cache push from this job drops them — confirm that is intended before a retry
  # of stage 2 needs to re-extract.
  cache:
    key: ${CI_COMMIT_REF_SLUG}
    paths:
      - sdxl-vae-fp16-fix/
      - stable-diffusion-xl-base-1.0/
      - rocblas-install/
      - package-miopen-dev-0801-ubuntu20/
      - transformers/
    policy: pull-push

# === Stage 3: test — run the SDXL benchmark ===
benchmark_sdxl:
  stage: test
  tags:
    - demos
  # Export every environment variable the benchmark needs before the script runs.
  before_script:
    - export LD_LIBRARY_PATH=$CI_PROJECT_DIR/rocblas-install/lib/:$LD_LIBRARY_PATH
    - export LD_LIBRARY_PATH=$CI_PROJECT_DIR/package-miopen-dev-0801-ubuntu20/lib/:$LD_LIBRARY_PATH
    - export PYTORCH_MIOPEN_SUGGEST_NHWC=1
    - export PYTHONPATH="$CI_PROJECT_DIR/transformers:$PYTHONPATH"
    # Model path consumed by test.py.
    - export MODEL_PATH="$CI_PROJECT_DIR/stable-diffusion-xl-base-1.0"
  script:
    - echo "--- 3. Running SDXL benchmark ---"
    # Single-quoted for YAML: a plain scalar containing ": " would be parsed as a mapping.
    - 'echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"'
    - 'echo "PYTHONPATH: $PYTHONPATH"'
    - python test.py
    - echo "--- Benchmark finished ---"
  # Save the generated images and the performance report as job artifacts.
  artifacts:
    name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME"
    paths:
      # Quoted: an unquoted leading "*" is parsed as a YAML alias, not a glob.
      - "*.png"
      - results.json
    # Keep artifacts for one week.
    expire_in: 1 week
  # Only pull the branch cache; this job never updates it.
  cache:
    key: ${CI_COMMIT_REF_SLUG}
    policy: pull
SDXL Benchmark CI/CD
This project uses GitLab CI/CD to automate the benchmarking of the Stable Diffusion XL model on a DCU environment.
Pipeline Stages
prepare: Downloads the SDXL models and necessary libraries (rocblas, miopen). These files are cached to speed up subsequent pipelines.
install: Extracts libraries, replaces VAE weights, and installs specific Python packages. The installed packages are also cached.
test: Sets up environment variables and runs the test.py benchmark script. The output images and a results.json performance report are saved as artifacts.
How it Works
The pipeline runs on a GitLab Runner with the demos tag.
It uses a specific Docker image that contains the base PyTorch and DCU environment.
All heavy lifting (downloads, installations) is cached based on the branch name (${CI_COMMIT_REF_SLUG}).
The final test results, including generated images and performance metrics, are available for download from the CI/CD job artifacts page.
import os
import json
import torch
import time
from diffusers import DiffusionPipeline

# Benchmark script for SDXL on DCU: loads the pipeline, compiles its heavy
# submodules, runs a fixed configuration matrix, and writes results.json plus
# one PNG per generated image (both collected as CI artifacts).

# Model path comes from the environment so the CI job can point at its cached checkout.
MODEL_PATH = os.environ.get("MODEL_PATH", "/workspace/stable-diffusion-xl-base-1.0")

# Load the SDXL pipeline with fp16 weights.
print(f"Loading model from: {MODEL_PATH}")
pipe = DiffusionPipeline.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
pipe.to("cuda")

# Compile the heavy submodules (inductor backend, static shapes); the warm-up
# below triggers compilation before any timed run.
# NOTE(review): the original enabled torch.autograd.set_detect_anomaly(True);
# that is a debugging aid that adds autograd bookkeeping overhead, so it has
# been removed from this timing benchmark.
pipe.text_encoder = torch.compile(pipe.text_encoder, backend="inductor", dynamic=False)
pipe.text_encoder_2 = torch.compile(pipe.text_encoder_2, backend="inductor", dynamic=False)
if hasattr(pipe, "unet"):
    pipe.unet = torch.compile(pipe.unet, backend="inductor", dynamic=False)

# Fixed prompt used by every benchmark run.
prompt = "An astronaut riding a green horse"

# Benchmark matrix. zip() pairs widths with heights, so only the square
# resolutions 1024x1024 and 2048x2048 are measured: 2 sizes x 2 step counts
# x 1 batch size = 4 configurations.
widths = [1024, 2048]
heights = [1024, 2048]
steps_list = [10, 20]
batch_sizes = [2]

# Warm up: run each resolution/batch combination once so torch.compile
# finishes before timing starts.
print("Warming up...")
pipe(prompt=prompt, width=1024, height=1024, num_inference_steps=10, num_images_per_prompt=1)
pipe(prompt=prompt, width=2048, height=2048, num_inference_steps=10, num_images_per_prompt=1)
pipe(prompt=prompt, width=1024, height=1024, num_inference_steps=10, num_images_per_prompt=2)
pipe(prompt=prompt, width=2048, height=2048, num_inference_steps=10, num_images_per_prompt=2)
print("Warm up finished. Starting benchmark...")

# One entry per configuration: {"config": ..., "avg_time_ms": ...}.
all_results = []

for width, height in zip(widths, heights):
    for num_inference_steps in steps_list:
        for batch_size in batch_sizes:
            print(f"\n生成配置: {width}x{height}, steps={num_inference_steps}, batch={batch_size}")
            time_list = []
            # Time 5 runs; empty_cache + synchronize before starting and
            # synchronize after finishing keep queued GPU work out of the
            # wall-clock measurement.
            for _ in range(5):
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                time_start = time.time()
                result = pipe(
                    prompt=prompt,
                    width=width,
                    height=height,
                    num_inference_steps=num_inference_steps,
                    num_images_per_prompt=batch_size,
                )
                torch.cuda.synchronize()
                time_list.append((time.time() - time_start) * 1000)
            avg_time = sum(time_list) / len(time_list)
            print(f"time cost: {time_list}, avg: {avg_time:.2f} ms")
            # Save the images from the last timed run of this configuration.
            for idx, image in enumerate(result.images):
                filename = f"output_{width}x{height}_steps{num_inference_steps}_batch{batch_size}_{idx}.png"
                image.save(filename)
                # NOTE(review): this line was garbled in the source; the
                # {filename} placeholder is the reconstructed intent.
                print(f"保存图片: {filename}")
            # Record the averaged latency for this configuration.
            all_results.append({
                "config": f"{width}x{height}_steps{num_inference_steps}_batch{batch_size}",
                "avg_time_ms": round(avg_time, 2),
            })

# Persist the benchmark summary for the CI artifacts.
with open("results.json", "w") as f:
    json.dump(all_results, f, indent=4)

print("\n所有配置组合生成完成!结果已保存到 results.json")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment