Unverified Commit dd5a3629 authored by Dhruv Nair's avatar Dhruv Nair Committed by GitHub
Browse files

New Pipeline Slow Test runners (#5131)



* pipline fetcher

* update script

* clean up

* clean up

* clean up

* new pipeline runner

* rename tests to match modules

* test actions in pr

* change runner to gpu

* clean up

* clean up

* clean up

* fix report

* fix reporting

* clean up

* show test stats in failure reports

* give names to jobs

* add lora tests

* split torch cuda tests and add compile tests

* clean up

* fix tests

* change push to run only on main

---------
Co-authored-by: default avatarPatrick von Platen <patrick.v.platen@gmail.com>
parent 7271f8b7
name: Slow tests on main
name: Slow Tests on main
on:
push:
branches:
- main
env:
DIFFUSERS_IS_CI: yes
HF_HOME: /mnt/cache
......@@ -12,53 +13,115 @@ env:
MKL_NUM_THREADS: 8
PYTEST_TIMEOUT: 600
RUN_SLOW: yes
PIPELINE_USAGE_CUTOFF: 50000
jobs:
run_slow_tests:
setup_torch_cuda_pipeline_matrix:
name: Setup Torch Pipelines CUDA Slow Tests Matrix
runs-on: docker-gpu
container:
image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
options: --shm-size "16gb" --ipc host
outputs:
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m pip install -e .[quality,test]
python -m pip install git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
- name: Fetch Pipeline Matrix
id: fetch_pipeline_matrix
run: |
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
echo $matrix
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
- name: Pipeline Tests Artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: test-pipelines.json
path: reports
torch_pipelines_cuda_tests:
name: Torch Pipelines CUDA Slow Tests
needs: setup_torch_cuda_pipeline_matrix
strategy:
fail-fast: false
max-parallel: 1
matrix:
config:
- name: Slow PyTorch CUDA tests on Ubuntu
framework: pytorch
runner: docker-gpu
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
runs-on: docker-gpu
container:
image: diffusers/diffusers-pytorch-cuda
report: torch_cuda
- name: Slow Flax TPU tests on Ubuntu
framework: flax
runner: docker-tpu
image: diffusers/diffusers-flax-tpu
report: flax_tpu
- name: Slow ONNXRuntime CUDA tests on Ubuntu
framework: onnxruntime
runner: docker-gpu
image: diffusers/diffusers-onnxruntime-cuda
report: onnx_cuda
name: ${{ matrix.config.name }}
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m pip install -e .[quality,test]
python -m pip install git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
- name: Slow PyTorch CUDA checkpoint tests on Ubuntu
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
runs-on: ${{ matrix.config.runner }}
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: pipeline_${{ matrix.module }}_test_reports
path: reports
torch_cuda_tests:
name: Torch CUDA Tests
runs-on: docker-gpu
container:
image: ${{ matrix.config.image }}
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
strategy:
matrix:
module: [models, schedulers, lora, others]
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
if : ${{ matrix.config.runner == 'docker-gpu' }}
run: |
nvidia-smi
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
......@@ -70,47 +133,121 @@ jobs:
python utils/print_env.py
- name: Run slow PyTorch CUDA tests
if: ${{ matrix.config.framework == 'pytorch' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "not Flax and not Onnx and not compile" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_cuda \
tests/${{ matrix.module }}
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_torch_cuda_stats.txt
cat reports/tests_torch_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: torch_cuda_test_reports
path: reports
flax_tpu_tests:
name: Flax TPU Tests
runs-on: docker-tpu
container:
image: diffusers/diffusers-flax-tpu
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
defaults:
run:
shell: bash
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m pip install -e .[quality,test]
python -m pip install git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
- name: Run slow Flax TPU tests
if: ${{ matrix.config.framework == 'flax' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 0 \
-s -v -k "Flax" \
--make-reports=tests_${{ matrix.config.report }} \
--make-reports=tests_flax_tpu \
tests/
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_flax_tpu_stats.txt
cat reports/tests_flax_tpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: flax_tpu_test_reports
path: reports
onnx_cuda_tests:
name: ONNX CUDA Tests
runs-on: docker-gpu
container:
image: diffusers/diffusers-onnxruntime-cuda
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
defaults:
run:
shell: bash
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install dependencies
run: |
apt-get update && apt-get install libsndfile1-dev libgl1 -y
python -m pip install -e .[quality,test]
python -m pip install git+https://github.com/huggingface/accelerate.git
- name: Environment
run: |
python utils/print_env.py
- name: Run slow ONNXRuntime CUDA tests
if: ${{ matrix.config.framework == 'onnxruntime' }}
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-s -v -k "Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
--make-reports=tests_onnx_cuda \
tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
run: |
cat reports/tests_onnx_cuda_stats.txt
cat reports/tests_onnx_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ matrix.config.report }}_test_reports
name: onnx_cuda_test_reports
path: reports
run_torch_compile_tests:
......@@ -131,21 +268,17 @@ jobs:
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
python -m pip install -e .[quality,test,training]
- name: Environment
run: |
python utils/print_env.py
- name: Run example tests on GPU
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
......@@ -192,7 +325,9 @@ jobs:
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/examples_torch_cuda_failures_short.txt
run: |
cat reports/examples_torch_cuda_stats.txt
cat reports/examples_torch_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
......
......@@ -213,7 +213,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
do_center_crop=False,
do_normalize=False,
return_tensors="pt",
)["pixel_values"].to(self.device)
)["pixel_values"].to(device)
image_batch_size = image.shape[0]
if image_batch_size == 1:
......@@ -365,7 +365,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
height=height,
batch_size=batch_size,
num_images_per_prompt=1,
device=self.device,
device=device,
dtype=self.controlnet.dtype,
do_classifier_free_guidance=do_classifier_free_guidance,
)
......
......@@ -765,8 +765,9 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
if needs_upcasting:
self.upcast_vae()
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
# Ensure latents are always the same type as the VAE
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
# cast back to fp16 if needed
......
......@@ -1554,7 +1554,7 @@ class UNet2DConditionLoRAModelTests(unittest.TestCase):
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_lora_xformers_on_off(self, expected_max_diff=1e-4):
def test_lora_xformers_on_off(self, expected_max_diff=6e-4):
# enable deterministic behavior for gradient checkpointing
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
......
......@@ -39,6 +39,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu,
slow,
torch_device,
......@@ -550,7 +551,7 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/boy_ray_ban.npy"
)
assert np.abs(expected_image - image).max() < 0.9e-1
assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 1e-2
def test_load_local(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
......
......@@ -221,6 +221,9 @@ class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.Test
def test_float16_inference(self):
super().test_float16_inference(expected_max_diff=1e-1)
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=5e-4)
@nightly
@require_torch_gpu
......
......@@ -110,7 +110,7 @@ class FlaxPipelineTests(unittest.TestCase):
assert images.shape == (num_samples, 1, 512, 512, 3)
if jax.device_count() == 8:
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-3
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-2
assert np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
def test_stable_diffusion_v1_4_bfloat_16(self):
......@@ -139,7 +139,7 @@ class FlaxPipelineTests(unittest.TestCase):
assert images.shape == (num_samples, 1, 512, 512, 3)
if jax.device_count() == 8:
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 1e-3
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 5e-2
assert np.abs((np.abs(images, dtype=np.float32).sum() - 2373516.75)) < 5e-1
def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
......@@ -168,7 +168,7 @@ class FlaxPipelineTests(unittest.TestCase):
assert images.shape == (num_samples, 1, 512, 512, 3)
if jax.device_count() == 8:
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 1e-3
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.04003906)) < 5e-2
assert np.abs((np.abs(images, dtype=np.float32).sum() - 2373516.75)) < 5e-1
def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
......@@ -212,7 +212,7 @@ class FlaxPipelineTests(unittest.TestCase):
assert images.shape == (num_samples, 1, 512, 512, 3)
if jax.device_count() == 8:
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 1e-3
assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 5e-2
assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1
def test_jax_memory_efficient_attention(self):
......
import json
import logging
import os
from collections import defaultdict
from pathlib import Path
from huggingface_hub import HfApi, ModelFilter
import diffusers
PATH_TO_REPO = Path(__file__).parent.parent.resolve()
ALWAYS_TEST_PIPELINE_MODULES = [
"controlnet",
"stable_diffusion",
"stable_diffusion_2",
"stable_diffusion_xl",
"deepfloyd_if",
"kandinsky",
"kandinsky2_2",
"text_to_video_synthesis",
"wuerstchen",
]
PIPELINE_USAGE_CUTOFF = int(os.getenv("PIPELINE_USAGE_CUTOFF", 50000))
logger = logging.getLogger(__name__)
api = HfApi()
filter = ModelFilter(library="diffusers")
def filter_pipelines(usage_dict, usage_cutoff=10000):
output = []
for diffusers_object, usage in usage_dict.items():
if usage < usage_cutoff:
continue
if "Pipeline" in diffusers_object:
output.append(diffusers_object)
return output
def fetch_pipeline_objects():
models = api.list_models(filter=filter)
downloads = defaultdict(int)
for model in models:
is_counted = False
for tag in model.tags:
if tag.startswith("diffusers:"):
is_counted = True
downloads[tag[len("diffusers:") :]] += model.downloads
if not is_counted:
downloads["other"] += model.downloads
# Remove 0 downloads
downloads = {k: v for k, v in downloads.items() if v > 0}
pipeline_objects = filter_pipelines(downloads, PIPELINE_USAGE_CUTOFF)
return pipeline_objects
def fetch_pipeline_modules_to_test():
try:
pipeline_objects = fetch_pipeline_objects()
except Exception as e:
logger.error(e)
raise RuntimeError("Unable to fetch model list from HuggingFace Hub.")
test_modules = []
for pipeline_name in pipeline_objects:
module = getattr(diffusers, pipeline_name)
test_module = module.__module__.split(".")[-2].strip()
test_modules.append(test_module)
return test_modules
def main():
test_modules = fetch_pipeline_modules_to_test()
test_modules.extend(ALWAYS_TEST_PIPELINE_MODULES)
# Get unique modules
test_modules = list(set(test_modules))
print(json.dumps(test_modules))
save_path = f"{PATH_TO_REPO}/reports"
os.makedirs(save_path, exist_ok=True)
with open(f"{save_path}/test-pipelines.json", "w") as f:
json.dump({"pipeline_test_modules": test_modules}, f)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment