Unverified Commit c360ac11 authored by Kung Talon, committed by GitHub

chore: add tests for flux speed (#564)

* add test speed under tests

* add tests on flux speed

* add requirement

* add data for 5090

* precommit

* remove comments

* fix precommit

* reorder

* use private name for expected
parent 209f696f
@@ -5,3 +5,4 @@ facexlib
onnxruntime
# ip-adapter
timm
diffusers>=0.33.1
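The requirements change above adds a diffusers>=0.33.1 lower bound alongside the new test. A quick sanity check before running the benchmark might look like the following minimal sketch (not part of the commit; it assumes the packaging helper package is installed):

# Minimal sketch (not part of this commit): verify the installed diffusers
# version satisfies the new ">=0.33.1" requirement before benchmarking.
# Assumes the "packaging" package is available in the environment.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("diffusers")) >= Version("0.33.1"), "diffusers is too old for this test"

The new speed test, added under tests/, follows.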
import logging
import time

import pytest
import torch
from diffusers import FluxPipeline

from nunchaku import NunchakuFluxTransformer2dModel, NunchakuT5EncoderModel
from nunchaku.utils import get_precision

_LOGGER = logging.getLogger(__name__)

# Expected average per-image latency in seconds for each supported GPU.
_EXPECTED_LATENCIES = {
    "NVIDIA GeForce RTX 3090": 16.05321,
    "NVIDIA GeForce RTX 4090": 6.49650,
    "NVIDIA GeForce RTX 5090": 4.79388,
}

# Resolve the device name once; fall back to an empty string on machines without
# CUDA so that test collection does not fail before the skip condition is checked.
_DEVICE_NAME = torch.cuda.get_device_name(0) if torch.cuda.is_available() else ""

@pytest.mark.skipif(
    _DEVICE_NAME not in _EXPECTED_LATENCIES, reason="Skip test: no expected latency recorded for this GPU"
)
@pytest.mark.parametrize(
    "warmup_times,test_times,num_inference_steps,guidance_scale,use_qencoder,expected_latency",
    [
        # Fall back to +inf so that parametrization does not raise KeyError on GPUs
        # that are skipped anyway (or when no GPU is present at collection time).
        (2, 5, 30, 3.5, True, _EXPECTED_LATENCIES.get(_DEVICE_NAME, float("inf"))),
    ],
)
def test_flux_speed(
    warmup_times: int,
    test_times: int,
    num_inference_steps: int,
    guidance_scale: float,
    use_qencoder: bool,
    expected_latency: float,
):
    """Benchmark FLUX.1-schnell with the Nunchaku transformer and assert that the
    average latency stays within 10% of the recorded per-GPU baseline."""
    precision = get_precision()  # "int4" or "fp4", depending on the GPU
    # Use the quantized Nunchaku transformer instead of the stock FLUX transformer.
    pipeline_init_kwargs = {
        "transformer": NunchakuFluxTransformer2dModel.from_pretrained(
            f"mit-han-lab/nunchaku-flux.1-schnell/svdq-{precision}_r32-flux.1-schnell.safetensors", offload=False
        )
    }
    if use_qencoder:
        # Optionally swap in the AWQ INT4-quantized T5 text encoder as well.
        text_encoder_2 = NunchakuT5EncoderModel.from_pretrained(
            "mit-han-lab/nunchaku-t5/awq-int4-flux.1-t5xxl.safetensors"
        )
        pipeline_init_kwargs["text_encoder_2"] = text_encoder_2
    pipeline = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16, **pipeline_init_kwargs
    )
    pipeline = pipeline.to("cuda")
    latency_list = []
    dummy_prompt = "A cat holding a sign that says hello world"

    # Warm-up runs are excluded from timing.
    for _ in range(warmup_times):
        pipeline(prompt=dummy_prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale)
        torch.cuda.synchronize()
    # Timed runs: synchronize after each call so the wall-clock time includes all
    # queued GPU work.
    for _ in range(test_times):
        start_time = time.time()
        pipeline(prompt=dummy_prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale)
        torch.cuda.synchronize()
        end_time = time.time()
        latency_list.append(end_time - start_time)

    average_latency = sum(latency_list) / len(latency_list)
    _LOGGER.debug(f"Latency: {average_latency:.5f} s")
    # Allow a 10% margin over the recorded baseline before failing.
    assert average_latency < expected_latency * 1.1
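To run only this benchmark locally, a small runner along the following lines should work; this is a sketch, and the test file path under tests/ is an assumption to be adjusted to the repository layout. The --log-cli-level=DEBUG flag surfaces the latency value the test logs.

# Hypothetical runner sketch (not part of the commit): invoke only the flux speed
# test with verbose output and live debug logging. The test file path is assumed.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-v", "--log-cli-level=DEBUG", "tests/test_flux_speed.py"]))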