# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Unit tests for TurboQuant KV-cache quantization.

Run: .venv/bin/python -m pytest tests/quantization/test_turboquant.py -v
"""

import math

import pytest
import torch

from vllm.model_executor.layers.quantization.turboquant.config import (
    TQ_PRESETS,
    TurboQuantConfig,
)
from vllm.utils.math_utils import next_power_of_2

# ============================================================================
# Helpers
# ============================================================================

# Every preset name registered in TQ_PRESETS; used to parametrize the tests.
ALL_PRESETS = [*TQ_PRESETS.keys()]


def _assert_strictly_sorted(seq, name="sequence"):
    """Assert that every element of ``seq`` is strictly less than its successor.

    Args:
        seq: Indexable, sliceable sequence of comparable items.
        name: Label used as the prefix of the assertion failure message.
    """
    # Pair each element with the one after it; ``i`` is the index of the
    # left-hand element, which is what the failure message reports.
    for i, (lower, upper) in enumerate(zip(seq, seq[1:])):
        assert lower < upper, f"{name} not sorted at index {i}"


def _is_power_of_2(n: int) -> bool:
    """Return True when ``n`` is positive and ``next_power_of_2(n)`` is ``n``."""
    if n <= 0:
        return False
    return next_power_of_2(n) == n


# Expected concrete values for each preset at head_dim=128.
# fmt: off
PRESET_EXPECTED = {
    "turboquant_k8v4": {
        "key_fp8": True,
        "key_quant_bits": 8,
        "key_mse_bits": 0,
        "value_quant_bits": 4,
        "mse_bits": 4,
        "n_centroids": 16,
        "centroid_bits": 4,
        "norm_correction": False,
        "key_packed_size": 128,
        "value_packed_size": 68,
        "slot_size": 196,
        "slot_size_aligned": 196,
    },
    "turboquant_4bit_nc": {
        "key_fp8": False,
        "key_quant_bits": 4,
        "key_mse_bits": 4,
        "value_quant_bits": 4,
        "mse_bits": 4,
        "n_centroids": 16,
        "centroid_bits": 4,
        "norm_correction": True,
        "key_packed_size": 68,
        "value_packed_size": 68,
        "slot_size": 136,
        "slot_size_aligned": 136,
    },
    "turboquant_k3v4_nc": {
        "key_fp8": False,
        "key_quant_bits": 3,
        "key_mse_bits": 3,
        "value_quant_bits": 4,
        "mse_bits": 3,
        "n_centroids": 8,
        "centroid_bits": 3,
        "norm_correction": True,
        "key_packed_size": 52,
        "value_packed_size": 68,
        "slot_size": 120,
        "slot_size_aligned": 120,
    },
    "turboquant_3bit_nc": {
        "key_fp8": False,
        "key_quant_bits": 3,
        "key_mse_bits": 3,
        "value_quant_bits": 3,
        "mse_bits": 3,
        "n_centroids": 8,
        "centroid_bits": 3,
        "norm_correction": True,
        "key_packed_size": 52,
        "value_packed_size": 52,
        "slot_size": 104,
        "slot_size_aligned": 104,
    },
}
# fmt: on
# ============================================================================
# Config tests (CPU-only, no dependencies beyond config.py)
# ============================================================================


def _build_config(preset, head_dim=128):
    """Build a TurboQuantConfig for ``preset`` via ``from_cache_dtype``."""
    return TurboQuantConfig.from_cache_dtype(preset, head_dim=head_dim)


class TestTurboQuantConfig:
    """Checks on TurboQuantConfig presets, derived sizes and skip layers."""

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_preset_parses(self, preset):
        """Every registered preset yields a TurboQuantConfig instance."""
        cfg = _build_config(preset)
        assert isinstance(cfg, TurboQuantConfig)

    def test_invalid_preset_raises(self):
        """An unregistered preset name is rejected with a ValueError."""
        with pytest.raises(ValueError, match="Unknown TurboQuant"):
            TurboQuantConfig.from_cache_dtype("turboquant_invalid", head_dim=128)

    # ---- Per-preset concrete value checks (table-driven) ----

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_key_mode(self, preset):
        """Key quantization fields match the expectation table."""
        expected = PRESET_EXPECTED[preset]
        cfg = _build_config(preset)
        assert cfg.key_fp8 is expected["key_fp8"]
        assert cfg.key_quant_bits == expected["key_quant_bits"]
        assert cfg.key_mse_bits == expected["key_mse_bits"]

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_value_mode(self, preset):
        """Value quantization bit width matches the expectation table."""
        expected = PRESET_EXPECTED[preset]
        cfg = _build_config(preset)
        assert cfg.value_quant_bits == expected["value_quant_bits"]

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_bits_and_centroids(self, preset):
        """MSE bits and centroid fields match the expectation table."""
        expected = PRESET_EXPECTED[preset]
        cfg = _build_config(preset)
        assert cfg.mse_bits == expected["mse_bits"]
        assert cfg.n_centroids == expected["n_centroids"]
        assert cfg.centroid_bits == expected["centroid_bits"]

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_norm_correction(self, preset):
        """The norm_correction flag matches the expectation table."""
        wanted = PRESET_EXPECTED[preset]["norm_correction"]
        cfg = _build_config(preset)
        assert cfg.norm_correction is wanted

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_packed_sizes(self, preset):
        """Packed key/value sizes and slot sizes match the expectation table."""
        expected = PRESET_EXPECTED[preset]
        cfg = _build_config(preset)
        assert cfg.key_packed_size == expected["key_packed_size"]
        assert cfg.value_packed_size == expected["value_packed_size"]
        assert cfg.slot_size == expected["slot_size"]
        assert cfg.slot_size_aligned == expected["slot_size_aligned"]

    # ---- Cross-preset structural invariants ----

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_slot_equals_key_plus_value(self, preset):
        """slot_size is the sum of the packed key and value sizes."""
        cfg = _build_config(preset)
        combined = cfg.key_packed_size + cfg.value_packed_size
        assert cfg.slot_size == combined

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_padded_slot_is_even(self, preset):
        """slot_size_aligned is at least slot_size and is an even number."""
        cfg = _build_config(preset)
        assert cfg.slot_size_aligned >= cfg.slot_size
        assert cfg.slot_size_aligned % 2 == 0, (
            f"slot_size_aligned={cfg.slot_size_aligned} is not even"
        )

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_key_value_packed_sizes_positive(self, preset):
        """Both packed sizes are strictly positive."""
        cfg = _build_config(preset)
        assert cfg.key_packed_size > 0
        assert cfg.value_packed_size > 0

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_n_centroids_is_2_to_mse_bits(self, preset):
        """n_centroids equals two raised to mse_bits."""
        cfg = _build_config(preset)
        assert cfg.n_centroids == 2**cfg.mse_bits

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_centroid_bits_always_positive(self, preset):
        """centroid_bits is strictly positive for every preset."""
        cfg = _build_config(preset)
        assert cfg.centroid_bits > 0

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    def test_mse_key_or_fp8_exclusive(self, preset):
        """Each preset is either FP8 keys or MSE keys, never both."""
        cfg = _build_config(preset)
        if cfg.key_fp8:
            # FP8 keys: no MSE bits, and the key width is fixed at 8.
            assert cfg.key_mse_bits == 0
            assert cfg.key_quant_bits == 8
            return
        # MSE keys: a positive MSE bit count and a 3- or 4-bit key width.
        assert cfg.key_mse_bits > 0
        assert cfg.key_quant_bits in (3, 4)

    @pytest.mark.parametrize("preset", ALL_PRESETS)
    @pytest.mark.parametrize("head_dim", [64, 96, 128, 256])
    def test_all_presets_all_head_dims(self, preset, head_dim):
        """Structural invariants hold for each preset at several head dims."""
        cfg = _build_config(preset, head_dim=head_dim)
        assert cfg.head_dim == head_dim
        combined = cfg.key_packed_size + cfg.value_packed_size
        assert cfg.slot_size == combined
        assert cfg.slot_size_aligned >= cfg.slot_size
        assert cfg.slot_size_aligned % 2 == 0

    # ---- Boundary skip layers ----

    def test_boundary_skip_layers_basic(self):
        """With 32 layers and the default count, the result is 0, 1, 30, 31."""
        assert TurboQuantConfig.get_boundary_skip_layers(32) == [
            "0",
            "1",
            "30",
            "31",
        ]

    def test_boundary_skip_layers_zero(self):
        """A second argument of 0 yields an empty list."""
        layers = TurboQuantConfig.get_boundary_skip_layers(32, 0)
        assert layers == []

    def test_boundary_skip_layers_small_model(self):
        """With 4 layers and the default count, all four layers are listed."""
        assert TurboQuantConfig.get_boundary_skip_layers(4) == ["0", "1", "2", "3"]

    def test_boundary_skip_layers_cap_at_half(self):
        """Calling with (8, 10) returns a list of exactly 8 entries."""
        skipped = TurboQuantConfig.get_boundary_skip_layers(8, 10)
        assert len(skipped) == 8


# ============================================================================
# Centroids tests (CPU-only)
# ============================================================================

from vllm.model_executor.layers.quantization.turboquant.centroids import (
    get_centroids,
    solve_lloyd_max,
)


class TestCentroids:
    """Checks on the tensors returned by ``get_centroids``."""

    @pytest.mark.parametrize("bits,expected_n", [(2, 4), (3, 8), (4, 16)])
    def test_centroids_shape(self, bits, expected_n):
        """The centroid tensor is 1-D with ``expected_n`` entries."""
        assert get_centroids(128, bits).shape == (expected_n,)

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_centroids_sorted(self, bits):
        """Centroids are in strictly increasing order."""
        table = get_centroids(128, bits)
        _assert_strictly_sorted(table, "centroids")

    def test_centroids_cached(self):
        """Two calls with the same arguments return the very same object."""
        first = get_centroids(128, 3)
        second = get_centroids(128, 3)
        assert first is second, "get_centroids should return cached object"

    def test_centroids_different_dims_not_identical(self):
        """Centroids for dim 64 differ from those for dim 128."""
        small = get_centroids(64, 3)
        large = get_centroids(128, 3)
        assert not torch.equal(small, large)

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_centroids_symmetric_around_zero(self, bits):
        """N(0, 1/d) is symmetric, so centroids should be ~symmetric."""
        c = get_centroids(128, bits)
        mean_value = c.mean().item()
        assert abs(mean_value) < 0.01, "Centroids not centered near 0"
        # The outermost centroids should (nearly) cancel each other out.
        extremes_sum = c[0].item() + c[-1].item()
        assert abs(extremes_sum) < 0.01

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_centroids_within_4sigma(self, bits):
        """All centroids should be within ~4 sigma of N(0, 1/d)."""
        sigma = math.sqrt(1.0 / 128)
        limit = 4 * sigma
        for i, val in enumerate(get_centroids(128, bits)):
            assert abs(val.item()) < limit, (
                f"Centroid {i}={val:.6f} outside 4*sigma={4 * sigma:.6f}"
            )


class TestLloydMax:
    """Checks on the (centroids, boundaries) pair from ``solve_lloyd_max``."""

    @pytest.mark.parametrize("bits,expected_n", [(2, 4), (3, 8), (4, 16)])
    def test_solve_shapes(self, bits, expected_n):
        """There are ``expected_n`` centroids and one fewer boundary."""
        centroids, boundaries = solve_lloyd_max(128, bits)
        assert centroids.shape == (expected_n,)
        assert boundaries.shape == (expected_n - 1,)

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_centroids_sorted(self, bits):
        """Solved centroids are in strictly increasing order."""
        solved = solve_lloyd_max(128, bits)
        _assert_strictly_sorted(solved[0], "centroids")

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_boundaries_sorted(self, bits):
        """Solved boundaries are in strictly increasing order."""
        solved = solve_lloyd_max(128, bits)
        _assert_strictly_sorted(solved[1], "boundaries")

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_boundaries_between_centroids(self, bits):
        """Each boundary must lie between its adjacent centroids."""
        centroids, boundaries = solve_lloyd_max(128, bits)
        for i, boundary in enumerate(boundaries):
            assert centroids[i] < boundary < centroids[i + 1], (
                f"Boundary {i}={boundaries[i]:.6f} not between "
                f"c[{i}]={centroids[i]:.6f} and c[{i + 1}]={centroids[i + 1]:.6f}"
            )

    @pytest.mark.parametrize("bits", [2, 3, 4])
    def test_boundaries_are_midpoints(self, bits):
        """Lloyd-Max boundaries are midpoints of adjacent centroids."""
        centroids, boundaries = solve_lloyd_max(128, bits)
        for i, boundary in enumerate(boundaries):
            midpoint = (centroids[i] + centroids[i + 1]) / 2.0
            gap = boundary.item() - midpoint.item()
            assert abs(gap) < 1e-6

    def test_solve_deterministic(self):
        """Repeated solves with equal arguments give equal tensors."""
        first_c, first_b = solve_lloyd_max(128, 3)
        second_c, second_b = solve_lloyd_max(128, 3)
        assert torch.equal(first_c, second_c)
        assert torch.equal(first_b, second_b)

    def test_solve_dtype_float32(self):
        """Both returned tensors have dtype float32."""
        centroids, boundaries = solve_lloyd_max(128, 3)
        for tensor in (centroids, boundaries):
            assert tensor.dtype == torch.float32
# ============================================================================
# Rotation matrix tests (GPU required)
# ============================================================================

# Evaluated once at import time; gates the whole TestRotationMatrix class.
CUDA_AVAILABLE = torch.cuda.is_available()

from vllm.model_executor.layers.quantization.turboquant.quantizer import (
    generate_rotation_matrix,
)


@pytest.mark.skipif(not CUDA_AVAILABLE, reason="CUDA not available")
class TestRotationMatrix:
    """Checks on ``generate_rotation_matrix``; skipped when CUDA is absent."""

    @pytest.mark.parametrize("dim", [64, 96, 128, 256])
    def test_rotation_matrix_shape_and_orthogonal(self, dim):
        """The matrix is (dim, dim) and its product with its transpose is ~I."""
        rotation = generate_rotation_matrix(dim, seed=42, device="cuda")
        assert rotation.shape == (dim, dim)
        gram = rotation @ rotation.T
        identity = torch.eye(dim, device="cuda")
        assert torch.allclose(gram, identity, atol=1e-5), (
            f"Pi not orthogonal for dim={dim}"
        )

    def test_rotation_matrix_deterministic(self):
        """The same seed produces exactly equal matrices."""
        first = generate_rotation_matrix(128, seed=42)
        second = generate_rotation_matrix(128, seed=42)
        assert torch.equal(first, second)

    def test_rotation_matrix_different_seeds(self):
        """Different seeds produce matrices that are not equal."""
        seeded_42 = generate_rotation_matrix(128, seed=42)
        seeded_99 = generate_rotation_matrix(128, seed=99)
        assert not torch.equal(seeded_42, seeded_99)

    def test_rotation_matrix_det_is_pm1(self):
        """Orthogonal matrix determinant must be +1 or -1."""
        rotation = generate_rotation_matrix(128, seed=42, device="cuda")
        magnitude = abs(torch.linalg.det(rotation).item())
        assert abs(magnitude - 1.0) < 1e-4