4bit quantization for arbitrary `nn.Parameter` (#1720)
* Add parametrize util for targeting parameters outside of nn.Linear modules (a sketch of the approach follows below)
* Parametrize 4bit: replace existing prequantized weight
* Cleanup
* Add caching for parametrization
* Add tests
* Fix tests
* Guard for torch < 2.5
* Guard for torch < 2.5
* Another test guard for torch >= 2.5
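The underlying mechanism is PyTorch's `torch.nn.utils.parametrize`, which lets a module keep only the packed 4-bit payload while the full-precision tensor is reconstructed whenever the parameter is read. The sketch below illustrates that idea using only public APIs (`register_parametrization`, `bitsandbytes.functional.quantize_4bit` / `dequantize_4bit`); the `Dequantize4bit` class and the `replace_parameter_4bit` helper are illustrative assumptions, not the exact utilities added by this PR.

```python
# Minimal sketch (assumed names), requires a CUDA build of bitsandbytes:
# keep the packed 4-bit payload and dequantize on every parameter access.
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize
import bitsandbytes.functional as bnbF


class Dequantize4bit(nn.Module):
    """Parametrization that reconstructs the full tensor from 4-bit storage."""

    def __init__(self, packed: torch.Tensor, quant_state):
        super().__init__()
        # Packed uint8 payload lives as a buffer; quant_state holds absmax, blocksize, etc.
        self.register_buffer("packed", packed)
        self.quant_state = quant_state

    def forward(self, _original: torch.Tensor) -> torch.Tensor:
        # Invoked whenever module.<name> is read; the stored original is ignored.
        return bnbF.dequantize_4bit(self.packed, self.quant_state)


def replace_parameter_4bit(module: nn.Module, name: str, quant_type: str = "nf4"):
    """Hypothetical helper: quantize an arbitrary module.<name> to 4-bit in place."""
    param = getattr(module, name)
    packed, quant_state = bnbF.quantize_4bit(param.data, quant_type=quant_type)
    # unsafe=True skips the registration-time consistency check, since the
    # full-precision original is discarded right below.
    parametrize.register_parametrization(
        module, name, Dequantize4bit(packed, quant_state), unsafe=True
    )
    # Drop the full-precision copy so only the 4-bit payload remains;
    # the parametrization above never reads the stored original.
    module.parametrizations[name].original = nn.Parameter(
        torch.empty(0, device=param.device), requires_grad=False
    )
```

Repeated reads of the parametrized attribute can be wrapped in `with parametrize.cached():` so dequantization runs once per region rather than on every access; this is presumably what "Add caching for parametrization" refers to, though that is an assumption.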
New file: tests/test_parametrize.py
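The contents of the new test file are not shown here. The following is a rough, hedged sketch of the kind of check such a file might contain, reusing the illustrative `replace_parameter_4bit` helper from the sketch above and mirroring the torch >= 2.5 guard mentioned in the commits; it is not the actual test code.

```python
# Illustrative only: not the actual contents of tests/test_parametrize.py.
# Assumes the replace_parameter_4bit helper sketched above is in scope.
import pytest
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize
from packaging import version


@pytest.mark.skipif(
    version.parse(torch.__version__).release < (2, 5),
    reason="parametrized 4-bit path is guarded to torch >= 2.5",
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
def test_parametrize_4bit_roundtrip():
    class TinyModel(nn.Module):
        def __init__(self):
            super().__init__()
            # An arbitrary parameter that does not belong to an nn.Linear.
            self.scale = nn.Parameter(torch.randn(256, 256, device="cuda"))

    m = TinyModel()
    reference = m.scale.detach().clone()
    replace_parameter_4bit(m, "scale")  # illustrative helper from the sketch above

    assert parametrize.is_parametrized(m, "scale")
    assert m.scale.shape == reference.shape
    # NF4 is lossy: only check that the reconstruction is close in a relative sense.
    rel_err = (m.scale - reference).norm() / reference.norm()
    assert rel_err < 0.2

    # cached() avoids re-running dequantization on repeated access.
    with parametrize.cached():
        _ = m.scale + m.scale
```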