# In the current scaling the scale is not yet known, so we initialize it to 1 to avoid division by zero. Once the scale has been computed, it can be set to the correct value.
# Weight with column-wise usage is needed for dgrad GEMM.
...
...
@@ -547,6 +560,19 @@ class _Linear(torch.autograd.Function):
# usage for only dgrad GEMM.
quantizer.set_usage(columnwise=False)
# Adjust the quantization directions (rowwise/columnwise usage)
# depending on whether the wgrad calculation will be performed.
# NOTE: If requires_dgrad is False, disabling `rowwise` quantization and keeping `columnwise` quantization
# results in `Assertion failed: output_tensor->has_data(). Quantizing in only the columnwise direction not supported yet!`
# NOTE: When `ctx.bias` is True, the selected quantize kernel errors with
# `cast_kernels.cuh:1322 in function fp8_quantize_arch_l_100: Not implemented scaling mode or fusion: NVTE_DELAYED_TENSOR_SCALING or IS_DBIAS=true on GPU with compute capability < 10.0.`