Unverified Commit d243112b authored by Severin Simmler, committed by GitHub

Fix breaking change in `onnxruntime` for ONNX quantization (#18336)

* Fix quantization

* Save model

* Remove unused comments

* Fix formatting
parent 5987c637
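
Context for the fix: newer onnxruntime releases no longer ship the legacy `onnxruntime.quantization.quantize` helper that this script previously called, so the script now drives `ONNXQuantizer` directly (see the diff below). As a rough sketch only, and not part of this commit: code that must run on both old and new onnxruntime could guard the import, falling back to the legacy helper. The `quantize_dynamic` fallback and the assumption that its import fails on old onnxruntime versions are mine, not the author's.

# Sketch only (not part of this commit): support both the legacy and the current
# onnxruntime dynamic-quantization APIs. The exact release that dropped the legacy
# quantize() helper is an assumption here.
from pathlib import Path


def dynamic_quantize(model_path: Path, output_path: Path) -> Path:
    try:
        # Newer onnxruntime: high-level, path-to-path dynamic quantization
        from onnxruntime.quantization import quantize_dynamic

        quantize_dynamic(model_path.as_posix(), output_path.as_posix())
    except ImportError:
        # Older onnxruntime: legacy quantize() operating on an in-memory ModelProto
        import onnx
        from onnxruntime.quantization import QuantizationMode, quantize

        onnx_model = onnx.load(model_path.as_posix())
        quantized = quantize(model=onnx_model, quantization_mode=QuantizationMode.IntegerOps)
        onnx.save_model(quantized, output_path.as_posix())
    return output_path
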
@@ -435,29 +435,48 @@ def quantize(onnx_model_path: Path) -> Path:
     Returns: The Path generated for the quantized
     """
     import onnx
-    from onnxruntime.quantization import QuantizationMode, quantize
+    from onnx.onnx_pb import ModelProto
+    from onnxruntime.quantization import QuantizationMode
+    from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
+    from onnxruntime.quantization.registry import IntegerOpsRegistry
 
+    # Load the ONNX model
     onnx_model = onnx.load(onnx_model_path.as_posix())
 
-    # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime
     if parse(onnx.__version__) < parse("1.5.0"):
         print(
-            "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
-            "This limitation will be removed in the next release of onnxruntime."
+            "Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
+            "Please upgrade to onnxruntime >= 1.5.0."
        )
 
-    quantized_model = quantize(
-        model=onnx_model,
-        quantization_mode=QuantizationMode.IntegerOps,
-        force_fusions=True,
-        symmetric_weight=True,
+    # Copy it
+    copy_model = ModelProto()
+    copy_model.CopyFrom(onnx_model)
+
+    # Construct quantizer
+    quantizer = ONNXQuantizer(
+        model=copy_model,
+        per_channel=False,
+        reduce_range=False,
+        mode=QuantizationMode.IntegerOps,
+        static=False,
+        weight_qType=True,
+        input_qType=False,
+        tensors_range=None,
+        nodes_to_quantize=None,
+        nodes_to_exclude=None,
+        op_types_to_quantize=list(IntegerOpsRegistry),
     )
 
+    # Quantize and export
+    quantizer.quantize_model()
+
     # Append "-quantized" at the end of the model's name
     quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")
 
     # Save model
     print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}")
-    onnx.save_model(quantized_model, quantized_model_path.as_posix())
+    onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())
 
     return quantized_model_path
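
For reference, a minimal usage sketch of the patched helper. The model path below is hypothetical and assumes the ONNX graph has already been exported (e.g. with `convert` from the same module):

from pathlib import Path

from transformers.convert_graph_to_onnx import quantize

# Hypothetical export location; quantize() writes "<stem>-quantized.onnx" next to it
# and returns that path.
quantized_path = quantize(Path("onnx/bert-base-cased.onnx"))
print(quantized_path)  # e.g. onnx/bert-base-cased-quantized.onnx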