Use xla flag to improve the quantized model performance (#19303)

Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>

Use xla flag to improve the quantized model performance (#19303)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
9af6d22e · XiongfeiWei · GitHub · 4589b940 · 9af6d22e
Unverified commit 9af6d22e, authored Jun 09, 2025 by XiongfeiWei and committed by GitHub on Jun 10, 2025.
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

vllm/v1/worker/tpu_worker.py vllm/v1/worker/tpu_worker.py +4 -1

No files found.
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -101,7 +101,10 @@ class TPUWorker:
        # fix this. It will be removed after the bug in XLA compiler is fixed.
        os.environ["LIBTPU_INIT_ARGS"] = (
            os.environ.get("LIBTPU_INIT_ARGS", "") +
-            " --xla_tpu_force_1d_allreduce_at_chunk_count=1")
+            " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
+            " --xla_jf_conv_input_fusion=False")
+        # --xla_jf_conv_input_fusion=False improves the performance of
+        # quantized matmul.
        torch.set_grad_enabled(False)
        torch.set_default_dtype(self.model_config.dtype)