Fix the problem of poor second-inference performance in Triton

510401e2 · zhuwenwen · 1428c17d · 510401e2
Commit 510401e2 authored Oct 30, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 0 deletions

benchmarks/benchmark_prefix_caching.py benchmarks/benchmark_prefix_caching.py +3 -0

No files found.
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -35,6 +35,8 @@ from transformers import PreTrainedTokenizerBase
 from vllm import LLM, SamplingParams
 from vllm.utils import FlexibleArgumentParser

+from triton.common.backend import compute_core_version_key
+
 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
@@ -44,6 +46,7 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in mar


 def test_prefix(llm=None, sampling_params=None, prompts=None):
+    version_key = compute_core_version_key()
    start_time = time.time()

    llm.generate(prompts, sampling_params=sampling_params)