Commit 510401e2 authored by zhuwenwen
Browse files

Fix poor performance of the second inference run in Triton

parent 1428c17d
......@@ -35,6 +35,8 @@ from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
from triton.common.backend import compute_core_version_key
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
......@@ -44,6 +46,7 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in mar
def test_prefix(llm=None, sampling_params=None, prompts=None):
version_key = compute_core_version_key()
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment