# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import cProfile
import pstats

from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser

# A very long prompt, total number of tokens is about 15k.
# Built by repeating one sentence 1000 times, separated by single spaces.
LONG_PROMPT = " ".join(
    ["You are an expert in large language models, aren't you?"] * 1000
)


def main(args):
    """Profile generation on a long prompt and report how much of the total
    runtime is spent in the prefix-caching block-hashing function.

    Args:
        args: Parsed CLI namespace with ``model``, ``tensor_parallel_size``
            and ``output_len`` attributes.
    """
    llm = LLM(
        model=args.model,
        enforce_eager=True,
        # Prefix caching is forced on: its hashing cost is what we measure.
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
    )

    # Greedy decoding keeps the runs deterministic across iterations.
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
    profiler = cProfile.Profile()

    print("------warm up------")
    for _ in range(3):
        output = llm.generate(LONG_PROMPT, sampling_params)
        print(output[0].outputs[0].text)

    print("------start generating------")
    for _ in range(3):
        profiler.runctx(
            "llm.generate(LONG_PROMPT, sampling_params)", globals(), locals()
        )

    # Analyze the runtime of the hashing function.
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")
    total_time = 0
    total_calls = 0
    for func in stats.stats:
        # A stats key is a (filename, lineno, function_name) tuple.
        if "hash_of_block" in func[2]:
            # Accumulate instead of overwriting: several entries may match
            # (the original kept only the last match).
            total_time += stats.stats[func][3]  # cumulative time (ct)
            total_calls += stats.stats[func][0]  # call count (cc)
    percentage = (total_time / stats.total_tt) * 100
    print(
        f"Hashing took {total_time:.2f} seconds ({total_calls} calls), "
        f"{percentage:.2f}% of the total runtime."
    )
zhuwenwen's avatar
zhuwenwen committed
50
51
52


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        # Trailing space added: implicit concatenation previously produced
        # "...hashing function inautomatic prefix caching."
        description="Benchmark the performance of hashing function in "
        "automatic prefix caching."
    )
    parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k")
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--output-len", type=int, default=10)
    # NOTE(review): this flag is currently unused -- main() always enables
    # prefix caching. Kept for CLI backward compatibility; confirm whether
    # it should be wired into main().
    parser.add_argument(
        "--enable-prefix-caching", action="store_true", help="enable prefix caching"
    )
    args = parser.parse_args()
    main(args)