import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

// Target host and model are injected via environment variables, e.g.
//   k6 run -e HOST=127.0.0.1:8080 -e MODEL_ID=my-model common.js
const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;

// Per-request latency divided by generated-token count (ms/token).
const timePerToken = new Trend('time_per_token', true);

// Cumulative token counters for the whole run.
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');

// Upper bound on generated tokens per request (`max_tokens` in the payload).
const max_new_tokens = 50;

// Prompt dataset; swap in the full ShareGPT dump for longer runs.
// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))
/**
 * Build the k6 options object for this test.
 *
 * @returns {Object} Options with a zero-HTTP-failure threshold and a
 *   single constant-arrival-rate scenario (1 request/s for 60s).
 */
export function get_options() {
    return {
        thresholds: {
            // Any failed HTTP request fails the whole run.
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            single_user: {
                executor: 'constant-arrival-rate',
                duration: '60s',
                preAllocatedVUs: 1,
                rate: 1,
                timeUnit: '1s',
            },
            // Alternative scenarios — uncomment one to switch profiles.
            // load_test: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 100,
            //     rate: 1,
            //     timeUnit: '1s',
            // },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
            //     preAllocatedVUs: 1000,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            // throughput: {
            //     executor: 'shared-iterations',
            //     vus: 100,
            //     iterations: 200,
            //     maxDuration: '40s',
            // },
        },
    };
}

/**
 * Build an OpenAI-compatible /v1/chat/completions request body from one
 * ShareGPT record.
 *
 * @param {Object} gpt - ShareGPT record; the first turn of
 *   `gpt.conversations` is used as the user prompt.
 * @param {number} max_new_tokens - Cap on generated tokens (`max_tokens`).
 * @returns {Object} Chat-completions payload with temperature 0 for
 *   deterministic generation.
 */
function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();
/**
 * One k6 iteration: POST a ShareGPT prompt to the chat-completions
 * endpoint and record latency and token-count metrics on success.
 */
export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    // Cycle deterministically through the dataset across iterations.
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    // Client errors (4xx) carry no usable usage data; skip metric updates.
    if (res.status >= 400 && res.status < 500) {
        return;
    }

    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        // NOTE(review): assumes the server always returns a `usage` object
        // on 200 responses — confirm against the backend's API.
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}