eval.py 4.15 KB
Newer Older
wanglch's avatar
wanglch committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os.path
from typing import Type

import gradio as gr

from swift.ui.base import BaseUI


class Eval(BaseUI):
    """Evaluation-settings panel built from Gradio components.

    The class carries two pieces of static configuration (``group`` and
    ``locale_dict``) and one builder, :meth:`do_build_ui`, which lays the
    components out in three rows.
    """

    group = 'llm_eval'

    # Bilingual ('zh' / 'en') label and optional info text for each component.
    # Every top-level key equals an ``elem_id`` used in ``do_build_ui``.
    # NOTE(review): presumably consumed by BaseUI to localize the components;
    # confirm in swift.ui.base.
    locale_dict = {
        'name': {
            'label': {'zh': '评测名称', 'en': 'Evaluation name'},
            'info': {
                'zh': '支持英文字母、下划线、横线和数字',
                'en': 'Support characters, underscores, hyphens and numbers',
            },
        },
        'eval_dataset': {
            'label': {'zh': '评测数据集', 'en': 'Evaluation dataset'},
            'info': {
                'zh': '选择评测数据集,支持多选',
                'en': 'Select eval dataset, multiple datasets supported',
            },
        },
        'eval_few_shot': {
            'label': {'zh': 'prompt的few-shot', 'en': 'The few-shot for the prompt'},
            'info': {
                'zh': 'Few-shot数量在评测集中有默认设置,可以不填',
                'en': 'Few-shot numbers have default values in different datasets',
            },
        },
        'eval_limit': {
            'label': {'zh': '评测数据个数', 'en': 'Eval numbers for each dataset'},
            'info': {
                'zh': '每个评测集的取样数',
                'en': 'Number of rows sampled from each dataset',
            },
        },
        'eval_use_cache': {
            'label': {'zh': '使用缓存', 'en': 'Use eval cache'},
            'info': {
                'zh': '如果name指定的评测已经存在,则可以使用已有缓存',
                'en': 'If the evaluation results of the name exists, you may use cache.',
            },
        },
        'custom_eval_config': {
            'label': {'zh': '自定义数据集评测配置', 'en': 'Custom eval config'},
            'info': {
                'zh': '可以使用该配置评测自己的数据集,详见github文档的评测部分',
                'en': 'Use this config to eval your own datasets, check the docs in github for details',
            },
        },
        'eval_url': {
            'label': {'zh': '评测链接', 'en': 'The eval url'},
            'info': {
                'zh': 'OpenAI样式的评测链接(如:http://localhost:8080/v1),用于评测接口(模型类型输入为实际模型类型)',
                'en': ('The OpenAI style link(like: http://localhost:8080/v1) for '
                       'evaluation(Input actual model type into model_type)'),
            },
        },
        # Label only: no info text is declared for this entry.
        'eval_token': {
            'label': {'zh': 'Url token', 'en': 'The url token'},
        },
        'eval_is_chat_model': {
            'label': {'zh': '接口是chat模型', 'en': 'Chat model'},
            'info': {
                'zh': '评测接口是否是Chat模型',
                'en': 'The eval url is a chat model or not',
            },
        },
        # Label only: no info text is declared for this entry.
        'infer_backend': {
            'label': {'zh': '推理框架', 'en': 'Infer backend'},
        },
    }

    @classmethod
    def do_build_ui(cls, base_tab: Type['BaseUI']):
        """Create the evaluation components inside three ``gr.Row`` containers.

        Args:
            base_tab: The enclosing UI class. It is not referenced by this
                implementation.

        Returns:
            None. The components are constructed for their side effect only.
        """
        width = 20  # every component in this panel is created with scale=20
        benchmark_choices = ['ceval', 'gsm8k', 'arc']

        # Row 1: run name, dataset picker, sampling options, cache switch, backend.
        with gr.Row():
            gr.Textbox(elem_id='name', scale=width)
            # NOTE(review): ``is_list`` is not a stock gradio argument; presumably
            # handled by the swift component wrappers - confirm in swift.ui.base.
            gr.Dropdown(
                elem_id='eval_dataset',
                is_list=True,
                choices=benchmark_choices,
                multiselect=True,
                scale=width,
            )
            for text_field in ('eval_few_shot', 'eval_limit'):
                gr.Textbox(elem_id=text_field, scale=width)
            gr.Checkbox(elem_id='eval_use_cache', scale=width)
            gr.Dropdown(elem_id='infer_backend', scale=width)

        # Row 2: custom evaluation config, alone on its row.
        with gr.Row():
            gr.Textbox(elem_id='custom_eval_config', scale=width)

        # Row 3: settings for evaluating through a URL endpoint.
        with gr.Row():
            for text_field in ('eval_url', 'eval_token'):
                gr.Textbox(elem_id=text_field, scale=width)
            gr.Checkbox(elem_id='eval_is_chat_model', scale=width)