Unverified commit 7505b3ca, authored by Fengzhe Zhou and committed by GitHub

[Feature] Add huggingface apply_chat_template (#1098)

* add TheoremQA with 5-shot

* add huggingface_above_v4_33 classes

* use num_worker partitioner in cli

* update theoremqa

* update TheoremQA

* add TheoremQA

* rename theoremqa -> TheoremQA

* update TheoremQA output path

* rewrite many model configs

* update huggingface

* further update

* refine configs

* update configs

* update configs

* add configs/eval_llama3_instruct.py

* add summarizer multi faceted

* update bbh datasets

* update configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py

* rename class

* update readme

* update hf above v4.33
parent 6c711cb2
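The gist of the change: HuggingFace model configs drop the hand-rolled `meta_template`/`tokenizer_kwargs` boilerplate in favor of two new classes, `HuggingFaceBaseModel` for base models and `HuggingFacewithChatTemplate` for chat models (the latter formats prompts with the tokenizer's `apply_chat_template`). A minimal sketch of the new style follows; the model paths, abbreviations, and GPU counts below are placeholders, not values from this commit.

```python
# Sketch of the new-style configs introduced by this PR.
# 'org/your-*-model' and the abbr values are placeholders.
from opencompass.models import HuggingFaceBaseModel, HuggingFacewithChatTemplate

models = [
    # Base model: plain-text prompts, no chat template applied.
    dict(
        type=HuggingFaceBaseModel,
        abbr='your-base-model-hf',
        path='org/your-base-model',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    ),
    # Chat model: prompts are rendered via tokenizer.apply_chat_template.
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='your-chat-model-hf',
        path='org/your-chat-model',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    ),
]
```

The per-model diffs below apply exactly this simplification.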
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFaceBaseModel
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFaceBaseModel,
         abbr='yi-34b-hf',
         path='01-ai/Yi-34B',
-        tokenizer_path='01-ai/Yi-34B',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=4, num_procs=1),
+        run_cfg=dict(num_gpus=2),
     )
 ]
from opencompass.models import HuggingFace
models = [
dict(
type=HuggingFace,
abbr='yi-34b-200k-hf',
path='01-ai/Yi-34B-200K',
tokenizer_path='01-ai/Yi-34B-200K',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
)
]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFacewithChatTemplate
-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
-        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
-    ],
-)
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFacewithChatTemplate,
         abbr='yi-34b-chat-hf',
         path='01-ai/Yi-34B-Chat',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        meta_template=_meta_template,
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=2, num_procs=1),
+        run_cfg=dict(num_gpus=2),
-        end_str='<|im_end|>',
-        batch_padding=True,
     )
 ]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFaceBaseModel
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFaceBaseModel,
         abbr='yi-6b-hf',
         path='01-ai/Yi-6B',
-        tokenizer_path='01-ai/Yi-6B',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=1),
     )
 ]
from opencompass.models import HuggingFace
models = [
dict(
type=HuggingFace,
abbr='yi-6b-200k-hf',
path='01-ai/Yi-6B-200K',
tokenizer_path='01-ai/Yi-6B-200K',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFacewithChatTemplate
-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
-        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
-    ],
-)
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFacewithChatTemplate,
         abbr='yi-6b-chat-hf',
         path='01-ai/Yi-6B-Chat',
-        tokenizer_path='01-ai/Yi-6B-Chat',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        meta_template=_meta_template,
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=1),
-        end_str='<|im_end|>',
-        batch_padding=True,
     )
 ]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFacewithChatTemplate
-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='<|user|>\n', end='</s>'),
-        dict(role="BOT", begin="<|assistant|>\n", end='</s>', generate=True),
-    ],
-)
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFacewithChatTemplate,
         abbr='zephyr-7b-beta-hf',
         path='HuggingFaceH4/zephyr-7b-beta',
-        tokenizer_path='HuggingFaceH4/zephyr-7b-beta',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        meta_template=_meta_template,
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=1),
-        end_str='</s>',
     )
 ]
from mmengine.config import read_base
with read_base():
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
from .groups.lcbench import lcbench_summary_groups
other_summary_groups = [
{
'name': 'average',
'subsets': [
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['average', 'naive_average'],
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'ceval',
'ceval-stem',
'ceval-social-science',
'ceval-humanities',
'ceval-other',
'ceval-hard',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
from mmengine.config import read_base
from opencompass.summarizers import MultiFacetedSummarizer
with read_base():
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
other_summary_groups = [
{
'name': 'average',
'subsets': [
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
],
},
]
overall_dataset_abbrs = [
['average', 'naive_average'],
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
]
mmlu_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_summary_groups}
mmlu_dataset_abbrs = [
['mmlu', 'naive_average'],
['mmlu-stem', 'naive_average'],
['mmlu-social-science', 'naive_average'],
['mmlu-humanities', 'naive_average'],
['mmlu-other', 'naive_average'],
*mmlu_summary_groups_dict['mmlu-stem'],
*mmlu_summary_groups_dict['mmlu-social-science'],
*mmlu_summary_groups_dict['mmlu-humanities'],
*mmlu_summary_groups_dict['mmlu-other'],
]
cmmlu_summary_groups_dict = {g['name']: g['subsets'] for g in cmmlu_summary_groups}
cmmlu_dataset_abbrs = [
['cmmlu', 'naive_average'],
['cmmlu-stem', 'naive_average'],
['cmmlu-social-science', 'naive_average'],
['cmmlu-humanities', 'naive_average'],
['cmmlu-other', 'naive_average'],
['cmmlu-china-specific', 'naive_average'],
*cmmlu_summary_groups_dict['cmmlu-stem'],
*cmmlu_summary_groups_dict['cmmlu-social-science'],
*cmmlu_summary_groups_dict['cmmlu-humanities'],
*cmmlu_summary_groups_dict['cmmlu-other'],
]
ceval_summary_groups_dict = {g['name']: g['subsets'] for g in ceval_summary_groups}
ceval_dataset_abbrs = [
['ceval', 'naive_average'],
['ceval-stem', 'naive_average'],
['ceval-social-science', 'naive_average'],
['ceval-humanities', 'naive_average'],
['ceval-other', 'naive_average'],
['ceval-hard', 'naive_average'],
*ceval_summary_groups_dict['ceval-stem'],
*ceval_summary_groups_dict['ceval-social-science'],
*ceval_summary_groups_dict['ceval-humanities'],
*ceval_summary_groups_dict['ceval-other'],
]
bbh_summary_groups_dict = {g['name']: g['subsets'] for g in bbh_summary_groups}
bbh_dataset_abbrs = [
['bbh', 'naive_average'],
*bbh_summary_groups_dict['bbh'],
]
GaokaoBench_summary_groups_dict = {g['name']: g['subsets'] for g in GaokaoBench_summary_groups}
GaokaoBench_dataset_abbrs = [
['GaokaoBench', 'weighted_average'],
*GaokaoBench_summary_groups_dict['GaokaoBench'],
]
sanitized_mbpp_dataset_abbrs = [
['sanitized_mbpp', 'score'],
['sanitized_mbpp', 'pass'],
['sanitized_mbpp', 'failed'],
['sanitized_mbpp', 'wrong_answer'],
['sanitized_mbpp', 'timeout'],
]
summarizer = dict(
type=MultiFacetedSummarizer,
dataset_abbrs_list=[
{'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs},
{'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs},
{'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs},
{'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs},
{'name': 'GaokaoBench', 'dataset_abbrs': GaokaoBench_dataset_abbrs},
{'name': 'sanitized_mbpp', 'dataset_abbrs': sanitized_mbpp_dataset_abbrs},
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
@@ -80,13 +80,8 @@ For HuggingFace models, users can set model parameters directly through the comm
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
---hf-path facebook/opt-125m \
---model-kwargs device_map='auto' \
---tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \
---max-seq-len 2048 \
---max-out-len 100 \
---batch-size 128 \
---num-gpus 1  # Number of minimum required GPUs
+--hf-type base \
+--hf-path facebook/opt-125m
 ```
 Note that in this way, OpenCompass only evaluates one model at a time, while other ways can evaluate multiple models at once.
@@ -99,12 +94,14 @@ Note that in this way, OpenCompass only evaluates one model at a time, while oth
 :animate: fade-in-slide-down
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
+--hf-type base \  # HuggingFace model type, base or chat
 --hf-path facebook/opt-125m \  # HuggingFace model path
 --tokenizer-path facebook/opt-125m \  # HuggingFace tokenizer path (if the same as the model path, can be omitted)
 --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \  # Arguments to construct the tokenizer
 --model-kwargs device_map='auto' \  # Arguments to construct the model
 --max-seq-len 2048 \  # Maximum sequence length the model can accept
 --max-out-len 100 \  # Maximum number of tokens to generate
+--min-out-len 100 \  # Minimum number of tokens to generate
 --batch-size 64 \  # Batch size
 --num-gpus 1  # Number of GPUs required to run the model
 ```
@@ -146,28 +143,22 @@ python run.py configs/eval_demo.py
 OpenCompass provides a series of pre-defined model configurations under `configs/models`. Below is the configuration snippet related to [opt-350m](https://github.com/open-compass/opencompass/blob/main/configs/models/opt/hf_opt_350m.py) (`configs/models/opt/hf_opt_350m.py`):
 ```python
-# Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceCausalLM`
-from opencompass.models import HuggingFaceCausalLM
-# OPT-350M
-opt350m = dict(
-    type=HuggingFaceCausalLM,
-    # Initialization parameters for `HuggingFaceCausalLM`
-    path='facebook/opt-350m',
-    tokenizer_path='facebook/opt-350m',
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        proxies=None,
-        trust_remote_code=True),
-    model_kwargs=dict(device_map='auto'),
-    # Below are common parameters for all models, not specific to HuggingFaceCausalLM
-    abbr='opt350m',               # Model abbreviation for result display
-    max_seq_len=2048,             # The maximum length of the entire sequence
-    max_out_len=100,              # Maximum number of generated tokens
-    batch_size=64,                # batchsize
-    run_cfg=dict(num_gpus=1),     # The required GPU numbers for this model
+# Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceBaseModel`
+from opencompass.models import HuggingFaceBaseModel
+models = [
+    # OPT-350M
+    dict(
+        type=HuggingFaceBaseModel,
+        # Initialization parameters for `HuggingFaceBaseModel`
+        path='facebook/opt-350m',
+        # Below are common parameters for all models, not specific to HuggingFaceBaseModel
+        abbr='opt-350m-hf',          # Model abbreviation
+        max_out_len=1024,            # Maximum number of generated tokens
+        batch_size=32,               # Batch size
+        run_cfg=dict(num_gpus=1),    # The required GPU numbers for this model
     )
+]
 ```
 When using configurations, we can specify the relevant files through the command-line argument `--models` or import the model configurations into the `models` list in the configuration file using the inheritance mechanism.
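As a quick illustration of that inheritance mechanism, a top-level config can pull the snippet above in via `read_base()`. This is a sketch: the dataset import path is illustrative, and only the opt-350m model import corresponds to the file shown here.

```python
from mmengine.config import read_base

with read_base():
    # Dataset import path is illustrative; adjust to the datasets you need.
    from .datasets.siqa.siqa_gen import siqa_datasets
    # Reuse the pre-defined opt-350m model config shown above.
    from .models.opt.hf_opt_350m import models as hf_opt_350m_models

datasets = [*siqa_datasets]
models = [*hf_opt_350m_models]
```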
......
@@ -79,13 +79,8 @@ python tools/list_configs.py llama mmlu
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
---hf-path facebook/opt-125m \
---model-kwargs device_map='auto' \
---tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \
---max-seq-len 2048 \
---max-out-len 100 \
---batch-size 128 \
---num-gpus 1  # Minimum number of required GPUs
+--hf-type base \
+--hf-path facebook/opt-125m
 ```
 Note that in this way, OpenCompass only evaluates one model at a time, while the other approaches can evaluate multiple models at once.
@@ -100,12 +95,14 @@ python run.py --datasets siqa_gen winograd_ppl \
 :animate: fade-in-slide-down
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
+--hf-type base \  # HuggingFace model type, base or chat
 --hf-path facebook/opt-125m \  # HuggingFace model path
 --tokenizer-path facebook/opt-125m \  # HuggingFace tokenizer path (can be omitted if it is the same as the model path)
 --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \  # Arguments for constructing the tokenizer
 --model-kwargs device_map='auto' \  # Arguments for constructing the model
 --max-seq-len 2048 \  # Maximum sequence length the model can accept
 --max-out-len 100 \  # Maximum number of tokens to generate
+--min-out-len 100 \  # Minimum number of tokens to generate
 --batch-size 64 \  # Batch size
 --num-gpus 1  # Number of GPUs required to run the model
 ```
@@ -147,28 +144,22 @@ python run.py configs/eval_demo.py
 OpenCompass provides a series of pre-defined model configurations under `configs/models`. Below is the configuration snippet related to [opt-350m](https://github.com/open-compass/opencompass/blob/main/configs/models/opt/hf_opt_350m.py) (`configs/models/opt/hf_opt_350m.py`):
 ```python
-# Use `HuggingFaceCausalLM` to evaluate models supported by HuggingFace's `AutoModelForCausalLM`
-from opencompass.models import HuggingFaceCausalLM
-# OPT-350M
-opt350m = dict(
-    type=HuggingFaceCausalLM,
-    # Initialization parameters for `HuggingFaceCausalLM`
-    path='facebook/opt-350m',
-    tokenizer_path='facebook/opt-350m',
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        proxies=None,
-        trust_remote_code=True),
-    model_kwargs=dict(device_map='auto'),
-    # Below are common parameters for all models, not specific to HuggingFaceCausalLM
-    abbr='opt350m',               # Model abbreviation shown in results
-    max_seq_len=2048,             # Maximum length of the entire sequence
-    max_out_len=100,              # Maximum number of tokens to generate
-    batch_size=64,                # Batch size
-    run_cfg=dict(num_gpus=1),     # Number of GPUs required by this model
+# Use `HuggingFaceBaseModel` to evaluate models supported by HuggingFace's `AutoModelForCausalLM`
+from opencompass.models import HuggingFaceBaseModel
+models = [
+    # OPT-350M
+    dict(
+        type=HuggingFaceBaseModel,
+        # Initialization parameters for `HuggingFaceBaseModel`
+        path='facebook/opt-350m',
+        # Below are common parameters for all models, not specific to HuggingFaceBaseModel
+        abbr='opt-350m-hf',          # Model abbreviation
+        max_out_len=1024,            # Maximum number of tokens to generate
+        batch_size=32,               # Batch size
+        run_cfg=dict(num_gpus=1),    # Number of GPUs required by this model
     )
+]
 ```
 When using configurations, we can specify the relevant files via the command-line argument `--models`, or use the inheritance mechanism to import the model configurations into the `models` list of the configuration file.
......
+# flake8: noqa
+# yapf: disable
 import argparse
 import getpass
 import os
@@ -51,7 +53,7 @@ def parse_args():
                         action='store_true',
                         default=False)
     parser.add_argument(
-        '--accelerator',
+        '-a', '--accelerator',
         help='Infer accelerator, support vllm and lmdeploy now.',
         choices=['vllm', 'lmdeploy', 'hf'],
         default='hf',
@@ -81,7 +83,7 @@ def parse_args():
                         'saved in this path, including the slurm logs, '
                         'the evaluation results, the summary results, etc.'
                         'If not specified, the work_dir will be set to '
-                        './outputs/default.',
+                        'outputs/default.',
                         default=None,
                         type=str)
     parser.add_argument(
@@ -95,23 +97,12 @@ def parse_args():
                         help='Report the running status to lark bot',
                         action='store_true',
                         default=False)
-    parser.add_argument('--max-partition-size',
-                        help='The maximum size of an infer task. Only '
-                        'effective when "infer" is missing from the config.',
-                        type=int,
-                        default=40000),
-    parser.add_argument(
-        '--gen-task-coef',
-        help='The dataset cost measurement coefficient for generation tasks, '
-        'Only effective when "infer" is missing from the config.',
-        type=int,
-        default=20)
     parser.add_argument('--max-num-workers',
                         help='Max number of workers to run in parallel. '
                         'Will be overrideen by the "max_num_workers" argument '
                         'in the config.',
                         type=int,
-                        default=32)
+                        default=1)
     parser.add_argument('--max-workers-per-gpu',
                         help='Max task to run in parallel on one GPU. '
                         'It will only be used in the local runner.',
@@ -181,25 +172,21 @@ def parse_dlc_args(dlc_parser):
 def parse_hf_args(hf_parser):
     """These args are all for the quick construction of HuggingFace models."""
-    hf_parser.add_argument('--hf-path', type=str)
-    hf_parser.add_argument('--peft-path', type=str)
-    hf_parser.add_argument('--tokenizer-path', type=str)
-    hf_parser.add_argument('--model-kwargs',
-                           nargs='+',
-                           action=DictAction,
-                           default={})
-    hf_parser.add_argument('--tokenizer-kwargs',
-                           nargs='+',
-                           action=DictAction,
-                           default={})
-    hf_parser.add_argument('--max-out-len', type=int)
-    hf_parser.add_argument('--max-seq-len', type=int)
-    hf_parser.add_argument('--no-batch-padding',
-                           action='store_true',
-                           default=False)
-    hf_parser.add_argument('--batch-size', type=int)
-    hf_parser.add_argument('--num-gpus', type=int)
-    hf_parser.add_argument('--pad-token-id', type=int)
+    hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
+    hf_parser.add_argument('--hf-path', type=str, help='The path to the HuggingFace model, e.g. "facebook/opt-125m", required')
+    hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the HuggingFace model')
+    hf_parser.add_argument('--tokenizer-path', type=str, help='The path to the HuggingFace tokenizer, same as --hf-path if not specified')
+    hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the tokenizer')
+    hf_parser.add_argument('--peft-path', type=str, help='The path to the PEFT model')
+    hf_parser.add_argument('--peft-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the PEFT model')
+    hf_parser.add_argument('--generation-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the generation')
+    hf_parser.add_argument('--max-seq-len', type=int, help='The max sequence length for the HuggingFace model')
+    hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model')
+    hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model')
+    hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model')
+    hf_parser.add_argument('--num-gpus', type=int, default=1, help='The number of GPUs for **the HuggingFace model passed via cli**')
+    hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model')
+    hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model')
 def parse_custom_dataset_args(custom_dataset_parser):
@@ -225,7 +212,7 @@ def main():
     if args.work_dir is not None:
         cfg['work_dir'] = args.work_dir
     else:
-        cfg.setdefault('work_dir', './outputs/default/')
+        cfg.setdefault('work_dir', osp.join('outputs', 'default'))
     # cfg_time_str defaults to the current time
     cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
......
@@ -22,6 +22,9 @@ class winograndeDataset(BaseDataset):
             prompt = line['sentence']
             continue_prompt = prompt.split('_')[1]
             data_item = {
+                'prompt': prompt,
+                'only_option1': line['option1'],
+                'only_option2': line['option2'],
                 'opt1': prompt.replace('_', line['option1']),
                 'opt2': prompt.replace('_', line['option2']),
                 'answer': line['answer'],
@@ -48,6 +51,9 @@ class winograndeDataset_V2(BaseDataset):
             answer = line['answer']
             answer = ' AB'[int(answer)] if answer != '' else 'NULL'
             data_item = {
+                'prompt': prompt,
+                'only_option1': line['option1'],
+                'only_option2': line['option2'],
                 'opt1': prompt.replace('_', line['option1']),
                 'opt2': prompt.replace('_', line['option2']),
                 'answer': answer,
@@ -76,6 +82,9 @@ class winograndeDataset_V3(BaseDataset):
             answer = line['answer']
             answer = ' AB'[int(answer)] if answer != '' else 'NULL'
             data_item = {
+                'prompt': prompt,
+                'only_option1': line['option1'],
+                'only_option2': line['option2'],
                 'opt1': prompt.replace('_', line['option1']),
                 'opt2': prompt.replace('_', line['option2']),
                 'answer': answer,
......
@@ -3,26 +3,28 @@ from .ai360_api import AI360GPT  # noqa: F401
 from .alaya import AlayaLM  # noqa: F401
 from .baichuan_api import BaiChuan, BaiChuan3  # noqa: F401
 from .baidu_api import ERNIEBot  # noqa: F401
-from .base import BaseModel, LMTemplateParser  # noqa
-from .base_api import APITemplateParser, BaseAPIModel  # noqa
+from .base import BaseModel, LMTemplateParser  # noqa: F401
+from .base_api import APITemplateParser, BaseAPIModel  # noqa: F401
 from .bytedance_api import ByteDance  # noqa: F401
 from .claude_api import Claude  # noqa: F401
-from .gemini_api import Gemini, GeminiAllesAPIN  # noqa: F401, F403
-from .glm import GLM130B  # noqa: F401, F403
-from .huggingface import HuggingFace  # noqa: F401, F403
-from .huggingface import HuggingFaceCausalLM  # noqa: F401, F403
-from .huggingface import HuggingFaceChatGLM3  # noqa: F401, F403
+from .gemini_api import Gemini, GeminiAllesAPIN  # noqa: F401
+from .glm import GLM130B  # noqa: F401
+from .huggingface import HuggingFace  # noqa: F401
+from .huggingface import HuggingFaceCausalLM  # noqa: F401
+from .huggingface import HuggingFaceChatGLM3  # noqa: F401
+from .huggingface_above_v4_33 import HuggingFaceBaseModel  # noqa: F401
+from .huggingface_above_v4_33 import HuggingFacewithChatTemplate  # noqa: F401
 from .hunyuan_api import Hunyuan  # noqa: F401
-from .intern_model import InternLM  # noqa: F401, F403
+from .intern_model import InternLM  # noqa: F401
 from .krgpt_api import KrGPT  # noqa: F401
 from .lightllm_api import LightllmAPI  # noqa: F401
-from .llama2 import Llama2, Llama2Chat  # noqa: F401, F403
+from .llama2 import Llama2, Llama2Chat  # noqa: F401
 from .lmdeploy_pytorch import LmdeployPytorchModel  # noqa: F401
 from .lmdeploy_tis import LmdeployTisModel  # noqa: F401
 from .minimax_api import MiniMax  # noqa: F401
 from .mistral_api import Mistral  # noqa: F401
 from .mixtral import Mixtral  # noqa: F401
-from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401, F403
+from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401
 from .moonshot_api import MoonShot  # noqa: F401
 from .nanbeige_api import Nanbeige  # noqa: F401
 from .openai_api import OpenAI  # noqa: F401
......
# flake8: noqa
# yapf: disable
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.models.base_api import APITemplateParser
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def _get_stopping_criteria(stop_words, tokenizer, batch_size):
from transformers import (PreTrainedTokenizer, StoppingCriteria,
StoppingCriteriaList)
class MultiTokenEOSCriteria(StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(self, sequence: str, tokenizer: PreTrainedTokenizer, batch_size: int):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
criteria = []
for stop_word in stop_words:
c = MultiTokenEOSCriteria(stop_word, tokenizer, batch_size)
criteria.append(c)
criteria = StoppingCriteriaList(criteria)
return criteria
def _get_possible_max_seq_len(max_seq_len, path):
if max_seq_len is not None:
return max_seq_len
from transformers import AutoConfig
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
possible_keys = [
'max_position_embeddings',
'seq_length',
'model_max_length',
]
for k in possible_keys:
if hasattr(config, k):
return getattr(config, k)
raise ValueError('max_seq_len is not provided and cannot be inferred from the model config.')
def _convert_chat_messages(inputs):
outputs = []
for _input in inputs:
messages = []
if isinstance(_input, str):
messages.append({'role': 'HUMAN', 'prompt': _input})
else:
for item in _input:
role = {
'HUMAN': 'user',
'BOT': 'assistant',
'SYSTEM': 'system',
}[item['role']]
messages.append({'role': role, 'content': item['prompt']})
outputs.append(messages)
return outputs
def _format_with_fast_chat_template(inputs: List[str], name: str='vicuna'):
try:
from fastchat.model import get_conversation_template
except ImportError:
raise ModuleNotFoundError('fastchat not found. Please install with\npip install "fschat[model_worker,webui]"')
outputs = []
for _input in inputs:
template = get_conversation_template(name)
for item in _input:
if item['role'] == 'user':
template.append_message(template.roles[0], item['content'])
elif item['role'] == 'assistant':
template.append_message(template.roles[1], item['content'])
elif item['role'] == 'system':
continue
else:
raise ValueError(f'Unknown role {item["role"]}')
template.append_message(template.roles[1], None)
outputs.append(template.get_prompt())
return outputs
def _get_meta_template(meta_template):
default_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
return APITemplateParser(meta_template or default_meta_template)
def _set_model_kwargs_torch_dtype(model_kwargs):
import torch
if 'torch_dtype' not in model_kwargs:
torch_dtype = torch.float16
else:
torch_dtype = {
'torch.float16': torch.float16,
'torch.bfloat16': torch.bfloat16,
'torch.float': torch.float,
'auto': 'auto',
'None': None,
}.get(model_kwargs['torch_dtype'])
if torch_dtype is not None:
model_kwargs['torch_dtype'] = torch_dtype
return model_kwargs
@MODELS.register_module()
class HuggingFacewithChatTemplate(BaseModel):
def __init__(self,
path: str,
model_kwargs: dict = dict(),
tokenizer_path: Optional[str] = None,
tokenizer_kwargs: dict = dict(),
peft_path: Optional[str] = None,
peft_kwargs: dict = dict(),
tokenizer_only: bool = False,
generation_kwargs: dict = dict(),
max_seq_len: Optional[int] = None,
meta_template: Optional[Dict] = None,
pad_token_id: Optional[int] = None,
fastchat_template: Optional[str] = None,
stop_words: Optional[str] = [],
**other_kwargs):
self.logger = get_logger()
self.path = path
self.tokenizer_only = tokenizer_only
self.template_parser = _get_meta_template(meta_template)
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
self._load_tokenizer(tokenizer_path or path, tokenizer_kwargs, pad_token_id)
if not tokenizer_only:
self._load_model(path=path, kwargs=model_kwargs, peft_path=peft_path, peft_kwargs=peft_kwargs)
self.generation_kwargs = generation_kwargs
self.fastchat_template = fastchat_template
self.stop_words = stop_words
for k, v in other_kwargs.items():
if v is not None:
self.logger.warning(f'Unused argument {k}={v}')
def _load_tokenizer(self, path: Optional[str], kwargs: dict, pad_token_id: Optional[int] = None):
from transformers import AutoTokenizer, GenerationConfig
DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', use_fast=False, trust_remote_code=True)
tokenizer_kwargs = DEFAULT_TOKENIZER_KWARGS
tokenizer_kwargs.update(kwargs)
self.tokenizer = AutoTokenizer.from_pretrained(path, **tokenizer_kwargs)
# A patch for some models without pad_token_id
if pad_token_id is not None:
if self.tokenizer.pad_token_id is None:
self.logger.debug(f'Using {pad_token_id} as pad_token_id')
elif self.tokenizer.pad_token_id != pad_token_id:
self.logger.warning(f'pad_token_id is not consistent. Using {pad_token_id} as pad_token_id')
self.tokenizer.pad_token_id = pad_token_id
return
if self.tokenizer.pad_token_id is not None:
return
self.logger.warning('pad_token_id is not set for the tokenizer.')
generation_config = GenerationConfig.from_pretrained(path)
if generation_config.pad_token_id is not None:
self.logger.warning(f'Using {generation_config.pad_token_id} as pad_token_id.')
self.tokenizer.pad_token_id = generation_config.pad_token_id
return
if self.tokenizer.eos_token_id is not None:
self.logger.warning(f'Using eos_token_id {self.tokenizer.eos_token_id} as pad_token_id.')
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
return
raise ValueError('pad_token_id is not set for this tokenizer. Please set `pad_token_id={PAD_TOKEN_ID}` in model_cfg.')
def _load_model(self, path: str, kwargs: dict, peft_path: Optional[str] = None, peft_kwargs: dict = dict()):
from transformers import AutoModel, AutoModelForCausalLM
DEFAULT_MODEL_KWARGS = dict(device_map='auto', trust_remote_code=True)
model_kwargs = DEFAULT_MODEL_KWARGS
model_kwargs.update(kwargs)
model_kwargs = _set_model_kwargs_torch_dtype(model_kwargs)
try:
self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
except ValueError:
self.model = AutoModel.from_pretrained(path, **model_kwargs)
if peft_path is not None:
from peft import PeftModel
peft_kwargs['is_trainable'] = False
self.model = PeftModel.from_pretrained(self.model, peft_path, **peft_kwargs)
self.model.eval()
self.model.generation_config.do_sample = False
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
messages = _convert_chat_messages(inputs)
batch_size = len(messages)
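        # step-1: format the inputs as chat messages and tokenize them as a batch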
tokenize_kwargs = dict(
return_tensors='pt',
padding=True,
truncation=True,
add_special_tokens=True,
max_length=self.max_seq_len
)
if self.fastchat_template:
messages = _format_with_fast_chat_template(messages, self.fastchat_template)
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
else:
messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
tokenize_kwargs['add_special_tokens'] = False
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
generation_kwargs = self.generation_kwargs.copy()
generation_kwargs.update(kwargs)
stopping_criteria = list(set(stopping_criteria + self.stop_words))
if stopping_criteria:
generation_kwargs['stopping_criteria'] = _get_stopping_criteria(stopping_criteria, self.tokenizer, batch_size)
if max_out_len is not None:
generation_kwargs['max_new_tokens'] = max_out_len
if min_out_len is not None:
generation_kwargs['min_new_tokens'] = min_out_len
generation_kwargs['pad_token_id'] = self.tokenizer.pad_token_id
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens, **generation_kwargs)
outputs = outputs[:, tokens['input_ids'].shape[1]:]
# step-3: decode the output
decodeds = self.tokenizer.batch_decode(outputs)
for stop in stopping_criteria:
decodeds = [t.split(stop)[0] for t in decodeds]
return decodeds
def get_token_len(self, prompt: str) -> int:
m = _convert_chat_messages([prompt])[0]
t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
return len(t['input_ids'])
def _convert_base_messages(inputs):
outputs = []
for _input in inputs:
if isinstance(_input, str):
outputs.append(_input)
else:
messages = []
for item in _input:
messages.append(item['prompt'])
outputs.append(''.join(messages))
return outputs
class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
def __init__(self,
path: str,
model_kwargs: dict = dict(),
tokenizer_path: Optional[str] = None,
tokenizer_kwargs: dict = dict(),
peft_path: Optional[str] = None,
peft_kwargs: dict = dict(),
tokenizer_only: bool = False,
generation_kwargs: dict = dict(),
max_seq_len: Optional[int] = None,
pad_token_id: Optional[int] = None,
stop_words: Optional[str] = [],
**other_kwargs):
self.logger = get_logger()
self.path = path
self.tokenizer_only = tokenizer_only
self.template_parser = LMTemplateParser()
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
self._load_tokenizer(tokenizer_path or path, tokenizer_kwargs, pad_token_id)
if not tokenizer_only:
self._load_model(path=path, kwargs=model_kwargs, peft_path=peft_path, peft_kwargs=peft_kwargs)
self.generation_kwargs = generation_kwargs
self.stop_words = stop_words
for k, v in other_kwargs.items():
if v is not None:
self.logger.warning(f'Unused argument {k}={v}')
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
messages = _convert_base_messages(inputs)
batch_size = len(messages)
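        # step-1: tokenize the plain-text inputs as a batch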
tokenize_kwargs = dict(
return_tensors='pt',
padding=True,
truncation=True,
add_special_tokens=True,
max_length=self.max_seq_len
)
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
generation_kwargs = self.generation_kwargs.copy()
generation_kwargs.update(kwargs)
stopping_criteria = list(set(stopping_criteria + self.stop_words))
if stopping_criteria:
generation_kwargs['stopping_criteria'] = _get_stopping_criteria(stopping_criteria, self.tokenizer, batch_size)
if max_out_len is not None:
generation_kwargs['max_new_tokens'] = max_out_len
if min_out_len is not None:
generation_kwargs['min_new_tokens'] = min_out_len
generation_kwargs['pad_token_id'] = self.tokenizer.pad_token_id
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens, **generation_kwargs)
outputs = outputs[:, tokens['input_ids'].shape[1]:]
# step-3: decode the output
decodeds = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
for stop in stopping_criteria:
decodeds = [token.split(stop)[0] for token in decodeds]
return decodeds
def get_ppl(self, inputs: List[str], mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInfernecer is
not needed.
Returns:
List[float]: A list of perplexity scores.
"""
assert self.tokenizer.pad_token
import torch
import torch.nn.functional as F
pad_token_id = self.tokenizer.pad_token_id
messages = _convert_base_messages(inputs)
tokenize_kwargs = dict(
return_tensors='pt',
padding=True,
truncation=True,
add_special_tokens=True,
max_length=self.max_seq_len
)
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
outputs = self.model(**tokens)[0]
batch_size, seq_len, vocab_size = outputs.shape
shift_logits = outputs[:, :-1, :].contiguous().float()
shift_labels = tokens['input_ids'][:, 1:].contiguous()
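        # per-token next-token cross-entropy; padding positions are excluded via ignore_index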
loss = F.cross_entropy(
shift_logits.view(-1, vocab_size),
shift_labels.view(-1),
ignore_index=pad_token_id,
reduction='none').view(batch_size, seq_len - 1)
lens = (tokens['input_ids'] != pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
import numpy as np
mask = torch.zeros_like(shift_labels) # [batch,seqlen]
for i in range(len(mask)):
for j in range(mask_length[i] - 1, len(mask[i])):
mask[i][j] = 1
loss = loss * mask
lens -= np.array(mask_length)
ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_loglikelihood(self, inputs: List[str], conts: List[str]) -> List[float]:
mask_length = [self.get_token_len(c, add_special_tokens=False) for c in conts]
return - self.get_ppl(inputs, mask_length)
def get_token_len(self, prompt: str, add_special_tokens: bool=True) -> int:
m = _convert_base_messages([prompt])[0]
t = self.tokenizer(m, add_special_tokens=add_special_tokens)
return len(t['input_ids'])
@@ -37,9 +37,6 @@ class TurboMindModel(BaseModel):
             arguments like session_len, max_batch_size for TurboMind.
         gen_config (Dict, optional): Generation config to set
             arguments like top_k, top_p, temperature.
-        end_str (str, optional): Whether to trim generated strings with end_str
-            if the model has special ending strings that are not handled well.
-            Defaults to None.
     """
     def __init__(self,
@@ -47,9 +44,8 @@ class TurboMindModel(BaseModel):
                  concurrency: int = 8,
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,
-                 engine_config: Optional[Dict] = None,
-                 gen_config: Optional[Dict] = None,
-                 end_str: Optional[str] = None):
+                 engine_config: Dict = {},
+                 gen_config: Dict = {}):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
@@ -70,12 +66,14 @@ class TurboMindModel(BaseModel):
         ]
         self.generator_ids = [i + 1 for i in range(concurrency)]
         self.gen_config = gen_config
-        self.end_str = end_str
         self.major_version, self.minor_version, _ = version_info
     def generate(self,
                  inputs: List[str],
                  max_out_len: int = 512,
+                 stopping_criteria: List[str] = [],
+                 do_sample: Optional[bool] = None,
+                 temperature: int = 1,
                  **kwargs) -> List[str]:
         """Generate results given a list of inputs.
@@ -96,13 +94,21 @@ class TurboMindModel(BaseModel):
         ]
         gen_config = copy.deepcopy(self.gen_config)
-        if 'do_sample' in kwargs:
-            if kwargs['do_sample']:
-                gen_config.top_k = 1000
-                gen_config.temperature = kwargs.get('temperature', 1)
+        if do_sample is not None:
+            if do_sample:
+                gen_config['top_k'] = 1000
+                gen_config['temperature'] = temperature
             else:
-                gen_config.top_k = 1
-                gen_config.temperature = 0.01
+                gen_config['top_k'] = 1
+        if stopping_criteria:
+            stop_words = gen_config.get('stop_words', [])
+            for t in stopping_criteria:
+                t = self.tokenizer.encode(t, add_bos=False)
+                stop_words.append(t[0])
+            gen_config['stop_words'] = list(set(stop_words))
+        from lmdeploy.messages import EngineGenerationConfig
+        gen_config = EngineGenerationConfig(**gen_config)
         results = []
         for batch_input in batch_inputs:
@@ -115,7 +121,6 @@ class TurboMindModel(BaseModel):
                         batch_input,
                         [max_out_len] * len(batch_input),
                         [gen_config] * len(batch_input),
-                        [self.end_str] * len(batch_input),
                     ))
             results += _results
         return results
@@ -136,8 +141,7 @@ class TurboMindModel(BaseModel):
                  session_id,
                  prompt: PromptType,
                  max_out_len: int,
-                 gen_config=None,
-                 end_str: Optional[str] = None) -> str:
+                 gen_config=None) -> str:
         """Generate results given a list of inputs.
         Args:
@@ -147,10 +151,6 @@ class TurboMindModel(BaseModel):
             max_out_len (int): The maximum length of the output.
             gen_config (EngineGenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
-            end_str (str, optional): Whether to trim generated strings
-                with end_str if the model has special ending strings
-                that are not handled well.
-                Defaults to None.
         Returns:
             str: The generated string.
         """
@@ -173,9 +173,6 @@ class TurboMindModel(BaseModel):
         _, output_ids, _ = outputs
         response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
-        # used to trim
-        if end_str:
-            response = response.split(end_str)[0]
         return response
     def get_ppl(self,
......
@@ -25,7 +25,7 @@ class VLLM(BaseModel):
                  meta_template: Optional[Dict] = None,
                  mode: str = 'none',
                  use_fastchat_template: bool = False,
-                 end_str: Optional[str] = None,
+                 stop_words: List[str] = [],
     ):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
@@ -42,7 +42,7 @@ class VLLM(BaseModel):
         assert mode in ['none', 'mid']
         self.mode = mode
         self.use_fastchat_template = use_fastchat_template
-        self.end_str = end_str
+        self.stop_words = stop_words
     def _load_model(self,
                     path: str,
@@ -59,7 +59,10 @@ class VLLM(BaseModel):
             ray.shutdown()
         self.model = LLM(path, **model_kwargs)
-    def generate(self, inputs: List[str], max_out_len: int,
+    def generate(self,
+                 inputs: List[str],
+                 max_out_len: int,
+                 stopping_criteria: List[str] = [],
                  **kwargs) -> List[str]:
         """Generate results given a list of inputs.
@@ -90,6 +93,8 @@ class VLLM(BaseModel):
         generation_kwargs = kwargs.copy()
         generation_kwargs.update(self.generation_kwargs)
         generation_kwargs.update({'max_tokens': max_out_len})
+        _stop = list(set(self.stop_words + stopping_criteria))
+        generation_kwargs.update({'stop': _stop})
         sampling_kwargs = SamplingParams(**generation_kwargs)
         outputs = self.model.generate(inputs, sampling_kwargs)
@@ -97,9 +102,6 @@ class VLLM(BaseModel):
         for output in outputs:
             prompt = output.prompt
             generated_text = output.outputs[0].text
-            if self.end_str:
-                generated_text = generated_text.split(self.end_str)[0]
             prompt_list.append(prompt)
             output_strs.append(generated_text)
......
"""PPL Inferencer.""" # flake8: noqa
# yapf: disable
"""LogLikelihood(LL) Inferencer."""
import os import os
from typing import List, Optional from typing import List, Optional
...@@ -76,16 +78,13 @@ class LLInferencer(BaseInferencer): ...@@ -76,16 +78,13 @@ class LLInferencer(BaseInferencer):
# 3. Get labels of all the classes # 3. Get labels of all the classes
if self.labels is None: if self.labels is None:
labels = retriever.get_labels(ice_template=ice_template, labels = retriever.get_labels(ice_template=ice_template, prompt_template=prompt_template)
prompt_template=prompt_template)
else: else:
labels = self.labels labels = self.labels
# 4. Generate in-context examples for testing inputs # 4. Generate in-context examples for testing inputs
for idx in range(len(ice_idx_list)): for idx in range(len(ice_idx_list)):
ice.append( ice.append(retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template))
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template))
output_handler.save_ice(self.model.parse_template(ice, mode='ppl')) output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
# 5. Calculating loglikelihood for prompts in each label's class # 5. Calculating loglikelihood for prompts in each label's class
...@@ -99,58 +98,41 @@ class LLInferencer(BaseInferencer): ...@@ -99,58 +98,41 @@ class LLInferencer(BaseInferencer):
# 5.1 Generate prompts of current label and truncate # 5.1 Generate prompts of current label and truncate
# TODO: Refactor # TODO: Refactor
for idx in range(len(ice_idx_list)): for idx in range(len(ice_idx_list)):
prompt = retriever.generate_label_prompt( prompt_kwargs = {
idx, 'idx': idx,
ice[idx], 'ice': ice[idx],
label, 'label': label,
ice_template=ice_template, 'ice_template': ice_template,
prompt_template=prompt_template) 'prompt_template': prompt_template,
}
prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 if self.max_seq_len is not None:
-                    prompt_token_num = self.model.get_token_len_from_template(
-                        prompt, mode='ppl')
-                    while len(ice_idx_list[idx]
-                              ) > 0 and prompt_token_num > self.max_seq_len:
+                    while len(ice_idx_list[idx]) > 0 and prompt_token_num > self.max_seq_len:
                         ice_idx_list[idx] = ice_idx_list[idx][:-1]
-                        ice[idx] = retriever.generate_ice(
-                            ice_idx_list[idx], ice_template=ice_template)
-                        prompt = retriever.generate_label_prompt(
-                            idx,
-                            ice[idx],
-                            label,
-                            ice_template=ice_template,
-                            prompt_template=prompt_template)
-                        prompt_token_num = self.model.get_token_len_from_template(  # noqa
-                            prompt, mode='ppl')  # noqa
+                        ice[idx] = retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template)
+                        prompt_kwargs['ice'] = ice[idx]
+                        prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                        prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 prompt_list.append(prompt)
                 token_num_list.append(prompt_token_num)
                 cont_list.append(retriever.test_ds[idx]['cont'])
             # 5.2 Get loglikelihood
-            logger.info(
-                f"Calculating Loglikelihood for prompts labeled '{label}'"
-            )  # noqa
-            for idx in trange(0,
-                              len(prompt_list),
-                              self.batch_size,
-                              disable=not self.is_main_process):
+            logger.info(f"Calculating Loglikelihood for prompts labeled '{label}'")
+            for idx in trange(0, len(prompt_list), self.batch_size, disable=not self.is_main_process):
                 sub_prompt_list = prompt_list[idx:idx + self.batch_size]
                 sub_cont_list = cont_list[idx:idx + self.batch_size]
                 with torch.no_grad():
                     # mainly modify compared to PPLInferencer
-                    sub_inputs = self.model.parse_template(sub_prompt_list,
-                                                           mode='ppl')
-                    sub_res = self.model.get_loglikelihood(
-                        sub_inputs, sub_cont_list).tolist()
-                for res, prompt in zip(
-                        sub_res,
-                        self.model.parse_template(sub_prompt_list,
-                                                  mode='ppl')):
+                    sub_inputs = self.model.parse_template(sub_prompt_list, mode='ppl')
+                    sub_res = self.model.get_loglikelihood(sub_inputs, sub_cont_list).tolist()
+                for res, prompt in zip(sub_res, self.model.parse_template(sub_prompt_list, mode='ppl')):
                     sub_ppl_list.append(res)
                     ice_str = self.model.parse_template(ice[idx], mode='ppl')
-                    output_handler.save_prompt_and_loglikelihood(
-                        label, prompt.replace(ice_str, ''), prompt, res, index)
+                    output_handler.save_prompt_and_loglikelihood(label, prompt.replace(ice_str, ''), prompt, res, index)
                     index = index + 1
             ppl.append(sub_ppl_list)
@@ -169,13 +151,9 @@ class LLInferencer(BaseInferencer):
         # 8. Output
         if self.is_main_process:
             os.makedirs(output_json_filepath, exist_ok=True)
-            output_handler.write_to_json(output_json_filepath,
-                                         output_json_filename)
+            output_handler.write_to_json(output_json_filepath, output_json_filename)
-        return [
-            sample['prediction']
-            for sample in output_handler.results_dict.values()
-        ]
+        return [sample['prediction'] for sample in output_handler.results_dict.values()]
 class LLInferencerOutputHandler:
...
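For context, the loglikelihood pass above slices the assembled prompts into batches and scores each continuation against its prompt under torch.no_grad(). A minimal standalone sketch of that batching pattern; score_batch is a hypothetical callable standing in for self.model.get_loglikelihood and is not an OpenCompass API:

import torch

def score_in_batches(prompts, continuations, score_batch, batch_size=8):
    # Toy version of the "5.2 Get loglikelihood" loop: batch the prompts,
    # score each (prompt, continuation) pair, collect one float per sample.
    results = []
    for start in range(0, len(prompts), batch_size):
        sub_prompts = prompts[start:start + batch_size]
        sub_conts = continuations[start:start + batch_size]
        with torch.no_grad():  # inference only, no gradients needed
            scores = score_batch(sub_prompts, sub_conts)
        results.extend(float(s) for s in scores)
    return results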
+# flake8: noqa
+# yapf: disable
 """PPL Inferencer."""
 import os
@@ -84,9 +86,7 @@ class PPLInferencer(BaseInferencer):
         # 4. Generate in-context examples for testing inputs
         for idx in range(len(ice_idx_list)):
-            ice.append(
-                retriever.generate_ice(ice_idx_list[idx],
-                                       ice_template=ice_template))
+            ice.append(retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template))
         output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
         # 5. Calculating PPL for prompts in each label's class
@@ -101,33 +101,26 @@ class PPLInferencer(BaseInferencer):
             # 5.1 Generate prompts of current label and truncate
             # TODO: Refactor
             for idx in range(len(ice_idx_list)):
-                prompt = retriever.generate_label_prompt(
-                    idx,
-                    ice[idx],
-                    label,
-                    ice_template=ice_template,
-                    prompt_template=prompt_template,
-                    remain_sep=normalizing_str is not None)
-                prompt_token_num = self.model.get_token_len_from_template(
-                    prompt, mode='ppl')
+                prompt_kwargs = {
+                    'idx': idx,
+                    'ice': ice[idx],
+                    'label': label,
+                    'ice_template': ice_template,
+                    'prompt_template': prompt_template,
+                    'remain_sep': normalizing_str is not None
+                }
+                prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 if self.max_seq_len is not None:
-                    while len(ice_idx_list[idx]
-                              ) > 0 and prompt_token_num > self.max_seq_len:
+                    while len(ice_idx_list[idx]) > 0 and prompt_token_num > self.max_seq_len:
                         ice_idx_list[idx] = ice_idx_list[idx][:-1]
-                        ice[idx] = retriever.generate_ice(
-                            ice_idx_list[idx], ice_template=ice_template)
-                        prompt = retriever.generate_label_prompt(
-                            idx,
-                            ice[idx],
-                            label,
-                            ice_template=ice_template,
-                            prompt_template=prompt_template)
-                        prompt_token_num = self.model.get_token_len_from_template(  # noqa
-                            prompt, mode='ppl')  # noqa
+                        ice[idx] = retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template)
+                        prompt_kwargs['ice'] = ice[idx]
+                        prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                        prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 if normalizing_str is not None:
-                    assert isinstance(prompt, str), \
-                        'Prompt must be a string when normalizing_str is set.'
+                    assert isinstance(prompt, str), 'Prompt must be a string when normalizing_str is set.'
                     prompt_sep = prompt
                     if prompt_template is not None:
                         sep_token = prompt_template.sep_token
@@ -140,10 +133,9 @@ class PPLInferencer(BaseInferencer):
                     prompt = context + answer
                     normalizing_prompt = normalizing_str + answer
-                    context_length_list.append(
-                        self.model.get_token_len_from_template(context,
-                                                               mode='ppl'))
+                    context_length_list.append(self.model.get_token_len_from_template(context, mode='ppl'))
                     normalizing_prompt_list.append(normalizing_prompt)
                 prompt_list.append(prompt)
                 token_num_list.append(prompt_token_num)
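Both inferencers now share the same truncation pattern seen in the hunks above: trailing in-context examples are dropped one at a time until the templated prompt fits max_seq_len. A minimal sketch of that loop; build_prompt and count_tokens are hypothetical callables standing in for retriever.generate_label_prompt and self.model.get_token_len_from_template:

def truncate_to_fit(ice_idx, build_prompt, count_tokens, max_seq_len):
    # Rebuild the prompt with progressively fewer in-context examples
    # until it fits the model's sequence-length budget (or none remain).
    prompt = build_prompt(ice_idx)
    num_tokens = count_tokens(prompt)
    while ice_idx and num_tokens > max_seq_len:
        ice_idx = ice_idx[:-1]          # drop the last in-context example
        prompt = build_prompt(ice_idx)  # regenerate with the shorter ICE list
        num_tokens = count_tokens(prompt)
    return prompt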
@@ -153,45 +145,25 @@ class PPLInferencer(BaseInferencer):
             # 5.2 Get PPL
             logger.info(f"Calculating PPL for prompts labeled '{label}'")
-            for idx in trange(0,
-                              len(prompt_list),
-                              self.batch_size,
-                              disable=not self.is_main_process):
+            for idx in trange(0, len(prompt_list), self.batch_size, disable=not self.is_main_process):
                 sub_prompt_list = prompt_list[idx:idx + self.batch_size]
-                if normalizing_str is not None:
-                    sub_context_length_list = context_length_list[idx:idx +
-                                                                  self.
-                                                                  batch_size]
-                    sub_normalizing_prompt_list = normalizing_prompt_list[
-                        idx:idx + self.batch_size]
                 with torch.no_grad():
                     if normalizing_str is not None:
-                        res1 = self.model.get_ppl_from_template(
-                            sub_prompt_list,
-                            mask_length=sub_context_length_list)
-                        res2 = self.model.get_ppl_from_template(
-                            sub_normalizing_prompt_list,
-                            mask_length=[
-                                normalizing_str_len
-                                for i in range(len(sub_prompt_list))
-                            ])
+                        sub_context_length_list = context_length_list[idx:idx + self.batch_size]
+                        sub_normalizing_prompt_list = normalizing_prompt_list[idx:idx + self.batch_size]
+                        res1 = self.model.get_ppl_from_template(sub_prompt_list, mask_length=sub_context_length_list)
+                        sub_normalizing_context_length_list = [normalizing_str_len for _ in range(len(sub_prompt_list))]
+                        res2 = self.model.get_ppl_from_template(sub_normalizing_prompt_list, mask_length=sub_normalizing_context_length_list)
                         sub_res = res1 - res2
                     else:
-                        sub_res = self.model.get_ppl_from_template(
-                            sub_prompt_list).tolist()
-                for res, prompt in zip(
-                        sub_res,
-                        self.model.parse_template(sub_prompt_list,
-                                                  mode='ppl')):
+                        sub_res = self.model.get_ppl_from_template(sub_prompt_list).tolist()
+                for res, prompt in zip(sub_res, self.model.parse_template(sub_prompt_list, mode='ppl')):
                     sub_ppl_list.append(res)
                     ice_str = self.model.parse_template(ice[idx], mode='ppl')
-                    output_handler.save_prompt_and_ppl(
-                        label, prompt.replace(ice_str, ''), prompt, res, index)
-                    output_handler.results_dict[str(
-                        index)][f'label: {str(label)}'][
-                            'BPB'] = res * token_num_list[index] / len(
-                                prompt.replace(ice_str, '').encode())
+                    prompt_wo_ice = prompt.replace(ice_str, '')
+                    output_handler.save_prompt_and_ppl(label, prompt_wo_ice, prompt, res, index)
+                    output_handler.results_dict[str(index)][f'label: {str(label)}']['BPB'] = res * token_num_list[index] / len(prompt_wo_ice.encode())
                     index = index + 1
             ppl.append(sub_ppl_list)
@@ -210,10 +182,6 @@ class PPLInferencer(BaseInferencer):
         # 8. Output
         if self.is_main_process:
            os.makedirs(output_json_filepath, exist_ok=True)
-            output_handler.write_to_json(output_json_filepath,
-                                         output_json_filename)
+            output_handler.write_to_json(output_json_filepath, output_json_filename)
-        return [
-            sample['prediction']
-            for sample in output_handler.results_dict.values()
-        ]
+        return [sample['prediction'] for sample in output_handler.results_dict.values()]
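The new 'BPB' field stored next to each PPL result is the per-token loss rescaled by the prompt's token count and divided by its UTF-8 byte length, i.e. loss per byte. A small illustration of the same arithmetic with made-up values (res is assumed to be a mean per-token loss, as returned by get_ppl_from_template in the diff above):

prompt_wo_ice = 'The capital of France is Paris.'
res = 2.1        # hypothetical mean per-token loss for this prompt
token_num = 9    # hypothetical token count of the full prompt
bpb = res * token_num / len(prompt_wo_ice.encode())  # total loss spread over bytes
print(f'BPB = {bpb:.4f}')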
@@ -60,14 +60,16 @@ class NumWorkerPartitioner(BasePartitioner):
             if osp.exists(filename):
                 continue
             dataset_size = self.get_size(dataset)
-            if dataset_size > self.min_task_size:
+            if self.num_worker <= 1:
+                chunks.append(dataset)
+            elif dataset_size <= self.min_task_size:
+                chunks.append(dataset)
+            else:
                 root, ext = osp.splitext(filename)
                 dataset_splits = self.split_dataset(dataset)
                 for i, dataset_split in enumerate(dataset_splits):
                     if not osp.exists(f'{root}_{i}{ext}'):
                         chunks.append(dataset_split)
-            else:
-                chunks.append(dataset)
         if self.strategy == 'heuristic':
             buckets = [[] for _ in range(self.num_worker)]
...
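The partitioner change above adds an early exit: with a single worker, or with a dataset no larger than min_task_size, the dataset is kept as one chunk; only larger datasets are split. A standalone sketch of that branching; split_evenly is a hypothetical helper, not the real self.split_dataset:

def make_chunks(dataset, dataset_size, num_worker, min_task_size, split_evenly):
    # Sketch of the new branching in NumWorkerPartitioner: keep the dataset
    # whole when splitting cannot help, otherwise split it across workers.
    chunks = []
    if num_worker <= 1:
        chunks.append(dataset)            # single worker: no point splitting
    elif dataset_size <= min_task_size:
        chunks.append(dataset)            # small dataset: keep as one task
    else:
        chunks.extend(split_evenly(dataset, num_worker))  # one chunk per worker
    return chunks

# toy usage
split_evenly = lambda ds, n: [ds[i::n] for i in range(n)]
print(make_chunks(list(range(10)), 10, 3, 4, split_evenly))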