Unverified commit 7505b3ca, authored by Fengzhe Zhou and committed by GitHub

[Feature] Add huggingface apply_chat_template (#1098)

* add TheoremQA with 5-shot

* add huggingface_above_v4_33 classes

* use num_worker partitioner in cli

* update theoremqa

* update TheoremQA

* add TheoremQA

* rename theoremqa -> TheoremQA

* update TheoremQA output path

* rewrite many model configs

* update huggingface

* further update

* refine configs

* update configs

* update configs

* add configs/eval_llama3_instruct.py

* add summarizer multi faceted

* update bbh datasets

* update configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py

* rename class

* update readme

* update hf above v4.33
parent 6c711cb2
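The gist of the change: HuggingFace model configs drop the hand-rolled `meta_template`/`tokenizer_kwargs` boilerplate in favor of two new classes, `HuggingFaceBaseModel` for base models and `HuggingFacewithChatTemplate` for chat models (the latter formats prompts with the tokenizer's `apply_chat_template`). A minimal sketch of the new style follows; the model paths, abbreviations, and GPU counts below are placeholders, not values from this commit.

```python
# Sketch of the new-style configs introduced by this PR.
# 'org/your-*-model' and the abbr values are placeholders.
from opencompass.models import HuggingFaceBaseModel, HuggingFacewithChatTemplate

models = [
    # Base model: plain-text prompts, no chat template applied.
    dict(
        type=HuggingFaceBaseModel,
        abbr='your-base-model-hf',
        path='org/your-base-model',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    ),
    # Chat model: prompts are rendered via tokenizer.apply_chat_template.
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='your-chat-model-hf',
        path='org/your-chat-model',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    ),
]
```

The per-model diffs below apply exactly this simplification.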
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFaceBaseModel
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFaceBaseModel,
         abbr='yi-34b-hf',
         path='01-ai/Yi-34B',
-        tokenizer_path='01-ai/Yi-34B',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=4, num_procs=1),
+        run_cfg=dict(num_gpus=2),
     )
 ]
from opencompass.models import HuggingFace
models = [
dict(
type=HuggingFace,
abbr='yi-34b-200k-hf',
path='01-ai/Yi-34B-200K',
tokenizer_path='01-ai/Yi-34B-200K',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
)
]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFacewithChatTemplate
-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
-        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
-    ],
-)
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFacewithChatTemplate,
         abbr='yi-34b-chat-hf',
         path='01-ai/Yi-34B-Chat',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        meta_template=_meta_template,
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=2, num_procs=1),
+        run_cfg=dict(num_gpus=2),
-        end_str='<|im_end|>',
-        batch_padding=True,
     )
 ]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFaceBaseModel
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFaceBaseModel,
         abbr='yi-6b-hf',
         path='01-ai/Yi-6B',
-        tokenizer_path='01-ai/Yi-6B',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=1),
     )
 ]
from opencompass.models import HuggingFace
models = [
dict(
type=HuggingFace,
abbr='yi-6b-200k-hf',
path='01-ai/Yi-6B-200K',
tokenizer_path='01-ai/Yi-6B-200K',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFacewithChatTemplate
-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
-        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
-    ],
-)
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFacewithChatTemplate,
         abbr='yi-6b-chat-hf',
         path='01-ai/Yi-6B-Chat',
-        tokenizer_path='01-ai/Yi-6B-Chat',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        meta_template=_meta_template,
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=1),
-        end_str='<|im_end|>',
-        batch_padding=True,
     )
 ]
-from opencompass.models import HuggingFace
+from opencompass.models import HuggingFacewithChatTemplate
-_meta_template = dict(
-    round=[
-        dict(role="HUMAN", begin='<|user|>\n', end='</s>'),
-        dict(role="BOT", begin="<|assistant|>\n", end='</s>', generate=True),
-    ],
-)
 models = [
     dict(
-        type=HuggingFace,
+        type=HuggingFacewithChatTemplate,
         abbr='zephyr-7b-beta-hf',
         path='HuggingFaceH4/zephyr-7b-beta',
-        tokenizer_path='HuggingFaceH4/zephyr-7b-beta',
-        model_kwargs=dict(
-            trust_remote_code=True,
-            device_map='auto',
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        meta_template=_meta_template,
-        max_out_len=100,
-        max_seq_len=2048,
+        max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=1),
-        end_str='</s>',
     )
 ]
from mmengine.config import read_base
with read_base():
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
from .groups.lcbench import lcbench_summary_groups
other_summary_groups = [
{
'name': 'average',
'subsets': [
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['average', 'naive_average'],
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'mmlu',
'mmlu-stem',
'mmlu-social-science',
'mmlu-humanities',
'mmlu-other',
'cmmlu',
'cmmlu-stem',
'cmmlu-social-science',
'cmmlu-humanities',
'cmmlu-other',
'cmmlu-china-specific',
'ceval',
'ceval-stem',
'ceval-social-science',
'ceval-humanities',
'ceval-other',
'ceval-hard',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
from mmengine.config import read_base
from opencompass.summarizers import MultiFacetedSummarizer
with read_base():
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
other_summary_groups = [
{
'name': 'average',
'subsets': [
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
],
},
]
overall_dataset_abbrs = [
['average', 'naive_average'],
['mmlu', 'naive_average'],
['cmmlu', 'naive_average'],
['ceval', 'naive_average'],
['GaokaoBench', 'weighted_average'],
['triviaqa_wiki_1shot', 'score'],
['nq_open_1shot', 'score'],
['race-high', 'accuracy'],
['winogrande', 'accuracy'],
['hellaswag', 'accuracy'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['TheoremQA', 'score'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
]
mmlu_summary_groups_dict = {g['name']: g['subsets'] for g in mmlu_summary_groups}
mmlu_dataset_abbrs = [
['mmlu', 'naive_average'],
['mmlu-stem', 'naive_average'],
['mmlu-social-science', 'naive_average'],
['mmlu-humanities', 'naive_average'],
['mmlu-other', 'naive_average'],
*mmlu_summary_groups_dict['mmlu-stem'],
*mmlu_summary_groups_dict['mmlu-social-science'],
*mmlu_summary_groups_dict['mmlu-humanities'],
*mmlu_summary_groups_dict['mmlu-other'],
]
cmmlu_summary_groups_dict = {g['name']: g['subsets'] for g in cmmlu_summary_groups}
cmmlu_dataset_abbrs = [
['cmmlu', 'naive_average'],
['cmmlu-stem', 'naive_average'],
['cmmlu-social-science', 'naive_average'],
['cmmlu-humanities', 'naive_average'],
['cmmlu-other', 'naive_average'],
['cmmlu-china-specific', 'naive_average'],
*cmmlu_summary_groups_dict['cmmlu-stem'],
*cmmlu_summary_groups_dict['cmmlu-social-science'],
*cmmlu_summary_groups_dict['cmmlu-humanities'],
*cmmlu_summary_groups_dict['cmmlu-other'],
]
ceval_summary_groups_dict = {g['name']: g['subsets'] for g in ceval_summary_groups}
ceval_dataset_abbrs = [
['ceval', 'naive_average'],
['ceval-stem', 'naive_average'],
['ceval-social-science', 'naive_average'],
['ceval-humanities', 'naive_average'],
['ceval-other', 'naive_average'],
['ceval-hard', 'naive_average'],
*ceval_summary_groups_dict['ceval-stem'],
*ceval_summary_groups_dict['ceval-social-science'],
*ceval_summary_groups_dict['ceval-humanities'],
*ceval_summary_groups_dict['ceval-other'],
]
bbh_summary_groups_dict = {g['name']: g['subsets'] for g in bbh_summary_groups}
bbh_dataset_abbrs = [
['bbh', 'naive_average'],
*bbh_summary_groups_dict['bbh'],
]
GaokaoBench_summary_groups_dict = {g['name']: g['subsets'] for g in GaokaoBench_summary_groups}
GaokaoBench_dataset_abbrs = [
['GaokaoBench', 'weighted_average'],
*GaokaoBench_summary_groups_dict['GaokaoBench'],
]
sanitized_mbpp_dataset_abbrs = [
['sanitized_mbpp', 'score'],
['sanitized_mbpp', 'pass'],
['sanitized_mbpp', 'failed'],
['sanitized_mbpp', 'wrong_answer'],
['sanitized_mbpp', 'timeout'],
]
summarizer = dict(
type=MultiFacetedSummarizer,
dataset_abbrs_list=[
{'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs},
{'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs},
{'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs},
{'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs},
{'name': 'GaokaoBench', 'dataset_abbrs': GaokaoBench_dataset_abbrs},
{'name': 'sanitized_mbpp', 'dataset_abbrs': sanitized_mbpp_dataset_abbrs},
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
@@ -80,13 +80,8 @@ For HuggingFace models, users can set model parameters directly through the comm
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
---hf-path facebook/opt-125m \
---model-kwargs device_map='auto' \
---tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \
---max-seq-len 2048 \
---max-out-len 100 \
---batch-size 128 \
---num-gpus 1  # Number of minimum required GPUs
+--hf-type base \
+--hf-path facebook/opt-125m
 ```
 Note that in this way, OpenCompass only evaluates one model at a time, while other ways can evaluate multiple models at once.
@@ -99,12 +94,14 @@ Note that in this way, OpenCompass only evaluates one model at a time, while oth
 :animate: fade-in-slide-down
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
+--hf-type base \  # HuggingFace model type, base or chat
 --hf-path facebook/opt-125m \  # HuggingFace model path
 --tokenizer-path facebook/opt-125m \  # HuggingFace tokenizer path (if the same as the model path, can be omitted)
 --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \  # Arguments to construct the tokenizer
 --model-kwargs device_map='auto' \  # Arguments to construct the model
 --max-seq-len 2048 \  # Maximum sequence length the model can accept
 --max-out-len 100 \  # Maximum number of tokens to generate
+--min-out-len 100 \  # Minimum number of tokens to generate
 --batch-size 64 \  # Batch size
 --num-gpus 1  # Number of GPUs required to run the model
 ```
@@ -146,28 +143,22 @@ python run.py configs/eval_demo.py
 OpenCompass provides a series of pre-defined model configurations under `configs/models`. Below is the configuration snippet related to [opt-350m](https://github.com/open-compass/opencompass/blob/main/configs/models/opt/hf_opt_350m.py) (`configs/models/opt/hf_opt_350m.py`):
 ```python
-# Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceCausalLM`
-from opencompass.models import HuggingFaceCausalLM
-# OPT-350M
-opt350m = dict(
-    type=HuggingFaceCausalLM,
-    # Initialization parameters for `HuggingFaceCausalLM`
-    path='facebook/opt-350m',
-    tokenizer_path='facebook/opt-350m',
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        proxies=None,
-        trust_remote_code=True),
-    model_kwargs=dict(device_map='auto'),
-    # Below are common parameters for all models, not specific to HuggingFaceCausalLM
-    abbr='opt350m',               # Model abbreviation for result display
-    max_seq_len=2048,             # The maximum length of the entire sequence
-    max_out_len=100,              # Maximum number of generated tokens
-    batch_size=64,                # batchsize
-    run_cfg=dict(num_gpus=1),     # The required GPU numbers for this model
+# Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceBaseModel`
+from opencompass.models import HuggingFaceBaseModel
+models = [
+    # OPT-350M
+    dict(
+        type=HuggingFaceBaseModel,
+        # Initialization parameters for `HuggingFaceBaseModel`
+        path='facebook/opt-350m',
+        # Below are common parameters for all models, not specific to HuggingFaceBaseModel
+        abbr='opt-350m-hf',          # Model abbreviation
+        max_out_len=1024,            # Maximum number of generated tokens
+        batch_size=32,               # Batch size
+        run_cfg=dict(num_gpus=1),    # The required GPU numbers for this model
     )
+]
 ```
 When using configurations, we can specify the relevant files through the command-line argument `--models` or import the model configurations into the `models` list in the configuration file using the inheritance mechanism.
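As a quick illustration of that inheritance mechanism, a top-level config can pull the snippet above in via `read_base()`. This is a sketch: the dataset import path is illustrative, and only the opt-350m model import corresponds to the file shown here.

```python
from mmengine.config import read_base

with read_base():
    # Dataset import path is illustrative; adjust to the datasets you need.
    from .datasets.siqa.siqa_gen import siqa_datasets
    # Reuse the pre-defined opt-350m model config shown above.
    from .models.opt.hf_opt_350m import models as hf_opt_350m_models

datasets = [*siqa_datasets]
models = [*hf_opt_350m_models]
```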
......
@@ -79,13 +79,8 @@ python tools/list_configs.py llama mmlu
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
---hf-path facebook/opt-125m \
---model-kwargs device_map='auto' \
---tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \
---max-seq-len 2048 \
---max-out-len 100 \
---batch-size 128 \
---num-gpus 1  # Minimum number of required GPUs
+--hf-type base \
+--hf-path facebook/opt-125m
 ```
 Note that in this way, OpenCompass only evaluates one model at a time, while the other approaches can evaluate multiple models at once.
@@ -100,12 +95,14 @@ python run.py --datasets siqa_gen winograd_ppl \
 :animate: fade-in-slide-down
 ```bash
 python run.py --datasets siqa_gen winograd_ppl \
+--hf-type base \  # HuggingFace model type, base or chat
 --hf-path facebook/opt-125m \  # HuggingFace model path
 --tokenizer-path facebook/opt-125m \  # HuggingFace tokenizer path (can be omitted if it is the same as the model path)
 --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \  # Arguments for constructing the tokenizer
 --model-kwargs device_map='auto' \  # Arguments for constructing the model
 --max-seq-len 2048 \  # Maximum sequence length the model can accept
 --max-out-len 100 \  # Maximum number of tokens to generate
+--min-out-len 100 \  # Minimum number of tokens to generate
 --batch-size 64 \  # Batch size
 --num-gpus 1  # Number of GPUs required to run the model
 ```
@@ -147,28 +144,22 @@ python run.py configs/eval_demo.py
 OpenCompass provides a series of pre-defined model configurations under `configs/models`. Below is the configuration snippet related to [opt-350m](https://github.com/open-compass/opencompass/blob/main/configs/models/opt/hf_opt_350m.py) (`configs/models/opt/hf_opt_350m.py`):
 ```python
-# Use `HuggingFaceCausalLM` to evaluate models supported by HuggingFace's `AutoModelForCausalLM`
-from opencompass.models import HuggingFaceCausalLM
-# OPT-350M
-opt350m = dict(
-    type=HuggingFaceCausalLM,
-    # Initialization parameters for `HuggingFaceCausalLM`
-    path='facebook/opt-350m',
-    tokenizer_path='facebook/opt-350m',
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        proxies=None,
-        trust_remote_code=True),
-    model_kwargs=dict(device_map='auto'),
-    # Below are common parameters for all models, not specific to HuggingFaceCausalLM
-    abbr='opt350m',               # Model abbreviation shown in results
-    max_seq_len=2048,             # Maximum length of the entire sequence
-    max_out_len=100,              # Maximum number of tokens to generate
-    batch_size=64,                # Batch size
-    run_cfg=dict(num_gpus=1),     # Number of GPUs required by this model
+# Use `HuggingFaceBaseModel` to evaluate models supported by HuggingFace's `AutoModelForCausalLM`
+from opencompass.models import HuggingFaceBaseModel
+models = [
+    # OPT-350M
+    dict(
+        type=HuggingFaceBaseModel,
+        # Initialization parameters for `HuggingFaceBaseModel`
+        path='facebook/opt-350m',
+        # Below are common parameters for all models, not specific to HuggingFaceBaseModel
+        abbr='opt-350m-hf',          # Model abbreviation
+        max_out_len=1024,            # Maximum number of tokens to generate
+        batch_size=32,               # Batch size
+        run_cfg=dict(num_gpus=1),    # Number of GPUs required by this model
     )
+]
 ```
 When using configurations, we can specify the relevant files via the command-line argument `--models`, or use the inheritance mechanism to import the model configurations into the `models` list of the configuration file.
......
+# flake8: noqa
+# yapf: disable
 import argparse
 import getpass
 import os
@@ -51,7 +53,7 @@ def parse_args():
                         action='store_true',
                         default=False)
     parser.add_argument(
-        '--accelerator',
+        '-a', '--accelerator',
         help='Infer accelerator, support vllm and lmdeploy now.',
         choices=['vllm', 'lmdeploy', 'hf'],
         default='hf',
@@ -81,7 +83,7 @@ def parse_args():
                         'saved in this path, including the slurm logs, '
                         'the evaluation results, the summary results, etc.'
                         'If not specified, the work_dir will be set to '
-                        './outputs/default.',
+                        'outputs/default.',
                         default=None,
                         type=str)
     parser.add_argument(
@@ -95,23 +97,12 @@ def parse_args():
                         help='Report the running status to lark bot',
                         action='store_true',
                         default=False)
-    parser.add_argument('--max-partition-size',
-                        help='The maximum size of an infer task. Only '
-                        'effective when "infer" is missing from the config.',
-                        type=int,
-                        default=40000),
-    parser.add_argument(
-        '--gen-task-coef',
-        help='The dataset cost measurement coefficient for generation tasks, '
-        'Only effective when "infer" is missing from the config.',
-        type=int,
-        default=20)
     parser.add_argument('--max-num-workers',
                         help='Max number of workers to run in parallel. '
                         'Will be overrideen by the "max_num_workers" argument '
                         'in the config.',
                         type=int,
-                        default=32)
+                        default=1)
     parser.add_argument('--max-workers-per-gpu',
                         help='Max task to run in parallel on one GPU. '
                         'It will only be used in the local runner.',
@@ -181,25 +172,21 @@ def parse_dlc_args(dlc_parser):
 def parse_hf_args(hf_parser):
     """These args are all for the quick construction of HuggingFace models."""
-    hf_parser.add_argument('--hf-path', type=str)
-    hf_parser.add_argument('--peft-path', type=str)
-    hf_parser.add_argument('--tokenizer-path', type=str)
-    hf_parser.add_argument('--model-kwargs',
-                           nargs='+',
-                           action=DictAction,
-                           default={})
-    hf_parser.add_argument('--tokenizer-kwargs',
-                           nargs='+',
-                           action=DictAction,
-                           default={})
-    hf_parser.add_argument('--max-out-len', type=int)
-    hf_parser.add_argument('--max-seq-len', type=int)
-    hf_parser.add_argument('--no-batch-padding',
-                           action='store_true',
-                           default=False)
-    hf_parser.add_argument('--batch-size', type=int)
-    hf_parser.add_argument('--num-gpus', type=int)
-    hf_parser.add_argument('--pad-token-id', type=int)
+    hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
+    hf_parser.add_argument('--hf-path', type=str, help='The path to the HuggingFace model, e.g. "facebook/opt-125m", required')
+    hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the HuggingFace model')
+    hf_parser.add_argument('--tokenizer-path', type=str, help='The path to the HuggingFace tokenizer, same as --hf-path if not specified')
+    hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the tokenizer')
+    hf_parser.add_argument('--peft-path', type=str, help='The path to the PEFT model')
+    hf_parser.add_argument('--peft-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the PEFT model')
+    hf_parser.add_argument('--generation-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the generation')
+    hf_parser.add_argument('--max-seq-len', type=int, help='The max sequence length for the HuggingFace model')
+    hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model')
+    hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model')
+    hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model')
+    hf_parser.add_argument('--num-gpus', type=int, default=1, help='The number of GPUs for **the HuggingFace model passed via cli**')
+    hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model')
+    hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model')
 def parse_custom_dataset_args(custom_dataset_parser):
@@ -225,7 +212,7 @@ def main():
     if args.work_dir is not None:
         cfg['work_dir'] = args.work_dir
     else:
-        cfg.setdefault('work_dir', './outputs/default/')
+        cfg.setdefault('work_dir', osp.join('outputs', 'default'))
     # cfg_time_str defaults to the current time
     cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
......
@@ -22,6 +22,9 @@ class winograndeDataset(BaseDataset):
             prompt = line['sentence']
             continue_prompt = prompt.split('_')[1]
             data_item = {
+                'prompt': prompt,
+                'only_option1': line['option1'],
+                'only_option2': line['option2'],
                 'opt1': prompt.replace('_', line['option1']),
                 'opt2': prompt.replace('_', line['option2']),
                 'answer': line['answer'],
@@ -48,6 +51,9 @@ class winograndeDataset_V2(BaseDataset):
             answer = line['answer']
             answer = ' AB'[int(answer)] if answer != '' else 'NULL'
             data_item = {
+                'prompt': prompt,
+                'only_option1': line['option1'],
+                'only_option2': line['option2'],
                 'opt1': prompt.replace('_', line['option1']),
                 'opt2': prompt.replace('_', line['option2']),
                 'answer': answer,
@@ -76,6 +82,9 @@ class winograndeDataset_V3(BaseDataset):
             answer = line['answer']
             answer = ' AB'[int(answer)] if answer != '' else 'NULL'
             data_item = {
+                'prompt': prompt,
+                'only_option1': line['option1'],
+                'only_option2': line['option2'],
                 'opt1': prompt.replace('_', line['option1']),
                 'opt2': prompt.replace('_', line['option2']),
                 'answer': answer,
......
@@ -3,26 +3,28 @@ from .ai360_api import AI360GPT  # noqa: F401
 from .alaya import AlayaLM  # noqa: F401
 from .baichuan_api import BaiChuan, BaiChuan3  # noqa: F401
 from .baidu_api import ERNIEBot  # noqa: F401
-from .base import BaseModel, LMTemplateParser  # noqa
-from .base_api import APITemplateParser, BaseAPIModel  # noqa
+from .base import BaseModel, LMTemplateParser  # noqa: F401
+from .base_api import APITemplateParser, BaseAPIModel  # noqa: F401
 from .bytedance_api import ByteDance  # noqa: F401
 from .claude_api import Claude  # noqa: F401
-from .gemini_api import Gemini, GeminiAllesAPIN  # noqa: F401, F403
-from .glm import GLM130B  # noqa: F401, F403
-from .huggingface import HuggingFace  # noqa: F401, F403
-from .huggingface import HuggingFaceCausalLM  # noqa: F401, F403
-from .huggingface import HuggingFaceChatGLM3  # noqa: F401, F403
+from .gemini_api import Gemini, GeminiAllesAPIN  # noqa: F401
+from .glm import GLM130B  # noqa: F401
+from .huggingface import HuggingFace  # noqa: F401
+from .huggingface import HuggingFaceCausalLM  # noqa: F401
+from .huggingface import HuggingFaceChatGLM3  # noqa: F401
+from .huggingface_above_v4_33 import HuggingFaceBaseModel  # noqa: F401
+from .huggingface_above_v4_33 import HuggingFacewithChatTemplate  # noqa: F401
 from .hunyuan_api import Hunyuan  # noqa: F401
-from .intern_model import InternLM  # noqa: F401, F403
+from .intern_model import InternLM  # noqa: F401
 from .krgpt_api import KrGPT  # noqa: F401
 from .lightllm_api import LightllmAPI  # noqa: F401
-from .llama2 import Llama2, Llama2Chat  # noqa: F401, F403
+from .llama2 import Llama2, Llama2Chat  # noqa: F401
 from .lmdeploy_pytorch import LmdeployPytorchModel  # noqa: F401
 from .lmdeploy_tis import LmdeployTisModel  # noqa: F401
 from .minimax_api import MiniMax  # noqa: F401
 from .mistral_api import Mistral  # noqa: F401
 from .mixtral import Mixtral  # noqa: F401
-from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401, F403
+from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401
 from .moonshot_api import MoonShot  # noqa: F401
 from .nanbeige_api import Nanbeige  # noqa: F401
 from .openai_api import OpenAI  # noqa: F401
......
# flake8: noqa
# yapf: disable
from typing import Dict, List, Optional, Union
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.models.base_api import APITemplateParser
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
def _get_stopping_criteria(stop_words, tokenizer, batch_size):
from transformers import (PreTrainedTokenizer, StoppingCriteria,
StoppingCriteriaList)
class MultiTokenEOSCriteria(StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(self, sequence: str, tokenizer: PreTrainedTokenizer, batch_size: int):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
criteria = []
for stop_word in stop_words:
c = MultiTokenEOSCriteria(stop_word, tokenizer, batch_size)
criteria.append(c)
criteria = StoppingCriteriaList(criteria)
return criteria
def _get_possible_max_seq_len(max_seq_len, path):
if max_seq_len is not None:
return max_seq_len
from transformers import AutoConfig
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
possible_keys = [
'max_position_embeddings',
'seq_length',
'model_max_length',
]
for k in possible_keys:
if hasattr(config, k):
return getattr(config, k)
raise ValueError('max_seq_len is not provided and cannot be inferred from the model config.')
def _convert_chat_messages(inputs):
outputs = []
for _input in inputs:
messages = []
if isinstance(_input, str):
messages.append({'role': 'HUMAN', 'prompt': _input})
else:
for item in _input:
role = {
'HUMAN': 'user',
'BOT': 'assistant',
'SYSTEM': 'system',
}[item['role']]
messages.append({'role': role, 'content': item['prompt']})
outputs.append(messages)
return outputs
def _format_with_fast_chat_template(inputs: List[str], name: str='vicuna'):
try:
from fastchat.model import get_conversation_template
except ImportError:
raise ModuleNotFoundError('fastchat not found. Please install with\npip install "fschat[model_worker,webui]"')
outputs = []
for _input in inputs:
template = get_conversation_template(name)
for item in _input:
if item['role'] == 'user':
template.append_message(template.roles[0], item['content'])
elif item['role'] == 'assistant':
template.append_message(template.roles[1], item['content'])
elif item['role'] == 'system':
continue
else:
raise ValueError(f'Unknown role {item["role"]}')
template.append_message(template.roles[1], None)
outputs.append(template.get_prompt())
return outputs
def _get_meta_template(meta_template):
default_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
return APITemplateParser(meta_template or default_meta_template)
def _set_model_kwargs_torch_dtype(model_kwargs):
import torch
if 'torch_dtype' not in model_kwargs:
torch_dtype = torch.float16
else:
torch_dtype = {
'torch.float16': torch.float16,
'torch.bfloat16': torch.bfloat16,
'torch.float': torch.float,
'auto': 'auto',
'None': None,
}.get(model_kwargs['torch_dtype'])
if torch_dtype is not None:
model_kwargs['torch_dtype'] = torch_dtype
return model_kwargs
@MODELS.register_module()
class HuggingFacewithChatTemplate(BaseModel):
def __init__(self,
path: str,
model_kwargs: dict = dict(),
tokenizer_path: Optional[str] = None,
tokenizer_kwargs: dict = dict(),
peft_path: Optional[str] = None,
peft_kwargs: dict = dict(),
tokenizer_only: bool = False,
generation_kwargs: dict = dict(),
max_seq_len: Optional[int] = None,
meta_template: Optional[Dict] = None,
pad_token_id: Optional[int] = None,
fastchat_template: Optional[str] = None,
stop_words: Optional[str] = [],
**other_kwargs):
self.logger = get_logger()
self.path = path
self.tokenizer_only = tokenizer_only
self.template_parser = _get_meta_template(meta_template)
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
self._load_tokenizer(tokenizer_path or path, tokenizer_kwargs, pad_token_id)
if not tokenizer_only:
self._load_model(path=path, kwargs=model_kwargs, peft_path=peft_path, peft_kwargs=peft_kwargs)
self.generation_kwargs = generation_kwargs
self.fastchat_template = fastchat_template
self.stop_words = stop_words
for k, v in other_kwargs.items():
if v is not None:
self.logger.warning(f'Unused argument {k}={v}')
def _load_tokenizer(self, path: Optional[str], kwargs: dict, pad_token_id: Optional[int] = None):
from transformers import AutoTokenizer, GenerationConfig
DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', use_fast=False, trust_remote_code=True)
tokenizer_kwargs = DEFAULT_TOKENIZER_KWARGS
tokenizer_kwargs.update(kwargs)
self.tokenizer = AutoTokenizer.from_pretrained(path, **tokenizer_kwargs)
# A patch for some models without pad_token_id
if pad_token_id is not None:
if self.tokenizer.pad_token_id is None:
self.logger.debug(f'Using {pad_token_id} as pad_token_id')
elif self.tokenizer.pad_token_id != pad_token_id:
self.logger.warning(f'pad_token_id is not consistent. Using {pad_token_id} as pad_token_id')
self.tokenizer.pad_token_id = pad_token_id
return
if self.tokenizer.pad_token_id is not None:
return
self.logger.warning('pad_token_id is not set for the tokenizer.')
generation_config = GenerationConfig.from_pretrained(path)
if generation_config.pad_token_id is not None:
self.logger.warning(f'Using {generation_config.pad_token_id} as pad_token_id.')
self.tokenizer.pad_token_id = generation_config.pad_token_id
return
if self.tokenizer.eos_token_id is not None:
self.logger.warning(f'Using eos_token_id {self.tokenizer.eos_token_id} as pad_token_id.')
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
return
raise ValueError('pad_token_id is not set for this tokenizer. Please set `pad_token_id={PAD_TOKEN_ID}` in model_cfg.')
def _load_model(self, path: str, kwargs: dict, peft_path: Optional[str] = None, peft_kwargs: dict = dict()):
from transformers import AutoModel, AutoModelForCausalLM
DEFAULT_MODEL_KWARGS = dict(device_map='auto', trust_remote_code=True)
model_kwargs = DEFAULT_MODEL_KWARGS
model_kwargs.update(kwargs)
model_kwargs = _set_model_kwargs_torch_dtype(model_kwargs)
try:
self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
except ValueError:
self.model = AutoModel.from_pretrained(path, **model_kwargs)
if peft_path is not None:
from peft import PeftModel
peft_kwargs['is_trainable'] = False
self.model = PeftModel.from_pretrained(self.model, peft_path, **peft_kwargs)
self.model.eval()
self.model.generation_config.do_sample = False
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
messages = _convert_chat_messages(inputs)
batch_size = len(messages)
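        # step-1: format the inputs as chat messages and tokenize them as a batch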
tokenize_kwargs = dict(
return_tensors='pt',
padding=True,
truncation=True,
add_special_tokens=True,
max_length=self.max_seq_len
)
if self.fastchat_template:
messages = _format_with_fast_chat_template(messages, self.fastchat_template)
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
else:
messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
tokenize_kwargs['add_special_tokens'] = False
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
generation_kwargs = self.generation_kwargs.copy()
generation_kwargs.update(kwargs)
stopping_criteria = list(set(stopping_criteria + self.stop_words))
if stopping_criteria:
generation_kwargs['stopping_criteria'] = _get_stopping_criteria(stopping_criteria, self.tokenizer, batch_size)
if max_out_len is not None:
generation_kwargs['max_new_tokens'] = max_out_len
if min_out_len is not None:
generation_kwargs['min_new_tokens'] = min_out_len
generation_kwargs['pad_token_id'] = self.tokenizer.pad_token_id
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens, **generation_kwargs)
outputs = outputs[:, tokens['input_ids'].shape[1]:]
# step-3: decode the output
decodeds = self.tokenizer.batch_decode(outputs)
for stop in stopping_criteria:
decodeds = [t.split(stop)[0] for t in decodeds]
return decodeds
def get_token_len(self, prompt: str) -> int:
m = _convert_chat_messages([prompt])[0]
t = self.tokenizer.apply_chat_template(m, add_generation_prompt=True, return_dict=True)
return len(t['input_ids'])
def _convert_base_messages(inputs):
outputs = []
for _input in inputs:
if isinstance(_input, str):
outputs.append(_input)
else:
messages = []
for item in _input:
messages.append(item['prompt'])
outputs.append(''.join(messages))
return outputs
class HuggingFaceBaseModel(HuggingFacewithChatTemplate):
def __init__(self,
path: str,
model_kwargs: dict = dict(),
tokenizer_path: Optional[str] = None,
tokenizer_kwargs: dict = dict(),
peft_path: Optional[str] = None,
peft_kwargs: dict = dict(),
tokenizer_only: bool = False,
generation_kwargs: dict = dict(),
max_seq_len: Optional[int] = None,
pad_token_id: Optional[int] = None,
stop_words: Optional[str] = [],
**other_kwargs):
self.logger = get_logger()
self.path = path
self.tokenizer_only = tokenizer_only
self.template_parser = LMTemplateParser()
self.max_seq_len = _get_possible_max_seq_len(max_seq_len, path)
self._load_tokenizer(tokenizer_path or path, tokenizer_kwargs, pad_token_id)
if not tokenizer_only:
self._load_model(path=path, kwargs=model_kwargs, peft_path=peft_path, peft_kwargs=peft_kwargs)
self.generation_kwargs = generation_kwargs
self.stop_words = stop_words
for k, v in other_kwargs.items():
if v is not None:
self.logger.warning(f'Unused argument {k}={v}')
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
messages = _convert_base_messages(inputs)
batch_size = len(messages)
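        # step-1: tokenize the plain-text inputs as a batch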
tokenize_kwargs = dict(
return_tensors='pt',
padding=True,
truncation=True,
add_special_tokens=True,
max_length=self.max_seq_len
)
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
generation_kwargs = self.generation_kwargs.copy()
generation_kwargs.update(kwargs)
stopping_criteria = list(set(stopping_criteria + self.stop_words))
if stopping_criteria:
generation_kwargs['stopping_criteria'] = _get_stopping_criteria(stopping_criteria, self.tokenizer, batch_size)
if max_out_len is not None:
generation_kwargs['max_new_tokens'] = max_out_len
if min_out_len is not None:
generation_kwargs['min_new_tokens'] = min_out_len
generation_kwargs['pad_token_id'] = self.tokenizer.pad_token_id
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens, **generation_kwargs)
outputs = outputs[:, tokens['input_ids'].shape[1]:]
# step-3: decode the output
decodeds = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
for stop in stopping_criteria:
decodeds = [token.split(stop)[0] for token in decodeds]
return decodeds
def get_ppl(self, inputs: List[str], mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInfernecer is
not needed.
Returns:
List[float]: A list of perplexity scores.
"""
assert self.tokenizer.pad_token
import torch
import torch.nn.functional as F
pad_token_id = self.tokenizer.pad_token_id
messages = _convert_base_messages(inputs)
tokenize_kwargs = dict(
return_tensors='pt',
padding=True,
truncation=True,
add_special_tokens=True,
max_length=self.max_seq_len
)
tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)
tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
outputs = self.model(**tokens)[0]
batch_size, seq_len, vocab_size = outputs.shape
shift_logits = outputs[:, :-1, :].contiguous().float()
shift_labels = tokens['input_ids'][:, 1:].contiguous()
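        # per-token next-token cross-entropy; padding positions are excluded via ignore_index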
loss = F.cross_entropy(
shift_logits.view(-1, vocab_size),
shift_labels.view(-1),
ignore_index=pad_token_id,
reduction='none').view(batch_size, seq_len - 1)
lens = (tokens['input_ids'] != pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
import numpy as np
mask = torch.zeros_like(shift_labels) # [batch,seqlen]
for i in range(len(mask)):
for j in range(mask_length[i] - 1, len(mask[i])):
mask[i][j] = 1
loss = loss * mask
lens -= np.array(mask_length)
ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_loglikelihood(self, inputs: List[str], conts: List[str]) -> List[float]:
mask_length = [self.get_token_len(c, add_special_tokens=False) for c in conts]
return - self.get_ppl(inputs, mask_length)
def get_token_len(self, prompt: str, add_special_tokens: bool=True) -> int:
m = _convert_base_messages([prompt])[0]
t = self.tokenizer(m, add_special_tokens=add_special_tokens)
return len(t['input_ids'])
@@ -37,9 +37,6 @@ class TurboMindModel(BaseModel):
             arguments like session_len, max_batch_size for TurboMind.
         gen_config (Dict, optional): Generation config to set
             arguments like top_k, top_p, temperature.
-        end_str (str, optional): Whether to trim generated strings with end_str
-            if the model has special ending strings that are not handled well.
-            Defaults to None.
     """
     def __init__(self,
@@ -47,9 +44,8 @@ class TurboMindModel(BaseModel):
                  concurrency: int = 8,
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,
-                 engine_config: Optional[Dict] = None,
-                 gen_config: Optional[Dict] = None,
-                 end_str: Optional[str] = None):
+                 engine_config: Dict = {},
+                 gen_config: Dict = {}):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
@@ -70,12 +66,14 @@ class TurboMindModel(BaseModel):
         ]
         self.generator_ids = [i + 1 for i in range(concurrency)]
         self.gen_config = gen_config
-        self.end_str = end_str
         self.major_version, self.minor_version, _ = version_info
     def generate(self,
                  inputs: List[str],
                  max_out_len: int = 512,
+                 stopping_criteria: List[str] = [],
+                 do_sample: Optional[bool] = None,
+                 temperature: int = 1,
                  **kwargs) -> List[str]:
         """Generate results given a list of inputs.
@@ -96,13 +94,21 @@ class TurboMindModel(BaseModel):
         ]
         gen_config = copy.deepcopy(self.gen_config)
-        if 'do_sample' in kwargs:
-            if kwargs['do_sample']:
-                gen_config.top_k = 1000
-                gen_config.temperature = kwargs.get('temperature', 1)
+        if do_sample is not None:
+            if do_sample:
+                gen_config['top_k'] = 1000
+                gen_config['temperature'] = temperature
             else:
-                gen_config.top_k = 1
-                gen_config.temperature = 0.01
+                gen_config['top_k'] = 1
+        if stopping_criteria:
+            stop_words = gen_config.get('stop_words', [])
+            for t in stopping_criteria:
+                t = self.tokenizer.encode(t, add_bos=False)
+                stop_words.append(t[0])
+            gen_config['stop_words'] = list(set(stop_words))
+        from lmdeploy.messages import EngineGenerationConfig
+        gen_config = EngineGenerationConfig(**gen_config)
         results = []
         for batch_input in batch_inputs:
@@ -115,7 +121,6 @@ class TurboMindModel(BaseModel):
                         batch_input,
                         [max_out_len] * len(batch_input),
                         [gen_config] * len(batch_input),
-                        [self.end_str] * len(batch_input),
                     ))
             results += _results
         return results
@@ -136,8 +141,7 @@ class TurboMindModel(BaseModel):
                  session_id,
                  prompt: PromptType,
                  max_out_len: int,
-                 gen_config=None,
-                 end_str: Optional[str] = None) -> str:
+                 gen_config=None) -> str:
         """Generate results given a list of inputs.
         Args:
@@ -147,10 +151,6 @@ class TurboMindModel(BaseModel):
             max_out_len (int): The maximum length of the output.
             gen_config (EngineGenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
-            end_str (str, optional): Whether to trim generated strings
-                with end_str if the model has special ending strings
-                that are not handled well.
-                Defaults to None.
         Returns:
             str: The generated string.
         """
@@ -173,9 +173,6 @@ class TurboMindModel(BaseModel):
         _, output_ids, _ = outputs
         response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
-        # used to trim
-        if end_str:
-            response = response.split(end_str)[0]
         return response
     def get_ppl(self,
......
@@ -25,7 +25,7 @@ class VLLM(BaseModel):
                  meta_template: Optional[Dict] = None,
                  mode: str = 'none',
                  use_fastchat_template: bool = False,
-                 end_str: Optional[str] = None,
+                 stop_words: List[str] = [],
     ):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
@@ -42,7 +42,7 @@ class VLLM(BaseModel):
         assert mode in ['none', 'mid']
         self.mode = mode
         self.use_fastchat_template = use_fastchat_template
-        self.end_str = end_str
+        self.stop_words = stop_words
     def _load_model(self,
                     path: str,
@@ -59,7 +59,10 @@ class VLLM(BaseModel):
             ray.shutdown()
         self.model = LLM(path, **model_kwargs)
-    def generate(self, inputs: List[str], max_out_len: int,
+    def generate(self,
+                 inputs: List[str],
+                 max_out_len: int,
+                 stopping_criteria: List[str] = [],
                  **kwargs) -> List[str]:
         """Generate results given a list of inputs.
@@ -90,6 +93,8 @@ class VLLM(BaseModel):
         generation_kwargs = kwargs.copy()
         generation_kwargs.update(self.generation_kwargs)
         generation_kwargs.update({'max_tokens': max_out_len})
+        _stop = list(set(self.stop_words + stopping_criteria))
+        generation_kwargs.update({'stop': _stop})
         sampling_kwargs = SamplingParams(**generation_kwargs)
         outputs = self.model.generate(inputs, sampling_kwargs)
@@ -97,9 +102,6 @@ class VLLM(BaseModel):
         for output in outputs:
             prompt = output.prompt
             generated_text = output.outputs[0].text
-            if self.end_str:
-                generated_text = generated_text.split(self.end_str)[0]
             prompt_list.append(prompt)
             output_strs.append(generated_text)
......
"""PPL Inferencer.""" # flake8: noqa
# yapf: disable
"""LogLikelihood(LL) Inferencer."""
import os import os
from typing import List, Optional from typing import List, Optional
...@@ -76,16 +78,13 @@ class LLInferencer(BaseInferencer): ...@@ -76,16 +78,13 @@ class LLInferencer(BaseInferencer):
# 3. Get labels of all the classes # 3. Get labels of all the classes
if self.labels is None: if self.labels is None:
labels = retriever.get_labels(ice_template=ice_template, labels = retriever.get_labels(ice_template=ice_template, prompt_template=prompt_template)
prompt_template=prompt_template)
else: else:
labels = self.labels labels = self.labels
# 4. Generate in-context examples for testing inputs # 4. Generate in-context examples for testing inputs
for idx in range(len(ice_idx_list)): for idx in range(len(ice_idx_list)):
ice.append( ice.append(retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template))
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template))
output_handler.save_ice(self.model.parse_template(ice, mode='ppl')) output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
# 5. Calculating loglikelihood for prompts in each label's class # 5. Calculating loglikelihood for prompts in each label's class
...@@ -99,58 +98,41 @@ class LLInferencer(BaseInferencer): ...@@ -99,58 +98,41 @@ class LLInferencer(BaseInferencer):
# 5.1 Generate prompts of current label and truncate # 5.1 Generate prompts of current label and truncate
# TODO: Refactor # TODO: Refactor
for idx in range(len(ice_idx_list)): for idx in range(len(ice_idx_list)):
prompt = retriever.generate_label_prompt( prompt_kwargs = {
idx, 'idx': idx,
ice[idx], 'ice': ice[idx],
label, 'label': label,
ice_template=ice_template, 'ice_template': ice_template,
prompt_template=prompt_template) 'prompt_template': prompt_template,
}
prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 if self.max_seq_len is not None:
-                    prompt_token_num = self.model.get_token_len_from_template(
-                        prompt, mode='ppl')
-                    while len(ice_idx_list[idx]
-                              ) > 0 and prompt_token_num > self.max_seq_len:
+                    while len(ice_idx_list[idx]) > 0 and prompt_token_num > self.max_seq_len:
                         ice_idx_list[idx] = ice_idx_list[idx][:-1]
-                        ice[idx] = retriever.generate_ice(
-                            ice_idx_list[idx], ice_template=ice_template)
-                        prompt = retriever.generate_label_prompt(
-                            idx,
-                            ice[idx],
-                            label,
-                            ice_template=ice_template,
-                            prompt_template=prompt_template)
-                        prompt_token_num = self.model.get_token_len_from_template(  # noqa
-                            prompt, mode='ppl')  # noqa
+                        ice[idx] = retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template)
+                        prompt_kwargs['ice'] = ice[idx]
+                        prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                        prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 prompt_list.append(prompt)
                 token_num_list.append(prompt_token_num)
                 cont_list.append(retriever.test_ds[idx]['cont'])
             # 5.2 Get loglikelihood
-            logger.info(
-                f"Calculating Loglikelihood for prompts labeled '{label}'"
-            )  # noqa
-            for idx in trange(0,
-                              len(prompt_list),
-                              self.batch_size,
-                              disable=not self.is_main_process):
+            logger.info(f"Calculating Loglikelihood for prompts labeled '{label}'")
+            for idx in trange(0, len(prompt_list), self.batch_size, disable=not self.is_main_process):
                 sub_prompt_list = prompt_list[idx:idx + self.batch_size]
                 sub_cont_list = cont_list[idx:idx + self.batch_size]
                 with torch.no_grad():
                     # mainly modify compared to PPLInferencer
-                    sub_inputs = self.model.parse_template(sub_prompt_list,
-                                                           mode='ppl')
-                    sub_res = self.model.get_loglikelihood(
-                        sub_inputs, sub_cont_list).tolist()
-                for res, prompt in zip(
-                        sub_res,
-                        self.model.parse_template(sub_prompt_list,
-                                                  mode='ppl')):
+                    sub_inputs = self.model.parse_template(sub_prompt_list, mode='ppl')
+                    sub_res = self.model.get_loglikelihood(sub_inputs, sub_cont_list).tolist()
+                for res, prompt in zip(sub_res, self.model.parse_template(sub_prompt_list, mode='ppl')):
                     sub_ppl_list.append(res)
                     ice_str = self.model.parse_template(ice[idx], mode='ppl')
-                    output_handler.save_prompt_and_loglikelihood(
-                        label, prompt.replace(ice_str, ''), prompt, res, index)
+                    output_handler.save_prompt_and_loglikelihood(label, prompt.replace(ice_str, ''), prompt, res, index)
                     index = index + 1
             ppl.append(sub_ppl_list)
@@ -169,13 +151,9 @@ class LLInferencer(BaseInferencer):
         # 8. Output
         if self.is_main_process:
             os.makedirs(output_json_filepath, exist_ok=True)
-            output_handler.write_to_json(output_json_filepath,
-                                         output_json_filename)
+            output_handler.write_to_json(output_json_filepath, output_json_filename)
-        return [
-            sample['prediction']
-            for sample in output_handler.results_dict.values()
-        ]
+        return [sample['prediction'] for sample in output_handler.results_dict.values()]
 class LLInferencerOutputHandler:
...
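For context, the loglikelihood pass above slices the assembled prompts into batches and scores each continuation against its prompt under torch.no_grad(). A minimal standalone sketch of that batching pattern; score_batch is a hypothetical callable standing in for self.model.get_loglikelihood and is not an OpenCompass API:

import torch

def score_in_batches(prompts, continuations, score_batch, batch_size=8):
    # Toy version of the "5.2 Get loglikelihood" loop: batch the prompts,
    # score each (prompt, continuation) pair, collect one float per sample.
    results = []
    for start in range(0, len(prompts), batch_size):
        sub_prompts = prompts[start:start + batch_size]
        sub_conts = continuations[start:start + batch_size]
        with torch.no_grad():  # inference only, no gradients needed
            scores = score_batch(sub_prompts, sub_conts)
        results.extend(float(s) for s in scores)
    return results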
+# flake8: noqa
+# yapf: disable
 """PPL Inferencer."""
 import os
@@ -84,9 +86,7 @@ class PPLInferencer(BaseInferencer):
         # 4. Generate in-context examples for testing inputs
         for idx in range(len(ice_idx_list)):
-            ice.append(
-                retriever.generate_ice(ice_idx_list[idx],
-                                       ice_template=ice_template))
+            ice.append(retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template))
         output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
         # 5. Calculating PPL for prompts in each label's class
@@ -101,33 +101,26 @@ class PPLInferencer(BaseInferencer):
             # 5.1 Generate prompts of current label and truncate
             # TODO: Refactor
             for idx in range(len(ice_idx_list)):
-                prompt = retriever.generate_label_prompt(
-                    idx,
-                    ice[idx],
-                    label,
-                    ice_template=ice_template,
-                    prompt_template=prompt_template,
-                    remain_sep=normalizing_str is not None)
-                prompt_token_num = self.model.get_token_len_from_template(
-                    prompt, mode='ppl')
+                prompt_kwargs = {
+                    'idx': idx,
+                    'ice': ice[idx],
+                    'label': label,
+                    'ice_template': ice_template,
+                    'prompt_template': prompt_template,
+                    'remain_sep': normalizing_str is not None
+                }
+                prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 if self.max_seq_len is not None:
-                    while len(ice_idx_list[idx]
-                              ) > 0 and prompt_token_num > self.max_seq_len:
+                    while len(ice_idx_list[idx]) > 0 and prompt_token_num > self.max_seq_len:
                         ice_idx_list[idx] = ice_idx_list[idx][:-1]
-                        ice[idx] = retriever.generate_ice(
-                            ice_idx_list[idx], ice_template=ice_template)
-                        prompt = retriever.generate_label_prompt(
-                            idx,
-                            ice[idx],
-                            label,
-                            ice_template=ice_template,
-                            prompt_template=prompt_template)
-                        prompt_token_num = self.model.get_token_len_from_template(  # noqa
-                            prompt, mode='ppl')  # noqa
+                        ice[idx] = retriever.generate_ice(ice_idx_list[idx], ice_template=ice_template)
+                        prompt_kwargs['ice'] = ice[idx]
+                        prompt = retriever.generate_label_prompt(**prompt_kwargs)
+                        prompt_token_num = self.model.get_token_len_from_template(prompt, mode='ppl')
                 if normalizing_str is not None:
-                    assert isinstance(prompt, str), \
-                        'Prompt must be a string when normalizing_str is set.'
+                    assert isinstance(prompt, str), 'Prompt must be a string when normalizing_str is set.'
                     prompt_sep = prompt
                     if prompt_template is not None:
                         sep_token = prompt_template.sep_token
@@ -140,10 +133,9 @@ class PPLInferencer(BaseInferencer):
                     prompt = context + answer
                     normalizing_prompt = normalizing_str + answer
-                    context_length_list.append(
-                        self.model.get_token_len_from_template(context,
-                                                               mode='ppl'))
+                    context_length_list.append(self.model.get_token_len_from_template(context, mode='ppl'))
                     normalizing_prompt_list.append(normalizing_prompt)
                 prompt_list.append(prompt)
                 token_num_list.append(prompt_token_num)
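Both inferencers now share the same truncation pattern seen in the hunks above: trailing in-context examples are dropped one at a time until the templated prompt fits max_seq_len. A minimal sketch of that loop; build_prompt and count_tokens are hypothetical callables standing in for retriever.generate_label_prompt and self.model.get_token_len_from_template:

def truncate_to_fit(ice_idx, build_prompt, count_tokens, max_seq_len):
    # Rebuild the prompt with progressively fewer in-context examples
    # until it fits the model's sequence-length budget (or none remain).
    prompt = build_prompt(ice_idx)
    num_tokens = count_tokens(prompt)
    while ice_idx and num_tokens > max_seq_len:
        ice_idx = ice_idx[:-1]          # drop the last in-context example
        prompt = build_prompt(ice_idx)  # regenerate with the shorter ICE list
        num_tokens = count_tokens(prompt)
    return prompt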
@@ -153,45 +145,25 @@ class PPLInferencer(BaseInferencer):
             # 5.2 Get PPL
             logger.info(f"Calculating PPL for prompts labeled '{label}'")
-            for idx in trange(0,
-                              len(prompt_list),
-                              self.batch_size,
-                              disable=not self.is_main_process):
+            for idx in trange(0, len(prompt_list), self.batch_size, disable=not self.is_main_process):
                 sub_prompt_list = prompt_list[idx:idx + self.batch_size]
-                if normalizing_str is not None:
-                    sub_context_length_list = context_length_list[idx:idx +
-                                                                  self.
-                                                                  batch_size]
-                    sub_normalizing_prompt_list = normalizing_prompt_list[
-                        idx:idx + self.batch_size]
                 with torch.no_grad():
                     if normalizing_str is not None:
-                        res1 = self.model.get_ppl_from_template(
-                            sub_prompt_list,
-                            mask_length=sub_context_length_list)
-                        res2 = self.model.get_ppl_from_template(
-                            sub_normalizing_prompt_list,
-                            mask_length=[
-                                normalizing_str_len
-                                for i in range(len(sub_prompt_list))
-                            ])
+                        sub_context_length_list = context_length_list[idx:idx + self.batch_size]
+                        sub_normalizing_prompt_list = normalizing_prompt_list[idx:idx + self.batch_size]
+                        res1 = self.model.get_ppl_from_template(sub_prompt_list, mask_length=sub_context_length_list)
+                        sub_normalizing_context_length_list = [normalizing_str_len for _ in range(len(sub_prompt_list))]
+                        res2 = self.model.get_ppl_from_template(sub_normalizing_prompt_list, mask_length=sub_normalizing_context_length_list)
                         sub_res = res1 - res2
                     else:
-                        sub_res = self.model.get_ppl_from_template(
-                            sub_prompt_list).tolist()
-                for res, prompt in zip(
-                        sub_res,
-                        self.model.parse_template(sub_prompt_list,
-                                                  mode='ppl')):
+                        sub_res = self.model.get_ppl_from_template(sub_prompt_list).tolist()
+                for res, prompt in zip(sub_res, self.model.parse_template(sub_prompt_list, mode='ppl')):
                     sub_ppl_list.append(res)
                     ice_str = self.model.parse_template(ice[idx], mode='ppl')
-                    output_handler.save_prompt_and_ppl(
-                        label, prompt.replace(ice_str, ''), prompt, res, index)
-                    output_handler.results_dict[str(
-                        index)][f'label: {str(label)}'][
-                            'BPB'] = res * token_num_list[index] / len(
-                                prompt.replace(ice_str, '').encode())
+                    prompt_wo_ice = prompt.replace(ice_str, '')
+                    output_handler.save_prompt_and_ppl(label, prompt_wo_ice, prompt, res, index)
+                    output_handler.results_dict[str(index)][f'label: {str(label)}']['BPB'] = res * token_num_list[index] / len(prompt_wo_ice.encode())
                     index = index + 1
             ppl.append(sub_ppl_list)
@@ -210,10 +182,6 @@ class PPLInferencer(BaseInferencer):
         # 8. Output
         if self.is_main_process:
            os.makedirs(output_json_filepath, exist_ok=True)
-            output_handler.write_to_json(output_json_filepath,
-                                         output_json_filename)
+            output_handler.write_to_json(output_json_filepath, output_json_filename)
-        return [
-            sample['prediction']
-            for sample in output_handler.results_dict.values()
-        ]
+        return [sample['prediction'] for sample in output_handler.results_dict.values()]
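The new 'BPB' field stored next to each PPL result is the per-token loss rescaled by the prompt's token count and divided by its UTF-8 byte length, i.e. loss per byte. A small illustration of the same arithmetic with made-up values (res is assumed to be a mean per-token loss, as returned by get_ppl_from_template in the diff above):

prompt_wo_ice = 'The capital of France is Paris.'
res = 2.1        # hypothetical mean per-token loss for this prompt
token_num = 9    # hypothetical token count of the full prompt
bpb = res * token_num / len(prompt_wo_ice.encode())  # total loss spread over bytes
print(f'BPB = {bpb:.4f}')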
@@ -60,14 +60,16 @@ class NumWorkerPartitioner(BasePartitioner):
             if osp.exists(filename):
                 continue
             dataset_size = self.get_size(dataset)
-            if dataset_size > self.min_task_size:
+            if self.num_worker <= 1:
+                chunks.append(dataset)
+            elif dataset_size <= self.min_task_size:
+                chunks.append(dataset)
+            else:
                 root, ext = osp.splitext(filename)
                 dataset_splits = self.split_dataset(dataset)
                 for i, dataset_split in enumerate(dataset_splits):
                     if not osp.exists(f'{root}_{i}{ext}'):
                         chunks.append(dataset_split)
-            else:
-                chunks.append(dataset)
         if self.strategy == 'heuristic':
             buckets = [[] for _ in range(self.num_worker)]
...
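The partitioner change above adds an early exit: with a single worker, or with a dataset no larger than min_task_size, the dataset is kept as one chunk; only larger datasets are split. A standalone sketch of that branching; split_evenly is a hypothetical helper, not the real self.split_dataset:

def make_chunks(dataset, dataset_size, num_worker, min_task_size, split_evenly):
    # Sketch of the new branching in NumWorkerPartitioner: keep the dataset
    # whole when splitting cannot help, otherwise split it across workers.
    chunks = []
    if num_worker <= 1:
        chunks.append(dataset)            # single worker: no point splitting
    elif dataset_size <= min_task_size:
        chunks.append(dataset)            # small dataset: keep as one task
    else:
        chunks.extend(split_evenly(dataset, num_worker))  # one chunk per worker
    return chunks

# toy usage
split_evenly = lambda ds, n: [ds[i::n] for i in range(n)]
print(make_chunks(list(range(10)), 10, 3, 4, split_evenly))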