Commit 9e768b59 authored by zhuwenwen
[
{
"category": "brainstorming",
"instruction": "Which are some popular fiction books that I should read?",
"input": "",
"output": "",
"target": "",
"id": 1
},
{
"category": "brainstorming",
"instruction": "How do I properly store fruits and vegetables to keep them fresh for longer?",
"input": "",
"output": "",
"target": "",
"id": 2
},
{
"category": "brainstorming",
"instruction": "How do you properly chop an onion without crying?",
"input": "",
"output": "",
"target": "",
"id": 3
},
{
"category": "brainstorming",
"instruction": "How to make an international transfer? Please provide 3 techniques.",
"input": "",
"output": "",
"target": "",
"id": 4
},
{
"category": "brainstorming",
"instruction": "Name five leadership qualities that you consider most important.",
"input": "",
"output": "",
"target": "",
"id": 5
},
{
"category": "chat",
"instruction": "Complete a dialogue based on the following character information. Alex: A novice writer who is struggling to find inspiration and develop his writing skills. Emma: A successful author with many published works, providing guidance and advice to Alex.",
"input": "Alex: Hi Emma, I have been writing for a while now but can't seem to make any progress. Can you give me any advice? Emma: Hi Alex, sure. What kind of writing are you doing? Alex: I'm trying to write a novel, but I just can't seem to find any inspiration. Emma: ",
"output": "",
"target": "",
"id": 6
},
{
"category": "chat",
"instruction": "Complete a dialogue based on the following character information. John: An experienced software engineer with a passion for coding. Karen: A recent college graduate who is interested in learning more about software development.",
"input": "Karen: Hi John, I noticed that you have a lot of experience in the software industry. Can you tell me what you think is the most important skill for a software engineer? John: ",
"output": "",
"target": "",
"id": 7
},
{
"category": "chat",
"instruction": "Complete a dialogue based on the following character information. Sarah is a new employee who is nervous about her first presentation; Tom is her boss who has given her coaching and preparation materials.",
"input": "Sarah: Tom, I'm feeling really nervous about my presentation tomorrow. Tom: I know how you feel, Sarah. However, I believe in you and your abilities. Just stick to the preparation materials that I have given you, and you'll do great. Sarah: Thank you, Tom. What if I forget something important during the presentation? Tom: ",
"output": "",
"target": "",
"id": 8
},
{
"category": "chat",
"instruction": "Complete a dialogue based on the following character information. Sarah: a young artist who is full of creative ideas and always eager to try new things. Jack: a seasoned artist who has achieved great success in the art world and is more traditional in his approach to art.",
"input": "Sarah: Hi Jack, I'm really excited to meet you. I'm a big fan of your work. Jack: Hi Sarah, nice to meet you too. So, what kind of art do you do? Sarah: I am passionate about abstract art, especially combining different materials and colors. I think it can really give people a new perspective on things. Jack: That's interesting, but I am more focused on realistic paintings. I believe the most important thing is to master the basic skills first. Sarah: ",
"output": "",
"target": "",
"id": 9
},
{
"category": "chat",
"instruction": "Complete a conversation based on the following persona information. Sarah is a college student who is interested in joining a volunteer organization. John is the leader of the volunteer organization and is eager to welcome new members.",
"input": "Sarah: Hi, I'm Sarah, and I'm interested in joining your volunteer organization. John: Hi Sarah, welcome! We're always looking for new members who are passionate about volunteering. What areas would you like to focus on? Sarah: I'm interested in community outreach and working with children. John: ",
"output": "",
"target": "",
"id": 10
},
{
"category": "generation",
"instruction": "Write an email based on the subject:",
"input": "Subject: \"Invitation to an Exclusive Webinar\"",
"output": "",
"target": "",
"id": 11
},
{
"category": "generation",
"instruction": "Write a set of guidelines for first-time pet owners on how to properly care for a new puppy.",
"input": "",
"output": "",
"target": "",
"id": 12
},
{
"category": "generation",
"instruction": "Can you help me write a persuasive speech on why we should recycle more and take better care of the environment?",
"input": "",
"output": "",
"target": "",
"id": 13
},
{
"category": "generation",
"instruction": "Write a pitch for a brand-new mobile app that helps people organize their daily tasks more efficiently.",
"input": "",
"output": "",
"target": "",
"id": 14
},
{
"category": "generation",
"instruction": "Write a social media post promoting an upcoming concert featuring a local band.",
"input": "",
"output": "",
"target": "",
"id": 15
},
{
"category": "open_qa",
"instruction": "Describe the significance of the Renaissance period in European history.",
"input": "",
"output": "",
"target": "",
"id": 16
},
{
"category": "open_qa",
"instruction": "What is the term for the surgical removal of the appendix?",
"input": "",
"output": "",
"target": "",
"id": 17
},
{
"category": "open_qa",
"instruction": "Explain the process of osmosis in biological systems.",
"input": "",
"output": "",
"target": "",
"id": 18
},
{
"category": "open_qa",
"instruction": "Who were the members of the Beatles band?",
"input": "",
"output": "",
"target": "",
"id": 19
},
{
"category": "open_qa",
"instruction": "Who painted the The Scream?",
"input": "",
"output": "",
"target": "",
"id": 20
},
{
"category": "roleplay",
"instruction": "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is pwd",
"input": "",
"output": "",
"target": "",
"id": 21
},
{
"category": "roleplay",
"instruction": "I want you to act as a travel guide. I will write you my location and you will suggest a place to visit near my location. In some cases, I will also give you the type of places I will visit. You will also suggest me places of similar type that are close to my first location. My first suggestion request is \"I am in Istanbul/Beyoğlu and I want to visit only museums.\"",
"input": "",
"output": "",
"target": "",
"id": 22
},
{
"category": "roleplay",
"instruction": "I want you to act as an advertiser. You will create a campaign to promote a product or service of your choice. You will choose a target audience, develop key messages and slogans, select the media channels for promotion, and decide on any additional activities needed to reach your goals. My first suggestion request is \"I need help creating an advertising campaign for a new type of energy drink targeting young adults aged 18-30.\"",
"input": "",
"output": "",
"target": "",
"id": 23
},
{
"category": "roleplay",
"instruction": "I want you to act as a storyteller. You will come up with entertaining stories that are engaging, imaginative and captivating for the audience. It can be fairy tales, educational stories or any other type of stories which has the potential to capture people's attention and imagination. Depending on the target audience, you may choose specific themes or topics for your storytelling session e.g., if it’s children then you can talk about animals; If it’s adults then history-based tales might engage them better etc. My first request is \"I need an interesting story on perseverance.\"",
"input": "",
"output": "",
"target": "",
"id": 24
},
{
"category": "roleplay",
"instruction": "I want you to act as a rapper. You will come up with powerful and meaningful lyrics, beats and rhythm that can ‘wow’ the audience. Your lyrics should have an intriguing meaning and message which people can relate too. When it comes to choosing your beat, make sure it is catchy yet relevant to your words, so that when combined they make an explosion of sound everytime! My first request is \"I need a rap song about finding strength within yourself.\"",
"input": "",
"output": "",
"target": "",
"id": 25
}
]
{
"id": 1,
"system_prompt": "你是一个检查回答质量的好助手。",
"prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
"prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。"
}
{
"id": 1,
"system_prompt": "You are a helpful and precise assistant for checking the quality of the answer. You will be given two different answers to the same question",
"prompt_template": "[Question]\n{question}\n\n[The Start of AI Assistant 1's Answer]\n{answer_1}\n\n[The End of AI Assistant 1's Answer]\n\n[The Start of AI Assistant 2's Answer]\n{answer_2}\n\n[The End of AI Assistant 2's Answer]\n\n[Requirements]\n{prompt}\n\n",
"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."
}
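The two battle-prompt configurations above (Chinese and English) share the same structure. As a minimal sketch, not code from this repository, this is how such a prompt is typically assembled: the placeholders in "prompt_template" are filled with the user question, the two model answers, and the rating instructions from "prompt", and the judge model receives the result together with "system_prompt". The helper name below is hypothetical.

def build_battle_prompt(battle_config: dict, question: str, answer_1: str, answer_2: str) -> str:
    # Fill {question}, {answer_1}, {answer_2} and {prompt} in the template above;
    # battle_config["system_prompt"] is sent separately as the system message.
    return battle_config["prompt_template"].format(
        question=question,
        answer_1=answer_1,
        answer_2=answer_2,
        prompt=battle_config["prompt"],
    )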
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"creativity": "创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。",
"practicality": "实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。",
"reasonableness": "合理性(1-5):答案应该符合常识、生活实际等等。"
},
"CoT": {
"language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"creativity": "1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。\n2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。\n3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。\n4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。\n\n创意性:",
"practicality": "1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。\n2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。\n3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。\n4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。\n\n实用性:",
"reasonableness": "1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。\n2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则合理性评分可能会受到影响。\n3. 考虑答案中所提供的信息是否合理、符合常识、生活实际等等。如果答案中存在明显的不合理之处,则合理性评分可能会受到影响。\n4. 根据答案的合理性,给出一个1到5的评分。如果答案存在明显的不合理之处,则应给出一个较低的评分。如果答案合理、符合常识、生活实际等等,则应给出一个较高的评分。\n\n合理性:"
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
"chat": {
"id": 2,
"category": "chat",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"naturalness": "自然(1-5):答案是否自然,并且符合问题给定的身份。",
"engagingness": "参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。",
"reasonableness": "合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。",
"fidelity": "保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。"
},
"CoT": {
"language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"naturalness": "1. 阅读题目,确定题目提供的身份信息。\n2. 检查答案内容是否符合题目给定的身份。\n3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。\n\n自然:",
"engagingness": "1. 阅读题目,确定对话的语境和背景。\n2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。\n3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。\n\n参与感:",
"reasonableness": "1. 阅读题目,确定对话的主题以及问题期望的回答方向。\n2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。\n3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。\n\n合理性:",
"fidelity": "1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。\n阅读题目的请求,确认回答请求时需要注意的细节。\n3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。\n4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。\n\n保真度:"
},
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
"generation": {
"id": 3,
"category": "generation",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"diversity": "多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。"
},
"CoT": {
"language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"diversity": "1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。\n2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。\n3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。\n4. 检查回答的合理性和适度,看看回答是否夸张或离题。\n5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。\n\n多样性:"
},
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
"open_qa": {
"id": 4,
"category": "open_qa",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"correctness": "正确性(1-5):答案是否正确。"
},
"CoT": {
"language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:"
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
"roleplay": {
"id": 5,
"category": "roleplay",
"metrics": {
"language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。",
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"fidelity": "保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。",
"creativity": "创意性(1-5):角色扮演问题的回答需要具有一定创意,但同时需要遵守角色的设定。"
},
"CoT": {
"language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:",
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"fidelity": "1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。\n2. 阅读题目的请求,确认回答请求时需要注意的细节。\n3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。\n4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。\n\n保真度:",
"creativity": "1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。\n2. 评估回答是否具有独特的思路和建议,是否能够给提问者带来新的想法和启示。\n3. 对比回答中的创意和该角色的设定,评估回答是否遵守了该角色的设定和基本特征。\n4. 对回答的质量进行总体评估,并结合以上评估结果给出创意性的评分,范围从1到5分,其中1分表示回答缺乏创意,5分表示回答具有独特的思路和建议,并且能够遵守该角色的设定。\n\n创意性:"
},
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
"Other": {
"id": 6,
"category": "Other",
"metrics": {
"relevance": "切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。",
"correctness": "正确性(1-5):答案是否正确。"
},
"CoT": {
"relevance": "1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。\n2. 阅读答案,确认答案是否直接回答了题目所问的问题。\n3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。\n4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。\n\n切题:",
"correctness": "1. 仔细阅读题目,尝试自己回答该问题。\n2. 检查答案的准确性。您可以使用已知的事实或研究来验证答案是否正确。如果答案是正确的,则可以将正确性得分为5分。如果答案是部分正确的,则可以给予适当的得分,例如2分、3分或4分。如果答案完全不正确,则只得1分。\n\n正确性:"
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}
}
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"creativity": "Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas.",
"practicality": "Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions.",
"reasonableness": "Reasonableness (1-5): The answer should be in line with common sense, life experience, etc."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"creativity": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.\n3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.\n4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given.\n\nCreativity:",
"practicality": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.\n3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.\n4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given.\n\nPracticality:",
"reasonableness": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the reasonableness score may be affected.\n3. Consider whether the information provided in the answer is reasonable, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the reasonableness score may be affected.\n4. Give a score of 1 to 5 depending on the reasonableness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is reasonable, consistent with common sense, real life, etc.\n\nReasonableness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"chat": {
"id": 2,
"category": "chat",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"naturalness": "Naturalness (1-5): whether the answer is natural and fits the identity given by the question.",
"engagingness": "Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation.",
"reasonableness": "Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context.",
"fidelity": "Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"naturalness": "1. Read the question and determine the identity information provided in the question.\n2. Check whether the content of the answer matches the identity given in the question.\n3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question.\n\nNaturalness:",
"engagingness": "1. Read the questions to determine the context and background of the dialogue.\n2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.\n3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation.\n\nEngagingness:",
"reasonableness": "1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go.\n2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context.\n3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense.\n\nReasonableness:",
"fidelity": "1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.\n2. Read the question's request and confirm the details that need to be taken into account when answering the request.\n3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.\n4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request.\n\nFidelity:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"chat\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"generation": {
"id": 3,
"category": "generation",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"diversity": "Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"diversity": "1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response.\n2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid.\n3. Check the creativity and imagination of the response to see if the response is engaging to read on.\n4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic.\n5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic.\n\nDiversity:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"generation\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"open_qa": {
"id": 4,
"category": "open_qa",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the answers to the \"open qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"roleplay": {
"id": 5,
"category": "roleplay",
"metrics": {
"language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.",
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"fidelity": "Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting.",
"creativity": "Creativity (1-5): The answers to the role-play questions need to be somewhat creative, but at the same time they need to adhere to the setting of the role."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"fidelity": "1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.\n2. Read the question's request and confirm the details that need to be taken into account when answering the request.\n3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.\n4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request.\n\nFidelity:",
"creativity": "1. Read the question carefully to understand how the character is set up and represented in the question, including career, background, perspective, and personality.\n2. Evaluate whether the answer has unique ideas and suggestions that bring new ideas and insights to the questioner.\n3. Compare the creativity in the response to the setting of the persona and assess whether the response adheres to the setting and essential characteristics of the persona.\n4. Evaluate the quality of the responses in general and combine the results of the above assessment to give a creativity score ranging from 1 to 5, where a score of 1 indicates that the response lacks creativity and a score of 5 indicates that the response has unique ideas and suggestions and is able to adhere to the set-up of the persona.\n\nCreativity:"
},
"prompt": "You are a good assistant. Please rate the given answer to the \"role-play\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
},
"Other": {
"id": 6,
"category": "Other",
"metrics": {
"relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.",
"correctness": "Correctness (1-5): whether the answer is correct or not."
},
"CoT": {
"language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the language organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good language organization and 1 indicates very poor language organization.\n\nLanguage organization:",
"relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:",
"correctness": "1. Read the question carefully and try to answer the question by yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:"
},
"prompt": "You are a good assistant. Please rate the given answer to the question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}"
}
}
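Each category entry above pairs a 1-5 rubric ("metrics") with step-by-step scoring instructions ("CoT"), and the category's "prompt" stitches them together per metric. A minimal sketch of that composition, with a hypothetical helper name:

def build_metric_prompt(category_entry: dict, question: str, answer: str, metric: str) -> str:
    # One prompt is built per metric: {metric} carries the 1-5 rubric text and
    # {steps} carries the matching chain-of-thought scoring steps defined above.
    return category_entry["prompt"].format(
        question=question,
        answer=answer,
        metric=category_entry["metrics"][metric],
        steps=category_entry["CoT"][metric],
    )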
{
"model": [
{
"name": "model1"
},
{
"name": "model2"
}
],
"dataset": [
{
"name": "mmlu",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"perplexity",
"ppl_score",
"ppl_score_over_choices"
]
},
{
"name": "cmmlu",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"perplexity",
"ppl_score",
"ppl_score_over_choices"
]
},
{
"name": "agieval",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"multi_choice_accuracy",
"math_equivalence",
"perplexity",
"ppl_score_over_choices",
"ppl_score"
]
},
{
"name": "gaokaobench",
"metrics": [
"first_token_accuracy",
"single_choice_accuracy",
"multi_choice_accuracy",
"math_equivalence",
"rouge_score",
"rouge_zh_score",
"perplexity",
"ppl_score_over_choices",
"ppl_score"
]
}
]
}
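For every model/dataset pair in this config, the evaluation script below expects a results file produced by the inference step. A small illustration of the layout it reads, assuming "inference_results" as the folder passed via --inference_results_path:

import os

inference_results_path = "inference_results"  # assumed; set via --inference_results_path
for model_name in ["model1", "model2"]:
    for dataset_name in ["mmlu", "cmmlu", "agieval", "gaokaobench"]:
        # eval_dataset.py loads one <model>/<dataset>_inference_results.json per pair.
        print(os.path.join(inference_results_path, model_name, f"{dataset_name}_inference_results.json"))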
{
"model": [
{
"name": "model name",
"model_class": "HuggingFaceCausalLM",
"parameters": {
"path": "path to model",
"model_max_length": 4096,
"tokenizer_path": "",
"tokenizer_kwargs": {
"trust_remote_code": true
},
"peft_path": null,
"model_kwargs": {
"torch_dtype": "torch.float32",
"trust_remote_code": true
},
"prompt_template": "plain",
"batch_size": 4
}
},
{
"name": "model2 name",
"model_class": "HuggingFaceCausalLM",
"parameters": {
"path": "path to model2",
"model_max_length": 4096,
"tokenizer_path": "",
"tokenizer_kwargs": {
"trust_remote_code": true
},
"peft_path": null,
"model_kwargs": {
"torch_dtype": "torch.float32",
"trust_remote_code": true
},
"prompt_template": "plain",
"batch_size": 4
}
}
],
"dataset": [
{
"name": "agieval",
"dataset_class": "AGIEvalDataset",
"debug": false,
"few_shot": false,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/agieval.json)"
},
{
"name": "ceval",
"dataset_class": "CEvalDataset",
"debug": false,
"few_shot": true,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/ceval.json)"
},
{
"name": "cmmlu",
"dataset_class": "CMMLUDataset",
"debug": false,
"few_shot": true,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/cmmlu.json)"
},
{
"name": "gaokaobench",
"dataset_class": "GaoKaoBenchDataset",
"debug": false,
"few_shot": false,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/gaokaobench.json)"
},
{
"name": "mmlu",
"dataset_class": "MMLUDataset",
"debug": false,
"few_shot": true,
"path": "path to original dataset (folder)",
"save_path": "path to save converted dataset (e.g. inference_data/mmlu.json)"
}
]
}
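A rough sketch, using the standard Hugging Face transformers API rather than the repository's own loader, of how a "HuggingFaceCausalLM" entry like the ones above is typically consumed. Note that the config stores the dtype as the string "torch.float32"; the loader is presumably responsible for resolving it to the actual torch dtype, which the sketch does by hand. An empty "tokenizer_path" falls back to the model path.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

parameters = {
    "path": "path to model",  # placeholder, as in the config above
    "model_max_length": 4096,
    "tokenizer_path": "",
    "tokenizer_kwargs": {"trust_remote_code": True},
    "model_kwargs": {"torch_dtype": torch.float32, "trust_remote_code": True},
}

tokenizer = AutoTokenizer.from_pretrained(
    parameters["tokenizer_path"] or parameters["path"],
    model_max_length=parameters["model_max_length"],
    **parameters["tokenizer_kwargs"],
)
model = AutoModelForCausalLM.from_pretrained(parameters["path"], **parameters["model_kwargs"])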
import argparse
import os
import tabulate
from colossal_eval.evaluate.dataset_evaluator import DatasetEvaluator
from colossal_eval.utils import jdump, jload
def main(args):
config = jload(args.config)
evaluation_results = {dataset["name"]: {} for dataset in config["dataset"]}
evaluation_results_table = {dataset["name"]: {} for dataset in config["dataset"]}
evaluator = DatasetEvaluator()
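# Evaluate every dataset with every model and collect per-metric scores from the saved inference results.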
for dataset_parameter in config["dataset"]:
dataset_name = dataset_parameter["name"]
metrics = dataset_parameter["metrics"]
results_metric_model = {metric: {model["name"]: None for model in config["model"]} for metric in metrics}
for model in config["model"]:
model_name = model["name"]
data = jload(
os.path.join(args.inference_results_path, model_name, f"{dataset_name}_inference_results.json")
)
results = evaluator.get_evaluation_results(data, dataset_name, model_name, metrics)
for metric, score in results.items():
results_metric_model[metric][model_name] = score["ALL"]
evaluation_results[dataset_name][model_name] = results
evaluation_results_table[dataset_name] = results_metric_model
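# Build the summary table: one row per (dataset, metric) pair, one score column per model.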
table = []
header = ["dataset", "metric"] + [model["name"] for model in config["model"]]
table.append(header)
for dataset_parameter in config["dataset"]:
dataset_name = dataset_parameter["name"]
metrics = dataset_parameter["metrics"]
for metric, model_results in evaluation_results_table[dataset_name].items():
row = [dataset_name]
for model, score in model_results.items():
if len(row) == 1:
row.extend([metric, "{:.02f}".format(score)])
else:
row.append("{:.02f}".format(score))
table.append(row)
table = tabulate.tabulate(table, headers="firstrow")
print(table)
os.makedirs(args.evaluation_results_save_path, exist_ok=True)
with open(os.path.join(args.evaluation_results_save_path, "evaluation_results_table.txt"), "w") as file:
file.write(table)
jdump(evaluation_results, os.path.join(args.evaluation_results_save_path, "evaluation_results.json"))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalEval evaluation process.")
parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
parser.add_argument("--inference_results_path", type=str, default=None, help="path to inference results")
parser.add_argument(
"--evaluation_results_save_path", type=str, default=None, help="path to save evaluation results"
)
args = parser.parse_args()
main(args)
python eval_dataset.py \
--config "path to config file" \
--inference_results_path "path to inference results" \
--evaluation_results_save_path "path to save evaluation results"
import argparse
import copy
import os
from typing import Dict, List
import torch
import torch.distributed as dist
from colossal_eval import dataset, models, utils
import colossalai
from colossalai.logging import get_dist_logger
logger = get_dist_logger()
def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
"""
Merge the per-rank inference results into one file and remove the per-rank files.
Args:
world_size: Number of processes for inference.
save_path: The folder for storing inference results.
model_names: Names of models for inference.
dataset_names: Names of datasets (mapped to their categories) for inference.
"""
for model_name in model_names:
for dataset_name, categories in dataset_names.items():
all_answers = {}
for category in categories:
all_answers[category] = {"data": []}
answers = {"data": []}
for r in range(world_size):
directory = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
if not os.path.exists(directory):
raise Exception(
f"Directory {directory} not found. There may be an error during inference time."
)
else:
rank_answers = utils.jload(directory)
answers["data"].extend(rank_answers["data"])
answers["inference_kwargs"] = rank_answers["inference_kwargs"]
for r in range(world_size):
try:
directory = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
os.remove(directory)
except Exception as e:
print(e)
all_answers[category] = answers
logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
logger.info(f"Save inference results of model {model_name} for all dataset.")
logger.info(f"Save inference results of all models for all dataset.")
def main(args):
colossalai.launch_from_torch(config={}, seed=42)
world_size = dist.get_world_size()
rank = dist.get_rank()
inference_data = {}
debug_args = {}
few_shot_args = {}
config = utils.jload(args.config)
model_parameters = config["model"]
dataset_parameters = config["dataset"]
for dataset_parameter in dataset_parameters:
path = dataset_parameter["path"]
save_path = dataset_parameter["save_path"]
dataset_name = dataset_parameter["name"]
debug_args[dataset_name] = dataset_parameter["debug"]
few_shot_args[dataset_name] = dataset_parameter["few_shot"]
if not args.load_dataset:
if os.path.exists(save_path):
dataset_ = utils.jload(save_path)
inference_data[dataset_name] = dataset_["test"]
else:
raise Exception(
"Can't find the converted dataset. You may set load_dataset True to store the dataset first."
)
continue
dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
if not issubclass(dataset_class, dataset.BaseDataset):
raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])
dataset_.save(save_path)
inference_data[dataset_name] = dataset_.dataset["test"]
for model_parameter in model_parameters:
model_name = model_parameter["name"]
model_class = eval(f"models.{model_parameter['model_class']}")
parameters = model_parameter["parameters"]
parameters.update({"logger": logger})
parameters.update({"prompt_template": utils.prompt_templates[parameters["prompt_template"]]})
model_ = model_class(**parameters)
if not issubclass(model_class, models.BaseModel):
raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
for dataset_name, split_data in inference_data.items():
start = 0
for category, category_data in split_data.items():
if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
answers_to_dump = copy.deepcopy(category_data)
partition_size = len(category_data["data"]) // world_size
redundant = len(category_data["data"]) % world_size
# Ensure that the amount of data for inference is as consistent as possible across different processes.
lengths = [partition_size for _ in range(world_size)]
for j in range(redundant):
lengths[(j + start) % world_size] += 1
start = (start + redundant) % world_size
questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]
answers_per_rank = model_.inference(
questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
)
answers_to_dump["data"] = answers_per_rank
utils.jdump(
answers_to_dump,
os.path.join(
args.inference_save_path,
model_name,
f"{dataset_name}_{category}_inference_results_rank{rank}.json",
),
)
logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
del model_
torch.cuda.empty_cache()
dist.barrier()
if rank == 0:
model_names = [model_parameter["name"] for model_parameter in model_parameters]
dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalEval inference process.")
parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
parser.add_argument("--load_dataset", default=False, action="store_true")
parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
args = parser.parse_args()
main(args)
torchrun --nproc_per_node=1 inference.py \
--config "path to config file" \
--load_dataset \
--inference_save_path "path to save inference results"
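The partition logic in inference.py's main() above splits each category's data across ranks so that leftover samples are handed out one at a time, starting at a rank that rotates between categories. The same arithmetic, extracted into a standalone function for illustration (the function name is not from the repository):

def partition_lengths(num_samples: int, world_size: int, start: int):
    # Every rank gets num_samples // world_size items; the remainder is
    # distributed one by one from a rotating starting rank so that no rank
    # is systematically overloaded across categories.
    base, redundant = divmod(num_samples, world_size)
    lengths = [base] * world_size
    for j in range(redundant):
        lengths[(j + start) % world_size] += 1
    return lengths, (start + redundant) % world_size

# e.g. 10 samples over 4 ranks with start=0 -> ([3, 3, 2, 2], 2);
# the next category then hands its first extra sample to rank 2.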
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"reasonableness"
]
},
"chat": {
"GPT": [
"language organization",
"naturalness",
"engagingness",
"fidelity"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
]
}
}
}
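The "category" block above only selects which metric names the judge scores for each category; the rubric text and scoring steps behind those names come from the per-category prompt configuration shown earlier. eval.py (further below) flattens this block into a per-category lookup; a trimmed sketch of that step on an abbreviated config:

config = {
    "language": "en",
    "category": {
        "brainstorming": {"GPT": ["language organization", "relevance", "creativity"]},
    },
}

# Maps each category to its metric groups, e.g.
# {"brainstorming": {"GPT": ["language organization", "relevance", "creativity"]}}
metrics_per_category = {
    category: {metric_type: metrics for metric_type, metrics in metric_groups.items()}
    for category, metric_groups in config["category"].items()
}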
{
"model": [
{
"name": "model name",
"model_class": "HuggingFaceCausalLM",
"parameters": {
"path": "path to model",
"model_max_length": 4096,
"tokenizer_path": "",
"tokenizer_kwargs": {
"trust_remote_code": true
},
"peft_path": null,
"model_kwargs": {
"torch_dtype": "torch.float32",
"trust_remote_code": true
},
"prompt_template": "plain",
"batch_size": 4
}
}
],
"dataset": [
{
"name": "colossal",
"dataset_class": "ColossalDataset",
"debug": false,
"few_shot": false,
"path": "../../configs/gpt_evaluation/data/eval_en_examples.json",
"save_path": "path to save converted dataset (inference_data/colossal.json)"
}
]
}
import argparse
import os
import openai
from colossal_eval.evaluate.evaluator import Evaluator
from colossal_eval.utils import jload
def main(args):
assert len(args.answer_file_list) == len(
args.model_name_list
), "The number of answer files and model names should be equal!"
# load config
config = jload(args.config_file)
if config["language"] in ["cn", "en"]:
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
metrics_all = {}
for metric_type, metrics in config["category"][category].items():
metrics_all[metric_type] = metrics
metrics_per_category[category] = metrics_all
battle_prompt = None
if args.battle_prompt_file:
battle_prompt = jload(args.battle_prompt_file)
gpt_evaluation_prompt = None
if args.gpt_evaluation_prompt_file:
gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
if len(args.model_name_list) == 2 and not battle_prompt:
raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
raise Exception(
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
)
if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
raise Exception(
"GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
)
# initialize evaluator
evaluator = Evaluator(
metrics_per_category,
battle_prompt,
gpt_evaluation_prompt,
args.gpt_model,
config["language"],
args.gpt_with_reference,
)
if len(args.model_name_list) == 2:
answers_1 = jload(args.answer_file_list[0])
answers_2 = jload(args.answer_file_list[1])
answers1 = []
for category, value in answers_1.items():
answers1.extend(value["data"])
answers2 = []
for category, value in answers_2.items():
answers2.extend(value["data"])
assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
evaluator.battle(answers1=answers1, answers2=answers2)
evaluator.save(args.save_path, args.model_name_list)
elif len(args.model_name_list) == 1:
targets = jload(args.target_file)
answers = jload(args.answer_file_list[0])
references = []
for category, value in targets["test"].items():
references.extend(value["data"])
predictions = []
for category, value in answers.items():
predictions.extend(value["data"])
assert len(references) == len(
predictions
), "The number of target answers and model answers should be equal!"
evaluator.evaluate(
answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]
)
evaluator.save(args.save_path, args.model_name_list)
else:
raise ValueError("Unsupported number of answer files and model names!")
else:
raise ValueError(f'Unsupported language {config["language"]}!')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
parser.add_argument(
"--config_file", type=str, default=None, required=True, help="path to the file of target results"
)
parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
parser.add_argument(
"--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
)
parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
parser.add_argument(
"--answer_file_list",
type=str,
nargs="+",
default=[],
required=True,
help="path to the answer files of at most 2 models",
)
parser.add_argument(
"--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
)
parser.add_argument(
"--gpt_model",
default="gpt-3.5-turbo-16k",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
help="which GPT model to use for evaluation",
)
parser.add_argument(
"--gpt_with_reference",
default=False,
action="store_true",
help="whether to include reference answer in gpt evaluation",
)
parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
main(args)
python eval.py \
--config_file "path to the config file" \
--battle_prompt_file "path to the prompt file for battle" \
--gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
--target_file "path to the target answer file" \
--answer_file_list "path to the answer files of at most 2 models" \
--model_name_list "the names of at most 2 models" \
--save_path "path to save results" \
--openai_key "your openai key" \
import argparse
import copy
import os
from typing import Dict, List
import torch
import torch.distributed as dist
from colossal_eval import dataset, models, utils
import colossalai
from colossalai.logging import get_dist_logger
logger = get_dist_logger()
def rm_and_merge(world_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
"""
Remove inference result per rank and merge them into one file.
Args:
world_size: Number of processes for inference.
save_path: The folder for storing inference results.
model_names: Names of models for inference.
dataset_names: Names of datasets (and their categories) for inference.
"""
for model_name in model_names:
for dataset_name, categories in dataset_names.items():
all_answers = {}
for category in categories:
all_answers[category] = {"data": []}
answers = {"data": []}
for r in range(world_size):
file_path = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
if not os.path.exists(file_path):
raise Exception(
f"File {file_path} not found. An error may have occurred during inference."
)
else:
rank_answers = utils.jload(file_path)
answers["data"].extend(rank_answers["data"])
answers["inference_kwargs"] = rank_answers["inference_kwargs"]
for r in range(world_size):
try:
file_path = os.path.join(
save_path, model_name, f"{dataset_name}_{category}_inference_results_rank{r}.json"
)
os.remove(file_path)
except Exception as e:
print(e)
all_answers[category] = answers
logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
logger.info(f"Save inference results of model {model_name} for all dataset.")
logger.info(f"Save inference results of all models for all dataset.")
def main(args):
colossalai.launch_from_torch(config={}, seed=42)
world_size = dist.get_world_size()
rank = dist.get_rank()
inference_data = {}
debug_args = {}
few_shot_args = {}
config = utils.jload(args.config)
model_parameters = config["model"]
dataset_parameters = config["dataset"]
for dataset_parameter in dataset_parameters:
path = dataset_parameter["path"]
save_path = dataset_parameter["save_path"]
dataset_name = dataset_parameter["name"]
debug_args[dataset_name] = dataset_parameter["debug"]
few_shot_args[dataset_name] = dataset_parameter["few_shot"]
if not args.load_dataset:
if os.path.exists(save_path):
dataset_ = utils.jload(save_path)
inference_data[dataset_name] = dataset_["test"]
else:
raise Exception(
f"Can't find the converted dataset at {save_path}. Pass --load_dataset to convert and save it first."
)
continue
dataset_class = getattr(dataset, dataset_parameter["dataset_class"])
if not issubclass(dataset_class, dataset.BaseDataset):
raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
dataset_ = dataset_class(path, logger, dataset_parameter["few_shot"])
dataset_.save(save_path)
inference_data[dataset_name] = dataset_.dataset["test"]
for model_parameter in model_parameters:
model_name = model_parameter["name"]
model_class = getattr(models, model_parameter["model_class"])
if not issubclass(model_class, models.BaseModel):
raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
parameters = model_parameter["parameters"]
parameters.update({"logger": logger})
parameters.update({"prompt_template": utils.prompt_templates[parameters["prompt_template"]]})
model_ = model_class(**parameters)
for dataset_name, split_data in inference_data.items():
start = 0
for category, category_data in split_data.items():
if few_shot_args[dataset_name] and category_data["inference_kwargs"].get("few_shot_data", None) is None:
raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
answers_to_dump = copy.deepcopy(category_data)
partition_size = len(category_data["data"]) // world_size
redundant = len(category_data["data"]) % world_size
# Distribute the remainder round-robin across ranks; `start` rotates the offset between categories so no rank is consistently overloaded.
lengths = [partition_size for _ in range(world_size)]
for j in range(redundant):
lengths[(j + start) % world_size] += 1
start = (start + redundant) % world_size
questions = category_data["data"][sum(lengths[0:rank]) : sum(lengths[0:rank]) + lengths[rank]]
answers_per_rank = model_.inference(
questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
)
answers_to_dump["data"] = answers_per_rank
utils.jdump(
answers_to_dump,
os.path.join(
args.inference_save_path,
model_name,
f"{dataset_name}_{category}_inference_results_rank{rank}.json",
),
)
logger.info(f"Rank {rank} peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB")
del model_
torch.cuda.empty_cache()
dist.barrier()
if rank == 0:
model_names = [model_parameter["name"] for model_parameter in model_parameters]
dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
rm_and_merge(world_size, args.inference_save_path, model_names, dataset_names)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ColossalEval inference process.")
parser.add_argument("--config", type=str, default=None, required=True, help="path to config file")
parser.add_argument("--load_dataset", default=False, action="store_true")
parser.add_argument("--inference_save_path", type=str, default=None, help="path to save inference results")
args = parser.parse_args()
main(args)
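The inference script is launched with torchrun; a single-process example follows (quoted arguments are placeholders):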
torchrun --nproc_per_node=1 inference.py \
--config "path to config file" \
--load_dataset \
--inference_save_path "path to save inference results"
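For reference, here is a minimal sketch of the config file that main() reads. Only the keys actually accessed in the code above are shown; the class names and the "plain" template name are hypothetical placeholders, and the real config may carry additional model parameters:

{
    "model": [
        {
            "name": "my-model",
            "model_class": "HuggingFaceCausalLM",
            "parameters": {
                "path": "path to model checkpoint",
                "prompt_template": "plain"
            }
        }
    ],
    "dataset": [
        {
            "name": "my-dataset",
            "dataset_class": "MMLUDataset",
            "path": "path to raw dataset",
            "save_path": "path to converted dataset",
            "debug": false,
            "few_shot": false
        }
    ]
}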
transformers>=4.32.0
colossalai>=0.3.1
peft
tabulate
jieba
fuzzywuzzy
rouge
openai
matplotlib
pandas
seaborn
scikit-learn
from setuptools import find_packages, setup
def fetch_requirements(path):
with open(path, "r") as fd:
return [r.strip() for r in fd.readlines()]
def fetch_readme():
with open("README.md", encoding="utf-8") as f:
return f.read()
setup(
name="colossal_eval",
version="0.0.1",
packages=find_packages(exclude=["examples", "*.egg-info"]),
description="Colossal-AI LLM-Evaluation Framework",
long_description=fetch_readme(),
long_description_content_type="text/markdown",
license="Apache Software License 2.0",
url="https://github.com/hpcaitech/LLM-Evaluation",
install_requires=fetch_requirements("requirements.txt"),
python_requires=">=3.6",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Environment :: GPU :: NVIDIA CUDA",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
)
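With this setup.py, the package can be installed from the repository root in the usual way:

pip install .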
This directory contains the applications that are powered by Colossal-AI.
The list of applications includes:
- [X] [Chatbot](./Chat/README.md)
- [X] [Colossal-LLaMA-2](./Colossal-LLaMA-2/): Continual Pre-training of LLaMA-2.
- [X] [ColossalEval](./ColossalEval): Evaluation Pipeline for LLMs.
- [X] [ColossalChat](./Chat/README.md): Replication of ChatGPT with RLHF.
- [X] [FastFold](https://github.com/hpcaitech/FastFold): Optimizing AlphaFold (Biomedicine) Training and Inference on GPU Clusters.
> Please note that the `Chatbot` application is migrated from the original `ChatGPT` folder.