Commit 7d06d0f9 authored by yangzhong's avatar yangzhong

Update files

parent 2f320edb
# This code is modified from C-Eval Project: https://github.com/SJTU-LIT/ceval
import string


class Evaluator:
    def __init__(self, choices, model_path, k=-1):
        self.choices = choices
        self.model_path = model_path
        self.k = k
        self.puncs = list(string.punctuation)

    def format_example(self, line, include_answer=True):
        example = line['question']
        # print(example)
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        example += '\n答案:'
        if include_answer:
            example += f'{line["answer"]}\n\n'
        return example

    def generate_few_shot_prompt(self, subject, dev_df):
        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        for i in range(k):
            prompt += self.format_example(dev_df.iloc[i, :])
        return prompt

    def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
        pass

    def normalize_answer(self, s):
        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(self.puncs)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_punc(lower(s)))

    def exact_match(self, pred, target):
        return self.normalize_answer(pred) == self.normalize_answer(target)
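A minimal usage sketch of the normalization helpers above; the choice list and model path are placeholder values, not part of this commit:

# Hypothetical usage of the base Evaluator helpers.
ev = Evaluator(choices=["A", "B", "C", "D"], model_path="path/to/model")
assert ev.normalize_answer(" A. ") == "a"   # lower-cased, ASCII punctuation stripped, whitespace collapsed
assert ev.exact_match("A.", "a")            # both sides normalize to "a"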
# This code is modified from C-Eval Project: https://github.com/SJTU-LIT/ceval
import os
import re
import random

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, LlamaTokenizer
from transformers import GenerationConfig

from evaluator import Evaluator


class Llama_Evaluator(Evaluator):
    def __init__(self, choices, k, model_path, device, temperature=0.2, verbose=False):
        super(Llama_Evaluator, self).__init__(choices, model_path, k)
        load_type = torch.float16
        self.model_path = model_path
        self.device = device
        self.verbose = verbose
        self.tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            load_in_8bit=False,
            torch_dtype=load_type,
            low_cpu_mem_usage=True,
            device_map='auto',
            trust_remote_code=True)
        self.generation_config = GenerationConfig(
            temperature=temperature,
            top_k=40,
            top_p=0.9,
            do_sample=True,
            num_beams=1,
            repetition_penalty=1.1,
            max_new_tokens=20
        )
        # Token ids of the bare option letters and of the letters as tokenized after ":",
        # used later for constrained decoding over the four choices.
        self.sA_id = self.tokenizer.encode("A", add_special_tokens=False)[0]
        self.sB_id = self.tokenizer.encode("B", add_special_tokens=False)[0]
        self.sC_id = self.tokenizer.encode("C", add_special_tokens=False)[0]
        self.sD_id = self.tokenizer.encode("D", add_special_tokens=False)[0]
        self.A_id = self.tokenizer.encode(":A")[-1]
        self.B_id = self.tokenizer.encode(":B")[-1]
        self.C_id = self.tokenizer.encode(":C")[-1]
        self.D_id = self.tokenizer.encode(":D")[-1]
    def eval_subject(self, subject_name,
                     test_df,
                     dev_df=None,
                     few_shot=False,
                     cot=False,
                     save_result_dir=None,
                     with_prompt=False,
                     constrained_decoding=False,
                     do_test=False):
        all_answers = {}
        if constrained_decoding is True:
            # For constrained decoding only the logits of the first generated token are needed.
            self.generation_config.output_scores = True
            self.generation_config.return_dict_in_generate = True
            self.generation_config.max_new_tokens = 1
            self.generation_config.top_p = 1.0
            self.generation_config.top_k = 0

        correct_num = 0
        if save_result_dir:
            result = []
            score = []
        if few_shot:
            if with_prompt:
                history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
            else:
                history = self.generate_few_shot_noprompt(subject_name, dev_df, cot=cot)
        else:
            history = ''
        answers = ['NA'] * len(test_df) if do_test is True else list(test_df['Answer'])
        for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
            question = self.format_example(row, include_answer=False, cot=cot, with_prompt=with_prompt)
            instruction = question
            if with_prompt:
                DEFAULT_SYSTEM_PROMPT = """你是一个乐于助人的助手。"""
                prompt_template = (
                    "[INST] <<SYS>>\n"
                    "{system_prompt}\n"
                    "<</SYS>>\n\n"
                    "{instruction} [/INST]"
                )
                instruction = prompt_template.format_map({'instruction': instruction, 'system_prompt': DEFAULT_SYSTEM_PROMPT})
            instruction = history + instruction
            inputs = self.tokenizer(instruction, return_tensors="pt")
            generation_output = self.model.generate(
                input_ids=inputs["input_ids"].to(self.device),
                attention_mask=inputs['attention_mask'].to(self.device),
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                generation_config=self.generation_config
            )

            _, length = inputs.input_ids.shape
            if constrained_decoding is True:
                # Pick the answer by comparing the first-token logits of the four option letters
                # (bare letter ids plus the ":<letter>" variants) rather than parsing free-form output.
                logits = generation_output.scores[0][0]
                logits = logits.float().cpu().detach()
                choices1_logits = logits[[self.sA_id, self.sB_id, self.sC_id, self.sD_id]]
                choices2_logits = logits[[self.A_id, self.B_id, self.C_id, self.D_id]]
                choicesAll_logits = (choices1_logits + choices2_logits).numpy()
                assert not (np.any(np.isinf(choicesAll_logits)) or np.any(np.isnan(choicesAll_logits)))
                ans = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(choicesAll_logits)]
                response = self.tokenizer.decode([logits.argmax(-1).item()])
            else:
                response = self.tokenizer.decode(generation_output[0, length:], skip_special_tokens=True)
                ans, _ = self.extract_answer(row, response)
            if ans == answers[row_index]:
                correct_num += 1
                correct = 1
            else:
                correct = 0
            if self.verbose is True:
                print(f"\n======={str(row_index)}=======")
                print(f"question: {question}\n")
                print(f"response: {response}\n")
                print(f"extracted answer: {ans}")
                print(f"ground truth: {answers[row_index]} \n")
            if save_result_dir:
                result.append(response)
                score.append(correct)
            all_answers[str(row_index)] = ans

        correct_ratio = 100 * correct_num / len(answers)

        if save_result_dir:
            test_df['model_output'] = result
            test_df['correctness'] = score
            test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_test.csv'))

        return correct_ratio, all_answers
    def format_example(self, line, include_answer=True, cot=False, with_prompt=False):
        example = line['Question']
        suffix = ""
        for choice in self.choices:
            example += f'\n{choice}. {line[f"{choice}"]}'
        if include_answer:
            if cot:
                example += "\n答案:让我们一步一步思考,\n" + \
                    line["explanation"] + f"\n所以答案是{line['Answer']}\n\n"
            else:
                example += '\n答案:' + suffix + line["Answer"] + '\n\n'
        else:
            if with_prompt is False:
                if cot:
                    example += "\n答案:让我们一步一步思考,\n1."
                else:
                    example += '\n答案:' + suffix
            else:
                if cot:
                    example += "\n答案是什么?让我们一步一步思考,\n1."
                else:
                    example += '\n答案:'
        return example

    def generate_few_shot_noprompt(self, subject, dev_df, cot=False):
        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        for i in range(k):
            prompt += self.format_example(
                dev_df.iloc[i, :],
                include_answer=True,
                cot=cot
            )
        return prompt

    def generate_few_shot_prompt(self, subject, dev_df, cot=False):
        DEFAULT_SYSTEM_PROMPT = """你是一个乐于助人的助手。"""
        prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
        prompt_template = (
            "[INST] <<SYS>>\n"
            "{system_prompt}\n"
            "<</SYS>>\n\n"
            "{instruction} [/INST]好的,我会结合{subject}相关知识回答"
        )
        prompt = prompt_template.format_map({'instruction': prompt, 'system_prompt': DEFAULT_SYSTEM_PROMPT, "subject": subject})
        k = self.k
        if self.k == -1:
            k = dev_df.shape[0]
        for i in range(k):
            line = dev_df.iloc[i, :]
            q = line['Question']
            for choice in self.choices:
                q += f'\n{choice}. {line[f"{choice}"]}'
            a = line['Answer']
            prompt += "[INST] " + q + "\n答案:[/INST]" + a + "\n"
        return prompt
    def extract_answer(self, line, gen_ans):
        m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
        if len(m) > 0 and m[-1] in self.choices:
            return m[-1], True
        answer_patterns = [
            r'([ABCD])是正确的',
            r'选项([ABCD])正确',
            r'答案为([ABCD])',
            r'答案是([ABCD])',
            r'答案([ABCD])',
            r'选择([ABCD])',
            r'答案:([ABCD])',
            r'选择答案([ABCD])'
        ]
        # RE extraction
        for answer_pattern in answer_patterns:
            m = re.search(answer_pattern, gen_ans, re.M)
            if m:
                answer = m.group(1)
                return answer, False
        # only containing one choice-character
        m = re.findall(r'[ABCD]', gen_ans, re.M)
        if len(m) >= 1:
            answer = m[0]
            return answer, False
        # fall back to matching the full text of one of the options
        choices_dict = {}
        pattern = ""
        for c in self.choices:
            choices_dict[str(line[f'{c}'])] = c
            pattern += re.escape(str(line[f'{c}'])) + "|"
        pattern = pattern[:-1]
        m = re.findall(pattern, gen_ans, re.M)
        print("w/ escape:", repr(pattern), gen_ans, (len(m) >= 1))
        if len(m) >= 1:
            answer = choices_dict[m[0]]
            return answer, False
        return random.choice('ABCD'), False
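A hedged sketch of how Llama_Evaluator might be driven end to end; the paths, subject name, and CSV layout (Question, A–D, Answer columns) are assumptions inferred from format_example, not part of this commit:

# Hypothetical driver for the evaluator above; paths and subject are placeholders.
import pandas as pd

if __name__ == "__main__":
    evaluator = Llama_Evaluator(choices=["A", "B", "C", "D"], k=5,
                                model_path="path/to/chinese-alpaca-2",
                                device="cuda:0", temperature=0.2, verbose=True)
    dev_df = pd.read_csv("data/dev/computer_science.csv")    # few-shot exemplars
    test_df = pd.read_csv("data/test/computer_science.csv")  # Question, A-D, Answer columns
    acc, answers = evaluator.eval_subject("computer_science", test_df, dev_df,
                                          few_shot=True, with_prompt=True,
                                          constrained_decoding=True,
                                          save_result_dir="results")
    print(f"accuracy on computer_science: {acc:.2f}%")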
# Below code is based on https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py.
from typing import Optional, Tuple

import torch
import transformers
from einops import rearrange

try:
    from flash_attn.flash_attn_interface import flash_attn_with_kvcache
except ImportError:
    flash_attn_with_kvcache = None
    print(
        "FlashAttention-2 is not installed correctly. If you want to use flash attention for inference, flash-attention >= 2.2 is needed. "
        "Please check the usage in https://github.com/Dao-AILab/flash-attention for more details."
    )
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask=None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Input shape: Batch x Time x Channel

    attention_mask: [bsz, q_len]
    """
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
    )

    kv_seq_len = key_states.shape[1]
    past_kv_len = 0
    if past_key_value is not None:
        past_kv_len = past_key_value[0].shape[-2]
        kv_seq_len += past_kv_len

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    rotary_dim = cos.shape[-1]
    cos, sin = cos.squeeze(0, 1)[:, :rotary_dim // 2].contiguous(), sin.squeeze(0, 1)[:, :rotary_dim // 2].contiguous()

    if past_key_value is not None:
        key_cache = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
        value_cache = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
    else:
        key_cache = key_states
        value_cache = value_states

    assert not output_attentions, "output_attentions is not supported"

    q = query_states  # [bsz, q_len, nh, hd]
    k, v = key_states, value_states  # [bsz, q_len, nh, hd]

    output = flash_attn_with_kvcache(
        q, key_cache, value_cache, k, v, rotary_cos=cos, rotary_sin=sin,
        cache_seqlens=past_kv_len, softmax_scale=None, causal=True, rotary_interleaved=False
    )
    output = rearrange(output, "b s h d -> b s (h d)", b=bsz)
    past_key_value = (key_cache[:, :kv_seq_len].transpose(1, 2), value_cache[:, :kv_seq_len].transpose(1, 2)) if use_cache else None
    output = self.o_proj(output)
    return output, None, past_key_value
# Disable the transformation of the attention mask in LlamaModel, as flash attention
# requires the attention mask to be the same as the key_padding_mask.
def _prepare_decoder_attention_mask(
    self, attention_mask, input_shape, inputs_embeds, past_key_values_length
):
    return attention_mask


def replace_llama_attn_with_flash_attn():
    if flash_attn_with_kvcache is not None:
        print("USE_FLASH_ATTENTION: ", True)
        transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
        transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
    else:
        print("USE_FLASH_ATTENTION: ", False)
李白[注 1](701年5月19日—762年11月30日),字太白,号青莲居士,中国唐朝诗人。李白自言祖籍陇西成纪(今甘肃静宁西南),汉飞将军李广后裔,西凉武昭王李暠之后,与李唐皇室同宗。
一说其幼时内迁,寄籍剑南道绵州昌隆(今四川省江油市青莲镇)。一说先人隋末被窜于碎叶,出生于碎叶,属唐安西都护府(今吉尔吉斯斯坦共和国楚河州托克马克市)。有“诗仙”、“诗侠”、“酒仙”、“谪仙人”等称呼,活跃于盛唐[1],为杰出的浪漫主义诗人。与杜甫合称“李杜”[注 2]。被贺知章呼为“天上谪仙”、“李谪仙”。
李白的诗歌在唐朝已被选进殷璠编选的《河岳英灵集》、于敦煌石室发现的《唐写本唐人选唐诗》、韦庄编选的《又玄集》和韦縠编选的《才调集》。唐文宗御封李白的诗歌、裴旻的剑舞、张旭的草书称为“三绝”[2]。其作品想像奇特丰富,风格雄奇浪漫,意境独特,清新俊逸;善于利用夸饰与譬喻等手法、自然优美的词句,表现出奔放的情感。诗句行云流水,浑然天成。李白诗篇传诵千年,众多诗句已成经典,清赵翼称:“李杜诗篇万口传”(例如“抽刀断水水更流,举杯消愁愁更愁”等,更被谱入曲)。李白在诗歌的艺术成就被认为是中国浪漫主义诗歌的巅峰。诗作在全唐诗收录于卷161至卷185。有《李太白集》传世。杜甫曾经这样评价过李白的文章:“笔落惊风雨,诗成泣鬼神”、“白也诗无敌,飘然思不群”。
生平
早年
据《新唐书》记载李白为兴圣皇帝(凉武昭王李暠)九世孙[3],如果按照这个说法李白与李唐诸王实际上同宗,应是唐太宗李世民的同辈族弟。亦有野史说其祖是李建成或李元吉,因为被李世民族灭而逃往西域;但此说缺乏佐证,且李建成、李元吉诸子尚在幼年即在玄武门之变后全数被害,留有亲生后嗣的可能性很小。据《旧唐书》记载,李白之父李客为任城尉。更为了学习而隐居。
李白于武则天大足元年(701年)[4]出生,关于其出生地有多种说法,现在主要有剑南道绵州昌隆县(今四川省江油市)[5]青莲乡(今青莲镇)和西域的碎叶(Suyab,位于今吉尔吉斯托克马克附近)[6]这两种说法,其中后一种说法认为李白直到四岁时(705年)才跟随他的父亲李客迁居蜀地,入籍绵州。李白自四岁(705年)接受启蒙教育,从景云元年(710年)开始,李白开始读诸子史籍[7],开元三年时十四岁(715年)——喜好作赋、剑术、奇书、神仙:“十五观奇书,做赋凌相如”。在青年时期开始在中国各地游历。开元五年左右,李白曾拜撰写《长短经》的赵蕤为师,学习一年有余,这段时期的学习对李白产生了深远的影响。开元六年,在戴天山(约在四川省昌隆县北五十里处)大明寺读书。二十五岁时只身出四川,开始了广泛漫游,南到洞庭湘江,东至吴、越,寓居在安陆(今湖北省安陆市)、应山(今湖北省广水市)。
中年
李白曾经在唐玄宗天宝元年(742年)供奉翰林。有一次皇帝因酒酣问李白说:“我朝与天后(武后)之朝何如?”白曰:“天后朝政出多门,国由奸幸,任人之道,如小儿市瓜,不择香味,惟拣肥大者;我朝任人如淘沙取金,剖石采用,皆得其精粹者。”玄宗听后大笑不止[8][9]。但是由于他桀骜不驯的性格,所以仅仅不到两年他就离开了长安。据说是因为他作的《清平调》得罪了当时宠冠后宫的杨贵妃(因李白命“力士脱靴”,高力士引以为大耻,因而以言语诱使杨贵妃认为“可怜飞燕倚新妆”几句是讽刺她)而不容于宫中[注 3]。天宝三年(745年)“恳求还山,帝赐金放还”,离开长安。
后在洛阳与另两位著名诗人杜甫、高适相识,并结为好友。
晚年
天宝十一年(752年)李白年届五十二岁,北上途中游广平郡邯郸、临洺、清漳等地。十月,抵幽州。初有立功边疆思想,在边地习骑射。后发现安禄山野心,登黄金台痛哭。不久即离幽州南下。
安史之乱爆发时,李白游华山,南下回宣城,后上庐山。756年12月,李白被三次邀请,下山赴寻阳入永王李璘幕僚[10]。永王触怒唐肃宗被杀后,李白也获罪入狱。幸得郭子仪力保,方得免死,改为流徙夜郎(今贵州关岭县一带),在途经巫山时遇赦,此时他已经59岁。(参见李璘之乱)
李白晚年在江南一带漂泊。在他61岁时,听到太尉李光弼率领大军讨伐安史叛军,于是他北上准备追随李光弼从军杀敌,但是中途因病折回。第二年,李白投奔他的族叔、当时在当涂(今属安徽省马鞍山)当县令的李阳冰。同年11月,李白病逝于寓所,终年61岁,葬当涂龙山。唐宪宗元和十二年(817年),宣歙观察使范传正根据李白生前“志在青山”的遗愿,将其墓迁至当涂青山。
去世
《新唐书》记载,唐代宗继位后以左拾遗召李白,但李白当时已去世。
李阳冰在《草堂集序》中说李白是病死的[11];皮日休在诗作中记载,李白是患“腐胁疾”而死的[12]。
《旧唐书》则记载,李白流放虽然遇赦,但因途中饮酒过度,醉死于宣城。中国民间有“太白捞月”的传说:李白在舟中赏月,饮酒大醉,想要跳下船至水里捞月而溺死[13][14][15];在民间的求签活动中亦有“太白捞月”一签文,乃是下下签[16]。
作品
李白一生创作大量的诗歌,绝大多数已散佚[17],流传至今的只有九百多首。他的诗歌创作涉及的中国古典诗歌的题材非常广泛,而且在许多题材都有名作出现,而且因为际遇的不同,每个时期的诗风都有所不同。
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--file_path', required=True, type=str)
parser.add_argument('--embedding_path', required=True, type=str)
parser.add_argument('--model_path', required=True, type=str)
parser.add_argument('--gpu_id', default="0", type=str)
parser.add_argument('--chain_type', default="refine", type=str)
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
file_path = args.file_path
embedding_path = args.embedding_path
model_path = args.model_path

import torch
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

prompt_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful assistant. 你是一个乐于助人的助手。\n"
    "<</SYS>>\n\n"
    "{context}\n{question} [/INST]"
)

refine_prompt_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful assistant. 你是一个乐于助人的助手。\n"
    "<</SYS>>\n\n"
    "这是原始问题: {question}\n"
    "已有的回答: {existing_answer}\n"
    "现在还有一些文字,(如果有需要)你可以根据它们完善现有的回答。"
    "\n\n"
    "{context_str}\n"
    "\n\n"
    "请根据新的文段,进一步完善你的回答。"
    " [/INST]"
)

initial_qa_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful assistant. 你是一个乐于助人的助手。\n"
    "<</SYS>>\n\n"
    "以下为背景知识:\n"
    "{context_str}"
    "\n"
    "请根据以上背景知识, 回答这个问题:{question}。"
    " [/INST]"
)
if __name__ == '__main__':
    load_type = torch.float16
    if not torch.cuda.is_available():
        raise RuntimeError("No CUDA GPUs are available.")

    loader = TextLoader(file_path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)

    print("Loading the embedding model...")
    embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
    docsearch = FAISS.from_documents(texts, embeddings)

    print("loading LLM...")
    model = HuggingFacePipeline.from_model_id(model_id=model_path,
                                              task="text-generation",
                                              device=0,
                                              pipeline_kwargs={
                                                  "max_new_tokens": 400,
                                                  "do_sample": True,
                                                  "temperature": 0.2,
                                                  "top_k": 40,
                                                  "top_p": 0.9,
                                                  "repetition_penalty": 1.1},
                                              model_kwargs={
                                                  "torch_dtype": load_type,
                                                  "low_cpu_mem_usage": True,
                                                  "trust_remote_code": True}
                                              )

    if args.chain_type == "stuff":
        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context", "question"]
        )
        chain_type_kwargs = {"prompt": PROMPT}
        qa = RetrievalQA.from_chain_type(
            llm=model,
            chain_type="stuff",
            retriever=docsearch.as_retriever(search_kwargs={"k": 1}),
            chain_type_kwargs=chain_type_kwargs)
    elif args.chain_type == "refine":
        refine_prompt = PromptTemplate(
            input_variables=["question", "existing_answer", "context_str"],
            template=refine_prompt_template,
        )
        initial_qa_prompt = PromptTemplate(
            input_variables=["context_str", "question"],
            template=initial_qa_template,
        )
        chain_type_kwargs = {"question_prompt": initial_qa_prompt, "refine_prompt": refine_prompt}
        qa = RetrievalQA.from_chain_type(
            llm=model, chain_type="refine",
            retriever=docsearch.as_retriever(search_kwargs={"k": 1}),
            chain_type_kwargs=chain_type_kwargs)

    while True:
        query = input("请输入问题:")
        if len(query.strip()) == 0:
            break
        print(qa.run(query))
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--file_path', required=True, type=str)
parser.add_argument('--model_path', required=True, type=str)
parser.add_argument('--gpu_id', default="0", type=str)
parser.add_argument('--chain_type', default="refine", type=str)
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
file_path = args.file_path
model_path = args.model_path

import torch
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain

prompt_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful assistant. 你是一个乐于助人的助手。\n"
    "<</SYS>>\n\n"
    "请为以下文字写一段摘要:\n{text} [/INST]"
)

refine_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful assistant. 你是一个乐于助人的助手。\n"
    "<</SYS>>\n\n"
    "已有一段摘要:{existing_answer}\n"
    "现在还有一些文字,(如果有需要)你可以根据它们完善现有的摘要。"
    "\n"
    "{text}\n"
    "\n"
    "如果这段文字没有用,返回原来的摘要即可。请你生成一个最终的摘要。"
    " [/INST]"
)
if __name__ == '__main__':
    load_type = torch.float16
    if not torch.cuda.is_available():
        raise RuntimeError("No CUDA GPUs are available.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100, length_function=len)
    with open(file_path) as f:
        text = f.read()
    docs = text_splitter.create_documents([text])

    print("loading LLM...")
    model = HuggingFacePipeline.from_model_id(model_id=model_path,
                                              task="text-generation",
                                              device=0,
                                              pipeline_kwargs={
                                                  "max_new_tokens": 400,
                                                  "do_sample": True,
                                                  "temperature": 0.2,
                                                  "top_k": 40,
                                                  "top_p": 0.9,
                                                  "repetition_penalty": 1.1},
                                              model_kwargs={
                                                  "torch_dtype": load_type,
                                                  "low_cpu_mem_usage": True,
                                                  "trust_remote_code": True}
                                              )

    PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
    REFINE_PROMPT = PromptTemplate(
        template=refine_template, input_variables=["existing_answer", "text"],
    )
    if args.chain_type == "stuff":
        chain = load_summarize_chain(model, chain_type="stuff", prompt=PROMPT)
    elif args.chain_type == "refine":
        chain = load_summarize_chain(model, chain_type="refine", question_prompt=PROMPT, refine_prompt=REFINE_PROMPT)
    print(chain.run(docs))
## llama.cpp example scripts

Detailed usage (Chinese): https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/llamacpp_zh
Detailed usage (English): https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/llamacpp_en

### chat.sh

Chat with the Alpaca-2 series of models.

### server_curl_example.sh

An example of calling the API with curl after setting up the server.
#!/bin/bash
# temporary script to chat with Chinese Alpaca-2 model
# usage: ./chat.sh alpaca2-ggml-model-path your-first-instruction
SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。'
# SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。请你提供专业、有逻辑、内容真实、有价值的详细回复。' # Try this one, if you prefer longer response.
MODEL_PATH=$1
FIRST_INSTRUCTION=$2
./main -m "$MODEL_PATH" \
--color -i -c 4096 -t 8 --temp 0.5 --top_k 40 --top_p 0.9 --repeat_penalty 1.1 \
--in-prefix-bos --in-prefix ' [INST] ' --in-suffix ' [/INST]' -p \
"[INST] <<SYS>>
$SYSTEM_PROMPT
<</SYS>>
$FIRST_INSTRUCTION [/INST]"
#!/bin/bash
# NOTE: start the server first before running this script.
# usage: ./server_curl_example.sh your-instruction
SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。'
# SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。请你提供专业、有逻辑、内容真实、有价值的详细回复。' # Try this one, if you prefer longer response.
INSTRUCTION=$1
ALL_PROMPT="[INST] <<SYS>>\n$SYSTEM_PROMPT\n<</SYS>>\n\n$INSTRUCTION [/INST]"
CURL_DATA="{\"prompt\": \"$ALL_PROMPT\",\"n_predict\": 128}"
curl --request POST \
--url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data "$CURL_DATA"
{
"narrativeqa": 128,
"qasper": 128,
"multifieldqa_en": 64,
"multifieldqa_zh": 64,
"hotpotqa": 32,
"2wikimqa": 32,
"musique": 32,
"dureader": 128,
"gov_report": 512,
"qmsum": 512,
"multi_news": 512,
"vcsum": 512,
"trec": 64,
"triviaqa": 32,
"samsum": 128,
"lsht": 64,
"passage_count": 32,
"passage_retrieval_en": 32,
"passage_retrieval_zh": 32,
"lcc": 64,
"repobench-p": 64
}
{
"narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"qasper": "You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
"hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
"gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
"qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
"multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
"vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
"trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
"triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
"samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
"lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
"passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
"passage_retrieval_en": "Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like \"Paragraph 1\", \"Paragraph 2\", etc.\n\nThe answer is: ",
"passage_retrieval_zh": "以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是\"段落1\"\"段落2\"等格式\n\n答案是:",
"lcc": "Please complete the code given below. \n{context}Next line of code:\n",
"repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n"
}
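The two JSON files above follow the LongBench convention: one maps each dataset to its maximum number of generated tokens, the other to its prompt template. A minimal sketch of how they are typically consumed; the file names and the example record are assumptions:

# Hypothetical use of the two config files above to build a generation request.
import json

with open("dataset2prompt.json", encoding="utf-8") as f:
    dataset2prompt = json.load(f)
with open("dataset2maxlen.json", encoding="utf-8") as f:
    dataset2maxlen = json.load(f)

dataset = "multifieldqa_zh"
sample = {"context": "……文章内容……", "input": "文章的主要结论是什么?"}  # placeholder record

prompt = dataset2prompt[dataset].format(context=sample["context"], input=sample["input"])
max_new_tokens = dataset2maxlen[dataset]  # 64 for multifieldqa_zh
# prompt and max_new_tokens are then passed to the model's generate call.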