Commit 7d06d0f9 authored by yangzhong
Update files
parent 2f320edb
# This code is modified from C-Eval Project: https://github.com/SJTU-LIT/ceval
import string
class Evaluator:
def __init__(self, choices, model_path, k=-1):
self.choices = choices
self.model_path = model_path
self.k = k
self.puncs = list(string.punctuation)
def format_example(self, line, include_answer=True):
example = line['question']
# print(example)
for choice in self.choices:
example += f'\n{choice}. {line[f"{choice}"]}'
example += '\n答案:'
if include_answer:
example += f'{line["answer"]}\n\n'
return example
def generate_few_shot_prompt(self, subject, dev_df):
prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
k = self.k
if self.k == -1:
k = dev_df.shape[0]
for i in range(k):
prompt += self.format_example(dev_df.iloc[i, :])
return prompt
def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None):
pass
def normalize_answer(self,s):
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude=set(self.puncs)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_punc(lower(s)))
def exact_match(self,pred, target):
return self.normalize_answer(pred)==self.normalize_answer(target)
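    # Illustrative note: normalize_answer lower-cases the text, strips ASCII
    # punctuation and collapses whitespace, so exact_match("The  Answer!", "the answer")
    # evaluates to True.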
# This code is modified from C-Eval Project: https://github.com/SJTU-LIT/ceval
import os
import re
from tqdm import tqdm
import random
import numpy as np
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer
from transformers import GenerationConfig
from evaluator import Evaluator
class Llama_Evaluator(Evaluator):
def __init__(self, choices, k, model_path, device, temperature=0.2, verbose=False):
super(Llama_Evaluator, self).__init__(choices, model_path, k)
load_type = torch.float16
self.model_path = model_path
self.device = device
self.verbose = verbose
self.tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
load_in_8bit=False,
torch_dtype=load_type,
low_cpu_mem_usage=True,
device_map='auto',
trust_remote_code=True)
self.generation_config = GenerationConfig(
temperature=temperature,
top_k=40,
top_p=0.9,
do_sample=True,
num_beams=1,
repetition_penalty=1.1,
max_new_tokens=20
)
self.sA_id = self.tokenizer.encode("A", add_special_tokens=False)[0]
self.sB_id = self.tokenizer.encode("B", add_special_tokens=False)[0]
self.sC_id = self.tokenizer.encode("C", add_special_tokens=False)[0]
self.sD_id = self.tokenizer.encode("D", add_special_tokens=False)[0]
self.A_id = self.tokenizer.encode(":A")[-1]
self.B_id = self.tokenizer.encode(":B")[-1]
self.C_id = self.tokenizer.encode(":C")[-1]
self.D_id = self.tokenizer.encode(":D")[-1]
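        # Two token ids are kept for each option letter: the bare letter ("A") and the
        # letter as it appears right after a colon (":A"), presumably because the
        # SentencePiece tokenizer can encode the same character differently depending
        # on context. Both ids are used by the constrained decoding branch below.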
def eval_subject(self, subject_name,
test_df,
dev_df=None,
few_shot=False,
cot=False,
save_result_dir=None,
with_prompt=False,
constrained_decoding=False,
do_test=False):
all_answers = {}
if constrained_decoding is True:
self.generation_config.output_scores = True
self.generation_config.return_dict_in_generate = True
self.generation_config.max_new_tokens = 1
self.generation_config.top_p = 1.0
self.generation_config.top_k = 0
correct_num = 0
if save_result_dir:
result = []
score = []
if few_shot:
if with_prompt:
history = self.generate_few_shot_prompt(subject_name, dev_df, cot=cot)
else:
history = self.generate_few_shot_noprompt(subject_name, dev_df, cot=cot)
else:
history = ''
answers = ['NA'] * len(test_df) if do_test is True else list(test_df['Answer'])
for row_index, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = self.format_example(row, include_answer=False, cot=cot,with_prompt=with_prompt)
instruction = question
if with_prompt:
DEFAULT_SYSTEM_PROMPT = """你是一个乐于助人的助手。"""
prompt_template = (
"[INST] <<SYS>>\n"
"{system_prompt}\n"
"<</SYS>>\n\n"
"{instruction} [/INST]"
)
instruction = prompt_template.format_map({'instruction': instruction,'system_prompt':DEFAULT_SYSTEM_PROMPT})
instruction=history+instruction
inputs = self.tokenizer(instruction, return_tensors="pt")
generation_output = self.model.generate(
input_ids = inputs["input_ids"].to(self.device),
attention_mask = inputs['attention_mask'].to(self.device),
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id,
generation_config = self.generation_config
)
_, length = inputs.input_ids.shape
if constrained_decoding is True:
logits = generation_output.scores[0][0]
logits = logits.float().cpu().detach()
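                # Constrained decoding: only the scores of the first generated token are
                # used. The logits of the two possible tokenizations of each option letter
                # are summed and the option with the highest combined logit is selected,
                # so the prediction is always one of A/B/C/D.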
choices1_logits = logits[[self.sA_id,self.sB_id,self.sC_id,self.sD_id]]
choices2_logits = logits[[self.A_id,self.B_id,self.C_id,self.D_id]]
choicesAll_logits = (choices1_logits + choices2_logits).numpy()
assert not (np.any(np.isinf(choicesAll_logits)) or np.any(np.isnan(choicesAll_logits)))
ans = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(choicesAll_logits)]
response = self.tokenizer.decode([logits.argmax(-1).item()])
else:
response = self.tokenizer.decode(generation_output[0, length:], skip_special_tokens=True)
ans, _ = self.extract_answer(row, response)
if ans == answers[row_index]:
correct_num += 1
correct = 1
else:
correct = 0
if self.verbose is True:
print(f"\n======={str(row_index)}=======")
print(f"question: {question}\n")
print(f"response: {response}\n")
print(f"extracted answer: {ans}")
print(f"ground truth: {answers[row_index]} \n")
if save_result_dir:
result.append(response)
score.append(correct)
all_answers[str(row_index)] = ans
correct_ratio = 100*correct_num/len(answers)
if save_result_dir:
test_df['model_output'] = result
test_df['correctness'] = score
test_df.to_csv(os.path.join(save_result_dir, f'{subject_name}_test.csv'))
return correct_ratio, all_answers
def format_example(self, line, include_answer=True, cot=False, with_prompt=False):
example = line['Question']
suffix = ""
for choice in self.choices:
example += f'\n{choice}. {line[f"{choice}"]}'
if include_answer:
if cot:
example += "\n答案:让我们一步一步思考,\n" + \
line["explanation"] + f"\n所以答案是{line['Answer']}\n\n"
else:
example += '\n答案:' + suffix + line["Answer"] + '\n\n'
else:
if with_prompt is False:
if cot:
example += "\n答案:让我们一步一步思考,\n1."
else:
example += '\n答案:' + suffix
else:
if cot:
example += "\n答案是什么?让我们一步一步思考,\n1."
else:
example += '\n答案:'
return example
def generate_few_shot_noprompt(self, subject, dev_df, cot=False):
prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
k = self.k
if self.k == -1:
k = dev_df.shape[0]
for i in range(k):
prompt += self.format_example(
dev_df.iloc[i, :],
include_answer=True,
cot=cot
)
return prompt
def generate_few_shot_prompt(self, subject, dev_df, cot=False):
DEFAULT_SYSTEM_PROMPT = """你是一个乐于助人的助手。"""
prompt = f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n"
prompt_template = (
"[INST] <<SYS>>\n"
"{system_prompt}\n"
"<</SYS>>\n\n"
"{instruction} [/INST]好的,我会结合{subject}相关知识回答"
)
prompt = prompt_template.format_map({'instruction':prompt,'system_prompt':DEFAULT_SYSTEM_PROMPT,"subject":subject})
k = self.k
if self.k == -1:
k = dev_df.shape[0]
for i in range(k):
line=dev_df.iloc[i, :]
q=line['Question']
for choice in self.choices:
q += f'\n{choice}. {line[f"{choice}"]}'
a=line['Answer']
prompt+="[INST] "+q+"\n答案:[/INST]"+a+"\n"
return prompt
def extract_answer(self, line, gen_ans):
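        # Answer extraction falls back through increasingly loose strategies: a
        # chain-of-thought pattern ("所以答案是X。"), common answer phrasings such as
        # "答案是X", the first bare A/B/C/D character in the response, a literal match
        # against the option texts, and finally a random choice so an answer is
        # always returned.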
m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
if len(m) > 0 and m[-1] in self.choices:
return m[-1], True
answer_patterns = [
r'([ABCD])是正确的',
r'选项([ABCD])正确',
r'答案为([ABCD])',
r'答案是([ABCD])',
r'答案([ABCD])',
r'选择([ABCD])',
r'答案:([ABCD])',
r'选择答案([ABCD])'
]
# RE extraction
for answer_pattern in answer_patterns:
m = re.search(answer_pattern, gen_ans, re.M)
if m:
answer = m.group(1)
return answer, False
# only containing one choice-character
m = re.findall(r'[ABCD]', gen_ans, re.M)
if len(m) >= 1:
answer = m[0]
return answer, False
choices_dict = {}
pattern = ""
for c in self.choices:
choices_dict[str(line[f'{c}'])] = c
pattern += re.escape(str(line[f'{c}']))+"|"
pattern = pattern[:-1]
m = re.findall(pattern, gen_ans, re.M)
print("w/ escape:",repr(pattern),gen_ans,(len(m)>=1))
if len(m) >= 1:
answer = choices_dict[m[0]]
return answer, False
return random.choice('ABCD'), False
# The code below is based on https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py.
from typing import Optional, Tuple
import torch
import transformers
from einops import rearrange
try:
from flash_attn.flash_attn_interface import flash_attn_with_kvcache
except ImportError:
flash_attn_with_kvcache = None
    print(
        "FlashAttention-2 is not installed correctly. If you want to use flash attention for inference, flash-attention >= 2.2 is needed. "
        "Please see https://github.com/Dao-AILab/flash-attention for more details."
    )
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
padding_mask=None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel
attention_mask: [bsz, q_len]
"""
bsz, q_len, _ = hidden_states.size()
query_states = (
self.q_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
)
key_states = (
self.k_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
)
value_states = (
self.v_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
)
kv_seq_len = key_states.shape[1]
past_kv_len = 0
if past_key_value is not None:
past_kv_len = past_key_value[0].shape[-2]
kv_seq_len += past_kv_len
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
rotary_dim = cos.shape[-1]
cos, sin = cos.squeeze(0,1)[:,:rotary_dim//2].contiguous(), sin.squeeze(0,1)[:,:rotary_dim//2].contiguous()
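    # flash_attn_with_kvcache applies the rotary embedding internally; it expects
    # non-interleaved cos/sin tables of shape (seqlen, rotary_dim // 2), which is
    # (to the best of our reading of the flash-attn API) why the tables are squeezed
    # and sliced to half the rotary dimension here.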
if past_key_value is not None:
key_cache = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
value_cache = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
else:
key_cache = key_states
value_cache = value_states
assert not output_attentions, "output_attentions is not supported"
q = query_states # [bsz, q_len, nh, hd]
k, v = key_states, value_states # [bsz, q_len, nh, hd]
output = flash_attn_with_kvcache(
q, key_cache, value_cache, k, v, rotary_cos=cos, rotary_sin=sin, cache_seqlens=past_kv_len, softmax_scale=None, causal=True, rotary_interleaved=False
)
output = rearrange(output, "b s h d -> b s (h d)", b=bsz)
past_key_value = (key_cache[:,:kv_seq_len].transpose(1,2), value_cache[:,:kv_seq_len].transpose(1,2)) if use_cache else None
output = self.o_proj(output)
return output, None, past_key_value
# Disable the transformation of the attention mask in LlamaModel as the flash attention
# requires the attention mask to be the same as the key_padding_mask
def _prepare_decoder_attention_mask(
self, attention_mask, input_shape, inputs_embeds, past_key_values_length
):
return attention_mask
def replace_llama_attn_with_flash_attn():
    if flash_attn_with_kvcache is not None:
print("USE_FLASH_ATTENTION: ", True)
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
else:
print("USE_FLASH_ATTENTION: ", False)
import torch
from transformers import (
AutoModelForCausalLM,
LlamaForCausalLM,
LlamaTokenizer,
StoppingCriteria,
BitsAndBytesConfig,
GenerationConfig
)
import gradio as gr
import argparse
import os
from queue import Queue
from threading import Thread
import traceback
import gc
import json
import requests
from typing import Iterable, List
import subprocess
import re
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
TEMPLATE_WITH_SYSTEM_PROMPT = (
"[INST] <<SYS>>\n"
"{system_prompt}\n"
"<</SYS>>\n\n"
"{instruction} [/INST]"
)
TEMPLATE_WITHOUT_SYSTEM_PROMPT = "[INST] {instruction} [/INST]"
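# For reference, a single-turn instruction rendered with TEMPLATE_WITH_SYSTEM_PROMPT
# looks like this (illustrative):
#   [INST] <<SYS>>
#   You are a helpful assistant. 你是一个乐于助人的助手。
#   <</SYS>>
#
#   {user instruction} [/INST]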
# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument(
'--base_model',
default=None,
type=str,
required=True,
help='Base model path')
parser.add_argument('--lora_model', default=None, type=str,
help="If None, perform inference on the base model")
parser.add_argument(
'--tokenizer_path',
default=None,
type=str,
help='If None, lora model path or base model path will be used')
parser.add_argument(
'--gpus',
default="0",
type=str,
    help='If None, cuda:0 will be used. For multi-GPU inference, use --gpus=0,1,...')
parser.add_argument('--share', default=True, help='Create a publicly shareable Gradio link')
parser.add_argument('--port', default=19324, type=int, help='Port of gradio demo')
parser.add_argument(
'--max_memory',
default=1024,
type=int,
help='Maximum number of input tokens (including system prompt) to keep. If exceeded, earlier history will be discarded.')
parser.add_argument(
'--load_in_8bit',
action='store_true',
help='Use 8 bit quantized model')
parser.add_argument(
'--load_in_4bit',
action='store_true',
help='Use 4 bit quantized model')
parser.add_argument(
'--only_cpu',
action='store_true',
help='Only use CPU for inference')
parser.add_argument(
'--alpha',
type=str,
default="1.0",
help="The scaling factor of NTK method, can be a float or 'auto'. ")
parser.add_argument(
"--use_vllm",
action='store_true',
help="Use vLLM as back-end LLM service.")
parser.add_argument(
"--post_host",
type=str,
default="0.0.0.0",
help="Host of vLLM service.")
parser.add_argument(
"--post_port",
type=int,
default=8000,
help="Port of vLLM service.")
parser.add_argument(
"--speculative_sampling",
action='store_true',
help="Use speculative sampling to speed up inference.")
parser.add_argument(
"--draft_base_model",
default=None,
type=str,
help="Draft base model used in speculative sampling.")
parser.add_argument(
"--draft_lora_model",
default=None,
type=str,
help="If None, perform inference on the draft base model")
parser.add_argument(
"--draft_model_load_in_8bit",
action='store_true',
help="Load the draft model in the 8bit mode")
parser.add_argument(
"--draft_model_load_in_4bit",
action='store_true',
help="Load the draft model in the 4bit mode")
parser.add_argument(
'--use_flash_attention_2',
action='store_true',
help="Use flash attention to replace the LLaMA attention")
parser.add_argument('--use_ntk', action='store_true', help="Use dynamic-ntk to extend context window")
args = parser.parse_args()
ENABLE_CFG_SAMPLING = True
try:
from transformers.generation import UnbatchedClassifierFreeGuidanceLogitsProcessor
except ImportError:
ENABLE_CFG_SAMPLING = False
print("Install the latest transformers (commit equal or later than d533465) to enable CFG sampling.")
if args.use_vllm is True:
print("CFG sampling is disabled when using vLLM.")
ENABLE_CFG_SAMPLING = False
if args.only_cpu is True:
args.gpus = ""
if args.load_in_8bit or args.load_in_4bit:
raise ValueError("Quantization is unavailable on CPU.")
if args.load_in_8bit and args.load_in_4bit:
raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
import sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
if not args.only_cpu:
if args.use_flash_attention_2:
from flash_attn_patch_for_inference import replace_llama_attn_with_flash_attn
replace_llama_attn_with_flash_attn()
else:
from attn_and_long_ctx_patches import apply_attention_patch
apply_attention_patch(use_memory_efficient_attention=True)
from attn_and_long_ctx_patches import apply_ntk_scaling_patch
if args.use_ntk:
apply_ntk_scaling_patch(args.alpha)
if args.speculative_sampling:
    if args.draft_base_model is None:
raise ValueError("Speculative sampling requires a draft model. Please specify the draft model.")
if args.draft_model_load_in_8bit and args.draft_model_load_in_4bit:
raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
from speculative_sample import speculative_sample
# Set CUDA devices if available
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
# Peft library can only import after setting CUDA devices
from peft import PeftModel
# Set up the required components: model and tokenizer
def setup():
global tokenizer, model, device, share, port, max_memory
if args.speculative_sampling:
global draft_model
if args.use_vllm:
# global share, port, max_memory
max_memory = args.max_memory
port = args.port
share = args.share == 'True' or args.share is True
if args.lora_model is not None:
raise ValueError("vLLM currently does not support LoRA, please merge the LoRA weights to the base model.")
if args.load_in_8bit or args.load_in_4bit:
raise ValueError("vLLM currently does not support quantization, please use fp16 (default) or unuse --use_vllm.")
if args.only_cpu:
raise ValueError("vLLM requires GPUs with compute capability not less than 7.0. If you want to run only on CPU, please unuse --use_vllm.")
if args.speculative_sampling:
raise ValueError("speculative_sampling is set, but vLLM does not support speculative sampling. Please unset speculative_sampling. ")
if args.tokenizer_path is None:
args.tokenizer_path = args.base_model
tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, legacy=True)
print("Start launch vllm server.")
cmd = f"python -m vllm.entrypoints.api_server \
--model={args.base_model} \
--tokenizer={args.tokenizer_path} \
--tokenizer-mode=slow \
--tensor-parallel-size={len(args.gpus.split(','))} \
--host {args.post_host} \
--port {args.post_port} \
&"
subprocess.check_call(cmd, shell=True)
else:
max_memory = args.max_memory
port = args.port
share = args.share == 'True' or args.share is True
load_type = torch.float16
if torch.cuda.is_available():
device = torch.device(0)
else:
device = torch.device('cpu')
if args.tokenizer_path is None:
args.tokenizer_path = args.lora_model
if args.lora_model is None:
args.tokenizer_path = args.base_model
tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, legacy=True)
if args.load_in_4bit or args.load_in_8bit:
quantization_config = BitsAndBytesConfig(
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
bnb_4bit_compute_dtype=load_type,
)
base_model = AutoModelForCausalLM.from_pretrained(
args.base_model,
torch_dtype=load_type,
low_cpu_mem_usage=True,
device_map='auto',
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
quantization_config=quantization_config if (args.load_in_4bit or args.load_in_8bit) else None,
trust_remote_code=True
)
if args.speculative_sampling:
if args.load_in_4bit or args.load_in_8bit:
draft_quantization_config = BitsAndBytesConfig(
load_in_4bit=args.draft_model_load_in_4bit,
load_in_8bit=args.draft_model_load_in_8bit,
bnb_4bit_compute_dtype=load_type,
)
draft_base_model = LlamaForCausalLM.from_pretrained(
args.draft_base_model,
torch_dtype=load_type,
low_cpu_mem_usage=True,
device_map='auto',
load_in_4bit=args.draft_model_load_in_4bit,
load_in_8bit=args.draft_model_load_in_8bit,
quantization_config=draft_quantization_config if (args.draft_model_load_in_4bit or args.draft_model_load_in_8bit) else None
)
model_vocab_size = base_model.get_input_embeddings().weight.size(0)
tokenizer_vocab_size = len(tokenizer)
print(f"Vocab of the base model: {model_vocab_size}")
print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
if model_vocab_size != tokenizer_vocab_size:
print("Resize model embeddings to fit tokenizer")
base_model.resize_token_embeddings(tokenizer_vocab_size)
if args.speculative_sampling:
draft_model_vocab_size = draft_base_model.get_input_embeddings().weight.size(0)
print(f"Vocab of the draft base model: {draft_model_vocab_size}")
if draft_model_vocab_size!=tokenizer_vocab_size:
print("Resize draft model embeddings to fit tokenizer")
draft_base_model.resize_token_embeddings(tokenizer_vocab_size)
if args.lora_model is not None:
print("loading peft model")
model = PeftModel.from_pretrained(
base_model,
args.lora_model,
torch_dtype=load_type,
device_map='auto',
).half()
else:
model = base_model
if args.speculative_sampling:
if args.draft_lora_model is not None:
print("loading peft draft model")
draft_model = PeftModel.from_pretrained(draft_base_model, args.draft_lora_model,torch_dtype=load_type,device_map='auto',).half()
else:
draft_model = draft_base_model
if device == torch.device('cpu'):
model.float()
model.eval()
if args.speculative_sampling:
if device==torch.device('cpu'):
draft_model.float()
draft_model.eval()
# Reset the user input
def reset_user_input():
return gr.update(value='')
# Reset the state
def reset_state():
return []
def generate_prompt(instruction, response="", with_system_prompt=True, system_prompt=DEFAULT_SYSTEM_PROMPT):
if with_system_prompt is True:
prompt = TEMPLATE_WITH_SYSTEM_PROMPT.format_map({'instruction': instruction,'system_prompt': system_prompt})
else:
prompt = TEMPLATE_WITHOUT_SYSTEM_PROMPT.format_map({'instruction': instruction})
if len(response)>0:
prompt += " " + response
return prompt
# User interaction function for chat
def user(user_message, history):
return gr.update(value="", interactive=False), history + \
[[user_message, None]]
class Stream(StoppingCriteria):
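    # This criterion never stops generation (it always returns False); it only passes
    # the tokens generated so far to callback_func, which Iteratorize below uses to
    # turn model.generate() into a streaming iterator.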
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
class Iteratorize:
"""
Transforms a function that takes a callback
into a lazy iterator (generator).
Adapted from: https://stackoverflow.com/a/9969000
"""
def __init__(self, func, kwargs=None, callback=None):
self.mfunc = func
self.c_callback = callback
self.q = Queue()
self.sentinel = object()
self.kwargs = kwargs or {}
self.stop_now = False
def _callback(val):
if self.stop_now:
raise ValueError
self.q.put(val)
def gentask():
try:
ret = self.mfunc(callback=_callback, **self.kwargs)
except ValueError:
pass
except Exception:
traceback.print_exc()
clear_torch_cache()
self.q.put(self.sentinel)
if self.c_callback:
self.c_callback(ret)
self.thread = Thread(target=gentask)
self.thread.start()
def __iter__(self):
return self
def __next__(self):
obj = self.q.get(True, None)
if obj is self.sentinel:
raise StopIteration
else:
return obj
def __del__(self):
clear_torch_cache()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.stop_now = True
clear_torch_cache()
def clear_torch_cache():
gc.collect()
if torch.cuda.device_count() > 0:
torch.cuda.empty_cache()
def post_http_request(prompt: str,
api_url: str,
n: int = 1,
top_p: float = 0.9,
top_k: int = 40,
temperature: float = 0.2,
max_tokens: int = 512,
presence_penalty: float = 1.0,
use_beam_search: bool = False,
stream: bool = False) -> requests.Response:
headers = {"User-Agent": "Test Client"}
pload = {
"prompt": prompt,
"n": n,
"top_p": 1 if use_beam_search else top_p,
"top_k": -1 if use_beam_search else top_k,
"temperature": 0 if use_beam_search else temperature,
"max_tokens": max_tokens,
"use_beam_search": use_beam_search,
"best_of": 5 if use_beam_search else n,
"presence_penalty": presence_penalty,
"stream": stream,
}
print(pload)
response = requests.post(api_url, headers=headers, json=pload, stream=True)
return response
def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False,
delimiter=b"\0"):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"]
yield output
# Perform prediction based on the user input and history
@torch.no_grad()
def predict(
history,
system_prompt,
negative_prompt,
max_new_tokens=128,
top_p=0.9,
temperature=0.2,
top_k=40,
do_sample=True,
repetition_penalty=1.1,
guidance_scale=1.0,
presence_penalty=0.0,
draft_k=0,
):
if len(system_prompt) == 0:
system_prompt = DEFAULT_SYSTEM_PROMPT
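    # Build a Llama-2 chat prompt from the dialogue history: the first turn carries the
    # system prompt, and later turns are wrapped as <s>[INST] ... [/INST] ... </s>.
    # While the encoded prompt exceeds max_memory tokens, the oldest turn is dropped
    # and the prompt is rebuilt.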
while True:
print("len(history):", len(history))
print("history: ", history)
history[-1][1] = ""
if len(history) == 1:
input = history[0][0]
prompt = generate_prompt(input,response="", with_system_prompt=True, system_prompt=system_prompt)
else:
input = history[0][0]
response = history[0][1]
prompt = generate_prompt(input, response=response, with_system_prompt=True, system_prompt=system_prompt)+'</s>'
for hist in history[1:-1]:
input = hist[0]
response = hist[1]
prompt = prompt + '<s>'+generate_prompt(input, response=response, with_system_prompt=False)+'</s>'
input = history[-1][0]
prompt = prompt + '<s>'+generate_prompt(input, response="", with_system_prompt=False)
input_length = len(tokenizer.encode(prompt, add_special_tokens=True))
print(f"Input length: {input_length}")
if input_length > max_memory and len(history) > 1:
print(f"The input length ({input_length}) exceeds the max memory ({max_memory}). The earlier history will be discarded.")
history = history[1:]
print("history: ", history)
else:
break
if args.use_vllm:
generate_params = {
'max_tokens': max_new_tokens,
'top_p': top_p,
'temperature': temperature,
'top_k': top_k,
"use_beam_search": not do_sample,
'presence_penalty': presence_penalty,
}
api_url = f"http://{args.post_host}:{args.post_port}/generate"
response = post_http_request(prompt, api_url, **generate_params, stream=True)
for h in get_streaming_response(response):
for line in h:
line = line.replace(prompt, '')
history[-1][1] = line
yield history
else:
negative_text = None
if len(negative_prompt) != 0:
negative_text = re.sub(r"<<SYS>>\n(.*)\n<</SYS>>", f"<<SYS>>\n{negative_prompt}\n<</SYS>>", prompt)
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
if negative_text is None:
negative_prompt_ids = None
negative_prompt_attention_mask = None
else:
negative_inputs = tokenizer(negative_text,return_tensors="pt")
negative_prompt_ids = negative_inputs["input_ids"].to(device)
negative_prompt_attention_mask = negative_inputs["attention_mask"].to(device)
generate_params = {
'input_ids': input_ids,
'max_new_tokens': max_new_tokens,
'top_p': top_p,
'temperature': temperature,
'top_k': top_k,
'do_sample': do_sample,
'repetition_penalty': repetition_penalty,
'eos_token_id': tokenizer.eos_token_id,
}
if ENABLE_CFG_SAMPLING is True:
generate_params['guidance_scale'] = guidance_scale
generate_params['negative_prompt_ids'] = negative_prompt_ids
generate_params['negative_prompt_attention_mask'] = negative_prompt_attention_mask
if args.speculative_sampling:
generate_params['target_model'] = model
generate_params['draft_model'] = draft_model
generate_params['draft_k'] = draft_k
generate_params['generation_config'] = GenerationConfig()
def generate_with_callback(callback=None, **kwargs):
if 'stopping_criteria' in kwargs:
kwargs['stopping_criteria'].append(Stream(callback_func=callback))
else:
kwargs['stopping_criteria'] = [Stream(callback_func=callback)]
clear_torch_cache()
with torch.no_grad():
if not args.speculative_sampling:
model.generate(**kwargs)
else: # enable speculative sampling
speculative_sample(**kwargs)
def generate_with_streaming(**kwargs):
return Iteratorize(generate_with_callback, kwargs, callback=None)
with generate_with_streaming(**generate_params) as generator:
for output in generator:
next_token_ids = output[len(input_ids[0]):]
if next_token_ids[0] == tokenizer.eos_token_id:
break
new_tokens = tokenizer.decode(
next_token_ids, skip_special_tokens=True)
if isinstance(tokenizer, LlamaTokenizer) and len(next_token_ids) > 0:
if tokenizer.convert_ids_to_tokens(int(next_token_ids[0])).startswith('▁'):
new_tokens = ' ' + new_tokens
history[-1][1] = new_tokens
yield history
if len(next_token_ids) >= max_new_tokens:
break
# Call the setup function to initialize the components
setup()
# Create the Gradio interface
with gr.Blocks() as demo:
github_banner_path = 'https://raw.githubusercontent.com/ymcui/Chinese-LLaMA-Alpaca-2/main/pics/banner.png'
gr.HTML(f'<p align="center"><a href="https://github.com/ymcui/Chinese-LLaMA-Alpaca-2"><img src={github_banner_path} width="700"/></a></p>')
chatbot = gr.Chatbot()
with gr.Row():
with gr.Column(scale=4):
with gr.Column(scale=3):
system_prompt_input = gr.Textbox(
show_label=True,
label="系统提示语(仅在对话开始前或清空历史后修改有效,对话过程中修改无效)",
placeholder=DEFAULT_SYSTEM_PROMPT,
lines=1).style(
container=True)
negative_prompt_input = gr.Textbox(
show_label=True,
label="反向提示语(仅在对话开始前或清空历史后修改有效,对话过程中修改无效)",
placeholder="(可选,默认为空)",
lines=1,
visible=ENABLE_CFG_SAMPLING).style(
container=True)
with gr.Column(scale=12):
user_input = gr.Textbox(
show_label=True,
label="用户指令",
placeholder="Shift + Enter发送消息...",
lines=10).style(
container=True)
with gr.Column(min_width=32, scale=1):
submitBtn = gr.Button("Submit", variant="primary")
with gr.Column(scale=1):
emptyBtn = gr.Button("Clear History")
max_new_token = gr.Slider(
0,
4096,
value=512,
step=1.0,
label="Maximum New Token Length",
interactive=True)
top_p = gr.Slider(0, 1, value=0.9, step=0.01,
label="Top P", interactive=True)
temperature = gr.Slider(
0,
1,
value=0.2,
step=0.01,
label="Temperature",
interactive=True)
top_k = gr.Slider(1, 40, value=40, step=1,
label="Top K", interactive=True)
do_sample = gr.Checkbox(
value=True,
label="Do Sample",
info="use random sample strategy",
interactive=True)
repetition_penalty = gr.Slider(
1.0,
3.0,
value=1.1,
step=0.1,
label="Repetition Penalty",
interactive=True,
visible=False if args.use_vllm else True)
guidance_scale = gr.Slider(
1.0,
3.0,
value=1.0,
step=0.1,
label="Guidance Scale",
interactive=True,
visible=ENABLE_CFG_SAMPLING)
presence_penalty = gr.Slider(
-2.0,
2.0,
value=1.0,
step=0.1,
label="Presence Penalty",
interactive=True,
visible=True if args.use_vllm else False)
draft_k = gr.Slider(
0,
10,
value=0,
step=1.0,
label="Draft K",
interactive=True,
visible=args.speculative_sampling==True)
params = [user_input, chatbot]
predict_params = [
chatbot,
system_prompt_input,
negative_prompt_input,
max_new_token,
top_p,
temperature,
top_k,
do_sample,
repetition_penalty,
guidance_scale,
presence_penalty,
draft_k]
submitBtn.click(
user,
params,
params,
queue=False).then(
predict,
predict_params,
chatbot).then(
lambda: gr.update(
interactive=True),
None,
[user_input],
queue=False)
user_input.submit(
user,
params,
params,
queue=False).then(
predict,
predict_params,
chatbot).then(
lambda: gr.update(
interactive=True),
None,
[user_input],
queue=False)
submitBtn.click(reset_user_input, [], [user_input])
emptyBtn.click(reset_state, outputs=[chatbot], show_progress=True)
# Launch the Gradio interface
demo.queue().launch(
share=share,
inbrowser=True,
server_name='0.0.0.0',
server_port=port)
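# Example launch (illustrative; adjust the model path and flags to your setup):
#   python gradio_demo.py --base_model /path/to/chinese-alpaca-2 --gpus 0 --load_in_8bit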
import argparse
import json, os
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
TEMPLATE = (
"[INST] <<SYS>>\n"
"{system_prompt}\n"
"<</SYS>>\n\n"
"{instruction} [/INST]"
)
parser = argparse.ArgumentParser()
parser.add_argument('--base_model', default=None, type=str, required=True)
parser.add_argument('--lora_model', default=None, type=str, help="If None, perform inference on the base model")
parser.add_argument('--tokenizer_path', default=None, type=str)
parser.add_argument('--data_file', default=None, type=str, help="A file that contains instructions (one instruction per line)")
parser.add_argument('--with_prompt', action='store_true', help="wrap the input with the prompt automatically")
parser.add_argument('--interactive', action='store_true', help="run in the instruction mode (single-turn)")
parser.add_argument('--predictions_file', default='./predictions.json', type=str)
parser.add_argument('--gpus', default="0", type=str)
parser.add_argument('--only_cpu', action='store_true', help='only use CPU for inference')
parser.add_argument('--alpha', type=str, default="1.0", help="The scaling factor of NTK method, can be a float or 'auto'. ")
parser.add_argument('--load_in_8bit', action='store_true', help="Load the LLM in the 8bit mode")
parser.add_argument('--load_in_4bit', action='store_true', help="Load the LLM in the 4bit mode")
parser.add_argument("--use_vllm", action='store_true', help="Use vLLM as back-end LLM service.")
parser.add_argument('--system_prompt', type=str, default=DEFAULT_SYSTEM_PROMPT, help="The system prompt of the prompt template.")
parser.add_argument('--negative_prompt', type=str, default=None, help="Negative prompt in CFG sampling.")
parser.add_argument('--guidance_scale', type=float, default=1.0, help="The guidance scale for CFG sampling. CFG is enabled by setting `guidance_scale > 1`.")
parser.add_argument('--speculative_sampling', action='store_true', help="Use speculative sampling to speed up inference.")
parser.add_argument('--draft_k', type=int, default=-1, help="Number of new tokens the draft model generates at each step. Should be a positive integer; an adaptive K is used if `draft_k <= 0`.")
parser.add_argument('--draft_base_model', default=None, type=str, help="Draft base model used in speculative sampling.")
parser.add_argument('--draft_lora_model', default=None, type=str, help="If None, perform inference on the draft base model")
parser.add_argument('--draft_model_load_in_8bit', action='store_true', help="Load the draft model in the 8bit mode")
parser.add_argument('--draft_model_load_in_4bit', action='store_true', help="Load the draft model in the 4bit mode")
parser.add_argument('--use_flash_attention_2', action='store_true', help="Use flash attention to replace the LLaMA attention")
parser.add_argument('--use_ntk', action='store_true', help="Use dynamic-ntk to extend context window")
args = parser.parse_args()
if args.guidance_scale > 1:
try:
from transformers.generation import UnbatchedClassifierFreeGuidanceLogitsProcessor
except ImportError:
raise ImportError("Please install the latest transformers (commit equal or later than d533465) to enable CFG sampling.")
if args.use_vllm:
if args.lora_model is not None:
raise ValueError("vLLM currently does not support LoRA, please merge the LoRA weights to the base model.")
if args.load_in_8bit or args.load_in_4bit:
raise ValueError("vLLM currently does not support quantization, please use fp16 (default) or unuse --use_vllm.")
if args.only_cpu:
raise ValueError("vLLM requires GPUs with compute capability not less than 7.0. If you want to run only on CPU, please unuse --use_vllm.")
if args.guidance_scale > 1:
raise ValueError("guidance_scale > 1, but vLLM does not support CFG sampling. Please unset guidance_scale. ")
if args.speculative_sampling:
raise ValueError("speculative_sampling is set, but vLLM does not support speculative sampling. Please unset speculative_sampling. ")
if args.load_in_8bit and args.load_in_4bit:
raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
if args.only_cpu is True:
args.gpus = ""
if args.load_in_8bit or args.load_in_4bit:
raise ValueError("Quantization is unavailable on CPU.")
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
import torch
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
from transformers import GenerationConfig
from transformers import BitsAndBytesConfig
from peft import PeftModel
if args.use_vllm:
from vllm import LLM, SamplingParams
import sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
if not args.only_cpu:
if args.use_flash_attention_2:
from flash_attn_patch_for_inference import replace_llama_attn_with_flash_attn
replace_llama_attn_with_flash_attn()
elif not args.use_vllm:
from attn_and_long_ctx_patches import apply_attention_patch
apply_attention_patch(use_memory_efficient_attention=True)
if args.use_ntk:
from attn_and_long_ctx_patches import apply_ntk_scaling_patch
apply_ntk_scaling_patch(args.alpha)
if args.speculative_sampling:
    if args.draft_base_model is None:
raise ValueError("Speculative sampling requires a draft model. Please specify the draft model.")
if args.draft_model_load_in_8bit and args.draft_model_load_in_4bit:
raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
from speculative_sample import speculative_sample
if args.use_vllm:
generation_config = dict(
temperature=0.2,
top_k=40,
top_p=0.9,
max_tokens=400,
presence_penalty=1.0,
)
else:
generation_config = GenerationConfig(
temperature=0.2,
top_k=40,
top_p=0.9,
do_sample=True,
num_beams=1,
repetition_penalty=1.1,
max_new_tokens=400
)
sample_data = ["为什么要减少污染,保护环境?"]
def generate_prompt(instruction, system_prompt=DEFAULT_SYSTEM_PROMPT):
return TEMPLATE.format_map({'instruction': instruction,'system_prompt': system_prompt})
if __name__ == '__main__':
load_type = torch.float16
if torch.cuda.is_available():
device = torch.device(0)
else:
device = torch.device('cpu')
if args.tokenizer_path is None:
args.tokenizer_path = args.lora_model
if args.lora_model is None:
args.tokenizer_path = args.base_model
if args.use_vllm:
model = LLM(model=args.base_model,
tokenizer=args.tokenizer_path,
tokenizer_mode='slow',
tensor_parallel_size=len(args.gpus.split(',')),
trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, legacy=True)
else:
tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path, legacy=True)
if args.load_in_4bit or args.load_in_8bit:
quantization_config = BitsAndBytesConfig(
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
bnb_4bit_compute_dtype=load_type,
)
base_model = AutoModelForCausalLM.from_pretrained(
args.base_model,
torch_dtype=load_type,
low_cpu_mem_usage=True,
device_map='auto',
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
quantization_config=quantization_config if (args.load_in_4bit or args.load_in_8bit) else None,
trust_remote_code=True
)
if args.speculative_sampling:
if args.load_in_4bit or args.load_in_8bit:
draft_quantization_config = BitsAndBytesConfig(
load_in_4bit=args.draft_model_load_in_4bit,
load_in_8bit=args.draft_model_load_in_8bit,
bnb_4bit_compute_dtype=load_type,
)
draft_base_model = LlamaForCausalLM.from_pretrained(
args.draft_base_model,
torch_dtype=load_type,
low_cpu_mem_usage=True,
device_map='auto',
load_in_4bit=args.draft_model_load_in_4bit,
load_in_8bit=args.draft_model_load_in_8bit,
quantization_config=draft_quantization_config if (args.draft_model_load_in_4bit or args.draft_model_load_in_8bit) else None
)
model_vocab_size = base_model.get_input_embeddings().weight.size(0)
tokenizer_vocab_size = len(tokenizer)
print(f"Vocab of the base model: {model_vocab_size}")
print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
if model_vocab_size!=tokenizer_vocab_size:
print("Resize model embeddings to fit tokenizer")
base_model.resize_token_embeddings(tokenizer_vocab_size)
if args.speculative_sampling:
draft_model_vocab_size = draft_base_model.get_input_embeddings().weight.size(0)
print(f"Vocab of the draft base model: {draft_model_vocab_size}")
if draft_model_vocab_size!=tokenizer_vocab_size:
print("Resize draft model embeddings to fit tokenizer")
draft_base_model.resize_token_embeddings(tokenizer_vocab_size)
if args.lora_model is not None:
print("loading peft model")
model = PeftModel.from_pretrained(base_model, args.lora_model,torch_dtype=load_type,device_map='auto',).half()
else:
model = base_model
if args.speculative_sampling:
if args.draft_lora_model is not None:
print("loading peft draft model")
draft_model = PeftModel.from_pretrained(draft_base_model, args.draft_lora_model,torch_dtype=load_type,device_map='auto',).half()
else:
draft_model = draft_base_model
if device==torch.device('cpu'):
model.float()
model.eval()
if args.speculative_sampling:
if device==torch.device('cpu'):
draft_model.float()
draft_model.eval()
# test data
if args.data_file is None:
examples = sample_data
else:
with open(args.data_file,'r') as f:
examples = [l.strip() for l in f.readlines()]
print("first 10 examples:")
for example in examples[:10]:
print(example)
with torch.no_grad():
if args.interactive:
print("Start inference with instruction mode.")
print('='*85)
print("+ 该模式下仅支持单轮问答,无多轮对话能力。\n"
"+ 如要进行多轮对话,请使用llama.cpp或本项目中的gradio_demo.py。")
print('-'*85)
print("+ This mode only supports single-turn QA.\n"
"+ If you want to experience multi-turn dialogue, please use llama.cpp or gradio_demo.py.")
print('='*85)
while True:
raw_input_text = input("Input:")
if len(raw_input_text.strip())==0:
break
if args.with_prompt:
input_text = generate_prompt(instruction=raw_input_text, system_prompt=args.system_prompt)
negative_text = None if args.negative_prompt is None \
else generate_prompt(instruction=raw_input_text, system_prompt=args.negative_prompt)
else:
input_text = raw_input_text
negative_text = args.negative_prompt
if args.use_vllm:
output = model.generate([input_text], SamplingParams(**generation_config), use_tqdm=False)
response = output[0].outputs[0].text
else:
inputs = tokenizer(input_text,return_tensors="pt") #add_special_tokens=False ?
if args.guidance_scale ==1:
if not args.speculative_sampling:
generation_output = model.generate(
input_ids = inputs["input_ids"].to(device),
attention_mask = inputs['attention_mask'].to(device),
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config = generation_config
)
else: # enable speculative sampling
generation_output = speculative_sample(
input_ids=inputs["input_ids"].to(device),
target_model=model,
draft_model=draft_model,
draft_k=args.draft_k,
generation_config=generation_config,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
)
else: # enable CFG sampling
if negative_text is None:
negative_prompt_ids = None
negative_prompt_attention_mask = None
else:
negative_inputs = tokenizer(negative_text,return_tensors="pt")
negative_prompt_ids = negative_inputs["input_ids"].to(device)
negative_prompt_attention_mask = negative_inputs["attention_mask"].to(device)
if not args.speculative_sampling:
generation_output = model.generate(
input_ids = inputs["input_ids"].to(device),
attention_mask = inputs['attention_mask'].to(device),
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config = generation_config,
guidance_scale = args.guidance_scale,
negative_prompt_ids = negative_prompt_ids,
negative_prompt_attention_mask = negative_prompt_attention_mask
)
else: # enable speculative sampling
generation_output = speculative_sample(
input_ids=inputs["input_ids"].to(device),
target_model=model,
draft_model=draft_model,
draft_k=args.draft_k,
generation_config=generation_config,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
guidance_scale=args.guidance_scale,
negative_prompt_ids=negative_prompt_ids,
negative_prompt_attention_mask=negative_prompt_attention_mask,
)
s = generation_output[0]
output = tokenizer.decode(s,skip_special_tokens=True)
if args.with_prompt:
response = output.split("[/INST]")[-1].strip()
else:
response = output
print("Response: ",response)
print("\n")
else:
print("Start inference.")
results = []
if args.use_vllm:
if args.with_prompt is True:
inputs = [generate_prompt(example, system_prompt=args.system_prompt) for example in examples]
else:
inputs = examples
outputs = model.generate(inputs, SamplingParams(**generation_config))
for index, (example, output) in enumerate(zip(examples, outputs)):
response = output.outputs[0].text
print(f"======={index}=======")
print(f"Input: {example}\n")
print(f"Output: {response}\n")
results.append({"Input":example,"Output":response})
else:
for index, example in enumerate(examples):
if args.with_prompt:
input_text = generate_prompt(instruction=example, system_prompt=args.system_prompt)
negative_text = None if args.negative_prompt is None else \
generate_prompt(instruction=example, system_prompt=args.negative_prompt)
else:
input_text = example
negative_text = args.negative_prompt
inputs = tokenizer(input_text,return_tensors="pt") #add_special_tokens=False ?
if args.guidance_scale == 1:
if not args.speculative_sampling:
generation_output = model.generate(
input_ids = inputs["input_ids"].to(device),
attention_mask = inputs['attention_mask'].to(device),
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config = generation_config
)
else: # enable speculative sampling
generation_output = speculative_sample(
input_ids=inputs["input_ids"].to(device),
target_model=model,
draft_model=draft_model,
draft_k=args.draft_k,
generation_config=generation_config,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
)
else: # enable CFG sampling
if negative_text is None:
negative_prompt_ids = None
negative_prompt_attention_mask = None
else:
negative_inputs = tokenizer(negative_text,return_tensors="pt")
negative_prompt_ids = negative_inputs["input_ids"].to(device)
negative_prompt_attention_mask = negative_inputs["attention_mask"].to(device)
if not args.speculative_sampling:
generation_output = model.generate(
input_ids = inputs["input_ids"].to(device),
attention_mask = inputs['attention_mask'].to(device),
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
generation_config = generation_config,
guidance_scale = args.guidance_scale,
negative_prompt_ids = negative_prompt_ids,
negative_prompt_attention_mask = negative_prompt_attention_mask
)
else: # enable speculative sampling
generation_output = speculative_sample(
input_ids=inputs["input_ids"].to(device),
target_model=model,
draft_model=draft_model,
draft_k=args.draft_k,
generation_config=generation_config,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
guidance_scale=args.guidance_scale,
negative_prompt_ids=negative_prompt_ids,
negative_prompt_attention_mask=negative_prompt_attention_mask,
)
s = generation_output[0]
output = tokenizer.decode(s,skip_special_tokens=True)
if args.with_prompt:
response = output.split("[/INST]")[1].strip()
else:
response = output
print(f"======={index}=======")
print(f"Input: {example}\n")
print(f"Output: {response}\n")
results.append({"Input":input_text,"Output":response})
dirname = os.path.dirname(args.predictions_file)
os.makedirs(dirname,exist_ok=True)
with open(args.predictions_file,'w') as f:
json.dump(results,f,ensure_ascii=False,indent=2)
if args.use_vllm:
with open(dirname+'/generation_config.json','w') as f:
json.dump(generation_config,f,ensure_ascii=False,indent=2)
else:
generation_config.save_pretrained('./')
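# Example invocation (illustrative; the script file name is an assumption):
#   python inference_hf.py --base_model /path/to/chinese-alpaca-2 --with_prompt --interactive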
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, PreTrainedModel
from transformers import (
LogitsProcessorList,
StoppingCriteriaList,
)
from transformers.generation.streamers import BaseStreamer
import torch
from typing import Tuple, List, Optional
import copy
def norm_logits(
x: torch.Tensor,
logits: torch.Tensor,
logits_processor: LogitsProcessorList,
logits_warper: LogitsProcessorList,
do_sample: bool = False,
cur_len=None,
) -> torch.Tensor:
"""
Args:
x (`torch.Tensor`): input ids, shape (batch, seqlen)
logits `(`torch.Tensor`): shape (batch, seqlen, vocab)
do_sample ('bool'): whether do sample
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
logits_warper (`LogitsProcessorList`): An instance of [`LogitsProcessorList`]. List of instances of class derived from
[`LogitsWarper`] used to warp the prediction score distribution of the language modeling head applied before multinomial
sampling at each generation step.
do_sample ('boo;'): whether do sample.
cur_len ('int'): length of current decoded tokens.
Returns:
`torch.Tensor`: probs with shape as (batch, seq_len)
"""
new_logits = logits[:,:]
if len(logits_processor) > 0:
for i in range(x.shape[1]-cur_len+1):
new_logits[:,i,:] = logits_processor(x[:,:cur_len+i], new_logits[:,i,:])
if do_sample and len(logits_warper) > 0:
for i in range(x.shape[1]-cur_len+1):
new_logits[:,i,:] = logits_warper(x[:,:cur_len+i], new_logits[:,i,:])
probs = new_logits.softmax(dim=-1)
return probs
def sample(probs : torch.Tensor, do_sample : bool = False, num_samples: int = 1):
if do_sample:
new_token = torch.multinomial(probs, num_samples=num_samples)
else:
new_token = torch.argmax(probs, keepdim=True)
return new_token
def max_fn(x):
"""
norm(max (x, 0))
"""
x_max = torch.where(x > 0, x, torch.zeros_like(x))
x_max_sum = torch.sum(x_max, dim=1, keepdim=True)
return x_max / x_max_sum
def _draft_model_serial_forward(
prefix : torch.Tensor,
draft_k : int,
draft_model : torch.nn.Module,
logits_processor,
logits_warper,
do_sample=False,
past_key_values=None,
rejected=False,
eos_token_id_tensor = None
) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]], torch.Tensor or bool]:
""" forward draft model draft_k times
Args:
prefix (`torch.Tensor`): the original input ids
draft_k (`int`): how many times draft model forward and sample
draft_model (`torch.nn.Module`): an draft model
logits_processor (`LogitsProcessorList`, *optional*): Custom logits processors that complement the default logits processors built from arguments and
generation config.
logits_warper: List of instances of class derived from [`LogitsWarper`] used to warp the prediction score distribution
do_sample (`bool`): whether do sample
past_key_values: kv cache of draft model in last iteration
rejected (`bool`): whether any of tokens in last iteration was rejected
eos_token_id_tensor (`torch.Tensor`): eos token id in tokenizer
Returns:
Tuple[torch.Tensor, torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]], torch.Tensor or bool]:
generated tokens, probability distribution of draft model's output,
past_key_values of draft model, flag of whether last token is eos
"""
x = prefix
x = x.to(draft_model.device)
input_ids = x
probs = None
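    # Reuse the draft model's kv cache across iterations (behaviour inferred from the
    # caller): if the previous proposals were all accepted, only the token preceding
    # the newly appended target token still needs a forward pass to extend the cache;
    # if a proposal was rejected, the cache was already truncated by the caller, so
    # only the last (resampled) token is fed in the loop below.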
if past_key_values != None:
if rejected == False:
output = draft_model(input_ids[:,-2:-1], past_key_values = past_key_values, use_cache=True)
past_key_values = output.past_key_values
input_ids = input_ids[:,-1:]
probs = norm_logits(x[:,:-1], output.logits, logits_processor, logits_warper, do_sample, x.shape[1]-1)
else:
input_ids = input_ids[:,-1:]
for _ in range(draft_k):
output = draft_model(input_ids, past_key_values = past_key_values, use_cache=True)
new_probs = norm_logits(x, output.logits[:,-1:], logits_processor, logits_warper, do_sample, x.shape[1])
next_tok = sample(new_probs[:, -1, :], do_sample=do_sample)
if eos_token_id_tensor is not None:
last_token_is_eos = next_tok.tile(eos_token_id_tensor.shape[0], 1)
last_token_is_eos = (
~last_token_is_eos.ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0).bool()
)
if last_token_is_eos:
break
else:
last_token_is_eos = False
past_key_values = output.past_key_values
probs = torch.cat((probs, new_probs), dim=1) if probs != None else torch.cat((output.logits[:,:-1], new_probs), dim=1)
input_ids = next_tok
x = torch.cat((x, next_tok), dim=1)
return x, probs, past_key_values, last_token_is_eos
def _speculative_sampling(
prefix : torch.Tensor,
target_model : torch.nn.Module,
draft_model : torch.nn.Module,
max_new_tokens : int ,
draft_k : int = 4,
logits_processor: LogitsProcessorList = None,
logits_warper : LogitsProcessorList = None,
do_sample = False,
eos_token_id = None,
stopping_criteria = None,
streamer: Optional["BaseStreamer"] = None,
) -> torch.Tensor:
"""
DeepMind version Speculative Sampling.
Accelerating Large Language Model Decoding with Speculative Sampling
https://arxiv.org/abs/2302.01318
Args:
prefix (torch.Tensor): input sequence, (batch, prefix_seqlen), Note that the batch dim is always 1 now.
target_model (torch.nn.Module): target model, the large one
draft_model (torch.nn.Module): draft model, the small one
        max_new_tokens (int): the maximum number of new tokens to generate overall.
        draft_k (int): the number of tokens the draft model proposes per iteration.
        logits_processor (`LogitsProcessorList`, *optional*): Custom logits processors that complement the default
            logits processors built from arguments and generation config.
        logits_warper: List of instances of class derived from [`LogitsWarper`] used to warp the prediction score distribution
        do_sample (`bool`): whether to sample
        eos_token_id: eos token id of the tokenizer
stopping_criteria: An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop.
streamer (`BaseStreamer`, *optional*):
Streamer object that will be used to stream the generated sequences. Generated tokens are passed
through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
Returns:
torch.Tensor: generated tokens (batch, target_seqlen)
"""
input_seq_len = prefix.shape[1]
T = input_seq_len + max_new_tokens
assert prefix.shape[0] == 1, "input batch size must be 1"
if draft_k <= 0:
draft_k = 4
adaptive_k = True
else:
adaptive_k = False
draft_past_key_values = None
draft_probs = None
target_past_key_values = None
target_probs = None
rejected = False
unfinished_sequences = prefix.new(prefix.shape[0]).fill_(1)
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
eos_token_id_tensor = torch.tensor(eos_token_id).to(prefix.device) if eos_token_id is not None else None
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
while prefix.shape[1] < T:
prefix_len = prefix.shape[1]
x, new_draft_probs, draft_past_key_values, _ = _draft_model_serial_forward(
prefix,
draft_k,
draft_model,
logits_processor,
logits_warper,
do_sample,
draft_past_key_values,
rejected,
eos_token_id_tensor
)
if draft_probs != None and new_draft_probs != None:
draft_probs = torch.concat((draft_probs, new_draft_probs), dim=1)
elif new_draft_probs == None:
draft_probs = draft_probs
else:
draft_probs = new_draft_probs
if target_past_key_values != None:
unchecked_token_count = x.shape[1] - target_probs.shape[1] - 1
outputs = target_model(x[:,-(unchecked_token_count+1):], past_key_values=target_past_key_values, use_cache=True)
else:
unchecked_token_count = x.shape[1] - prefix_len
outputs = target_model(x, use_cache=True)
new_target_probs = norm_logits(x, outputs.logits[:,-(unchecked_token_count+1):], logits_processor, logits_warper, do_sample, prefix_len)
target_probs = torch.cat((target_probs, new_target_probs), dim=1) if target_probs != None else torch.cat((outputs.logits[:,:-(unchecked_token_count+1)], new_target_probs), dim=1)
target_past_key_values = outputs.past_key_values
# n_valid: the length of the valid prefix
is_all_accept = True
n_valid = prefix_len
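        # Speculative-sampling acceptance test: accept the i-th draft token with
        # probability min(1, p_target(token) / p_draft(token)); on the first rejection,
        # resample that position from the residual distribution
        # norm(max(p_target - p_draft, 0)) and discard the remaining draft tokens.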
for i in range(unchecked_token_count):
r = torch.rand(1, device = target_probs.device)
cur_token_id = x[:, prefix_len + i]
cur_pos = prefix_len + i - 1
if r < torch.min(
torch.tensor([1], device=draft_probs.device),
target_probs[:, cur_pos, cur_token_id] / draft_probs[:, cur_pos, cur_token_id]
):
# accept, and update n_valid
n_valid += 1
else:
# reject
target_new_token = sample(
max_fn(
target_probs[:, n_valid-1, :] - draft_probs[:, n_valid-1, :]
), do_sample=do_sample
)
is_all_accept = False
rejected = True
break
n_valid = min(n_valid, T - 1)
prefix = x[:, :n_valid]
if is_all_accept:
target_new_token = sample(target_probs[:, -1, :], do_sample=do_sample)
rejected = False
else:
draft_probs = draft_probs[:,:n_valid,:]
target_probs = target_probs[:,:n_valid,:]
if "bloom" in draft_model.__class__.__name__.lower() or (
draft_model.config.architectures is not None and "bloom" in draft_model.config.architectures[0].lower()
):
draft_past_key_values = [
(key[:,:,:n_valid], value[:,:n_valid,:])
for key,value in draft_past_key_values
]
target_past_key_values = [
(key[:,:,:n_valid], value[:,:n_valid,:])
for key,value in target_past_key_values
]
else:
draft_past_key_values = [
(key[:,:,:n_valid,:], value[:,:,:n_valid,:])
for key,value in draft_past_key_values
]
target_past_key_values = [
(key[:,:,:n_valid,:], value[:,:,:n_valid,:])
for key,value in target_past_key_values
]
if adaptive_k:
if is_all_accept:
draft_k += 2
else:
draft_k = max(1, draft_k - 1)
prefix = torch.cat((prefix, target_new_token), dim=1)
if streamer is not None:
streamer.put(prefix.cpu())
if stopping_criteria(prefix, target_probs):
# this_peer_finished = True
break
if eos_token_id_tensor is not None:
unfinished_sequences = unfinished_sequences.mul(
prefix[:, -1]
.tile(eos_token_id_tensor.shape[0], 1)
.ne(eos_token_id_tensor.unsqueeze(1))
.prod(dim=0)
)
# stop when each sentence is finished
if unfinished_sequences.max() == 0:
# this_peer_finished = True
break
if streamer is not None:
streamer.end()
return prefix
def speculative_sample(
input_ids,
target_model: Optional["PreTrainedModel"],
draft_model: Optional["PreTrainedModel"],
generation_config: GenerationConfig,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
draft_k: int = 4,
negative_prompt_ids: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
):
generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
generation_config.validate()
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
inputs_tensor, _, model_kwargs = target_model._prepare_model_inputs(
input_ids, generation_config.bos_token_id, model_kwargs
)
model_kwargs["use_cache"] = generation_config.use_cache
input_ids_seq_length = input_ids.shape[-1]
has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
if has_default_max_length and generation_config.max_new_tokens is None:
# warnings.warn(
# f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
# "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
# " recommend using `max_new_tokens` to control the maximum length of the generation.",
# UserWarning,
# )
pass
elif generation_config.max_new_tokens is not None:
# if not has_default_max_length:
# logger.warning(
# f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
# f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
# "Please refer to the documentation for more information. "
# "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
# )
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
raise ValueError(
f"Unfeasible length constraints: the minimum length ({generation_config.min_length}) is larger than"
f" the maximum length ({generation_config.max_length})"
)
if input_ids_seq_length >= generation_config.max_length:
# input_ids_string = "decoder_input_ids" if target_model.config.is_encoder_decoder else "input_ids"
# logger.warning(
# f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
# f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
# " increasing `max_new_tokens`."
# )
pass
# prepare logits_processor, stopping_criteria, logits_warper
try:
logits_processor = target_model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=None,
logits_processor=logits_processor,
model_kwargs=model_kwargs,
negative_prompt_ids=negative_prompt_ids,
negative_prompt_attention_mask=negative_prompt_attention_mask,
)
except TypeError:
# Please install the latest transformers (commit equal or later than d533465) to enable CFG sampling.
logits_processor = target_model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=None,
logits_processor=logits_processor,
)
stopping_criteria = target_model._get_stopping_criteria(
generation_config=generation_config, stopping_criteria=stopping_criteria
)
logits_warper=target_model._get_logits_warper(generation_config) if generation_config.do_sample else None
outputs = _speculative_sampling(
prefix=input_ids,
target_model=target_model,
draft_model=draft_model,
max_new_tokens=generation_config.max_new_tokens,
draft_k=draft_k,
logits_processor=logits_processor,
logits_warper=logits_warper,
do_sample=generation_config.do_sample,
eos_token_id=generation_config.eos_token_id,
stopping_criteria=stopping_criteria,
streamer=streamer,
)
return outputs
if __name__ == "__main__":
# A usage example
draft_model_name = 'Draft/Model/Path'
target_model_name = 'Target/Model/Path'
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
TEMPLATE = (
"[INST] <<SYS>>\n"
"{system_prompt}\n"
"<</SYS>>\n\n"
"{instruction} [/INST]"
)
def generate_prompt(instruction, system_prompt=DEFAULT_SYSTEM_PROMPT):
return TEMPLATE.format_map({'instruction': instruction,'system_prompt': system_prompt})
inputs = ["我能用lightning数据线给安卓手机充电吗?"]
negative_text = generate_prompt(inputs[0], system_prompt="回复尽可能多的内容。")
inputs = [generate_prompt(text) for text in inputs]
tokenizer = AutoTokenizer.from_pretrained(target_model_name)
print("begin loading models")
draft_model = AutoModelForCausalLM.from_pretrained(
draft_model_name,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map='auto',
load_in_8bit=False
)
draft_model.resize_token_embeddings(len(tokenizer))
print(f"Load {draft_model_name}")
target_model = AutoModelForCausalLM.from_pretrained(
target_model_name,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map='auto',
load_in_8bit=False
)
print(f"Load {target_model_name}")
draft_model.eval()
target_model.eval()
print("finish loading models")
torch_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
input_ids = tokenizer.encode(inputs[0], return_tensors='pt').to(torch_device)
negative_inputs = tokenizer(negative_text,return_tensors="pt")
negative_prompt_ids = negative_inputs["input_ids"].to(torch_device)
negative_prompt_attention_mask = negative_inputs["attention_mask"].to(torch_device)
generation_config = GenerationConfig(
temperature=0.2,
top_k=40,
top_p=0.9,
do_sample=True,
num_beams=1,
repetition_penalty=1.1,
max_new_tokens=128
)
outputs = speculative_sample(
input_ids=input_ids,
target_model=target_model,
draft_model=draft_model,
generation_config=generation_config,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
# draft_k=4,
# guidance_scale=1.5,
# negative_prompt_ids=negative_prompt_ids,
# negative_prompt_attention_mask=negative_prompt_attention_mask,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
李白[注 1](701年5月19日—762年11月30日),字太白,号青莲居士,中国唐朝诗人。李白自言祖籍陇西成纪(今甘肃静宁西南),汉飞将军李广后裔,西凉武昭王李暠之后,与李唐皇室同宗。
一说其幼时内迁,寄籍剑南道绵州昌隆(今四川省江油市青莲镇)。一说先人隋末被窜于碎叶,出生于碎叶,属唐安西都护府(今吉尔吉斯斯坦共和国楚河州托克马克市)。有“诗仙”、“诗侠”、“酒仙”、“谪仙人”等称呼,活跃于盛唐[1],为杰出的浪漫主义诗人。与杜甫合称“李杜”[注 2]。被贺知章呼为“天上谪仙”、“李谪仙”。
李白的诗歌在唐朝已被选进殷璠编选的《河岳英灵集》、于敦煌石室发现的《唐写本唐人选唐诗》、韦庄编选的《又玄集》和韦縠编选的《才调集》。唐文宗御封李白的诗歌、裴旻的剑舞、张旭的草书称为“三绝”[2]。其作品想像奇特丰富,风格雄奇浪漫,意境独特,清新俊逸;善于利用夸饰与譬喻等手法、自然优美的词句,表现出奔放的情感。诗句行云流水,浑然天成。李白诗篇传诵千年,众多诗句已成经典,清赵翼称:“李杜诗篇万口传”(例如“抽刀断水水更流,举杯消愁愁更愁”等,更被谱入曲)。李白在诗歌的艺术成就被认为是中国浪漫主义诗歌的巅峰。诗作在全唐诗收录于卷161至卷185。有《李太白集》传世。杜甫曾经这样评价过李白的文章:“笔落惊风雨,诗成泣鬼神”、“白也诗无敌,飘然思不群”。
生平
早年
据《新唐书》记载李白为兴圣皇帝(凉武昭王李暠)九世孙[3],如果按照这个说法李白与李唐诸王实际上同宗,应是唐太宗李世民的同辈族弟。亦有野史说其祖是李建成或李元吉,因为被李世民族灭而逃往西域;但此说缺乏佐证,且李建成、李元吉诸子尚在幼年即在玄武门之变后全数被害,留有亲生后嗣的可能性很小。据《旧唐书》记载,李白之父李客为任城尉。更为了学习而隐居。
李白于武则天大足元年(701年)[4]出生,关于其出生地有多种说法,现在主要有剑南道绵州昌隆县(今四川省江油市)[5]青莲乡(今青莲镇)和西域的碎叶(Suyab,位于今吉尔吉斯托克马克附近)[6]这两种说法,其中后一种说法认为李白直到四岁时(705年)才跟随他的父亲李客迁居蜀地,入籍绵州。李白自四岁(705年)接受启蒙教育,从景云元年(710年)开始,李白开始读诸子史籍[7],开元三年时十四岁(715年)——喜好作赋、剑术、奇书、神仙:“十五观奇书,做赋凌相如”。在青年时期开始在中国各地游历。开元五年左右,李白曾拜撰写《长短经》的赵蕤为师,学习一年有余,这段时期的学习对李白产生了深远的影响。开元六年,在戴天山(约在四川省昌隆县北五十里处)大明寺读书。二十五岁时只身出四川,开始了广泛漫游,南到洞庭湘江,东至吴、越,寓居在安陆(今湖北省安陆市)、应山(今湖北省广水市)。
中年
李白曾经在唐玄宗天宝元年(742年)供奉翰林。有一次皇帝因酒酣问李白说:“我朝与天后(武后)之朝何如?”白曰:“天后朝政出多门,国由奸幸,任人之道,如小儿市瓜,不择香味,惟拣肥大者;我朝任人如淘沙取金,剖石采用,皆得其精粹者。”玄宗听后大笑不止[8][9]。但是由于他桀骜不驯的性格,所以仅仅不到两年他就离开了长安。据说是因为他作的《清平调》得罪了当时宠冠后宫的杨贵妃(因李白命“力士脱靴”,高力士引以为大耻,因而以言语诱使杨贵妃认为“可怜飞燕倚新妆”几句是讽刺她)而不容于宫中[注 3]。天宝三年(745年)“恳求还山,帝赐金放还”,离开长安。
后在洛阳与另两位著名诗人杜甫、高适相识,并结为好友。
晚年
天宝十一年(752年)李白年届五十二岁,北上途中游广平郡邯郸、临洺、清漳等地。十月,抵幽州。初有立功边疆思想,在边地习骑射。后发现安禄山野心,登黄金台痛哭。不久即离幽州南下。
安史之乱爆发时,李白游华山,南下回宣城,后上庐山。756年12月,李白被三次邀请,下山赴寻阳入永王李璘幕僚[10]。永王触怒唐肃宗被杀后,李白也获罪入狱。幸得郭子仪力保,方得免死,改为流徙夜郎(今贵州关岭县一带),在途经巫山时遇赦,此时他已经59岁。(参见李璘之乱)
李白晚年在江南一带漂泊。在他61岁时,听到太尉李光弼率领大军讨伐安史叛军,于是他北上准备追随李光弼从军杀敌,但是中途因病折回。第二年,李白投奔他的族叔、当时在当涂(今属安徽省马鞍山)当县令的李阳冰。同年11月,李白病逝于寓所,终年61岁,葬当涂龙山。唐宪宗元和十二年(817年),宣歙观察使范传正根据李白生前“志在青山”的遗愿,将其墓迁至当涂青山。
去世
《新唐书》记载,唐代宗继位后以左拾遗召李白,但李白当时已去世。
李阳冰在《草堂集序》中说李白是病死的[11];皮日休在诗作中记载,李白是患“腐胁疾”而死的[12]。
《旧唐书》则记载,李白流放虽然遇赦,但因途中饮酒过度,醉死于宣城。中国民间有“太白捞月”的传说:李白在舟中赏月,饮酒大醉,想要跳下船至水里捞月而溺死[13][14][15];在民间的求签活动中亦有“太白捞月”一签文,乃是下下签[16]。
作品
李白一生创作大量的诗歌,绝大多数已散佚[17],流传至今的只有九百多首。他的诗歌创作涉及的中国古典诗歌的题材非常广泛,而且在许多题材都有名作出现,而且因为际遇的不同,每个时期的诗风都有所不同。
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument('--file_path', required=True, type=str)
parser.add_argument('--embedding_path', required=True, type=str)
parser.add_argument('--model_path', required=True, type=str)
parser.add_argument('--gpu_id', default="0", type=str)
parser.add_argument('--chain_type', default="refine", type=str)
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
file_path = args.file_path
embedding_path = args.embedding_path
model_path = args.model_path
import torch
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
prompt_template = (
"[INST] <<SYS>>\n"
"You are a helpful assistant. 你是一个乐于助人的助手。\n"
"<</SYS>>\n\n"
"{context}\n{question} [/INST]"
)
refine_prompt_template = (
"[INST] <<SYS>>\n"
"You are a helpful assistant. 你是一个乐于助人的助手。\n"
"<</SYS>>\n\n"
"这是原始问题: {question}\n"
"已有的回答: {existing_answer}\n"
"现在还有一些文字,(如果有需要)你可以根据它们完善现有的回答。"
"\n\n"
"{context_str}\n"
"\n\n"
"请根据新的文段,进一步完善你的回答。"
" [/INST]"
)
initial_qa_template = (
"[INST] <<SYS>>\n"
"You are a helpful assistant. 你是一个乐于助人的助手。\n"
"<</SYS>>\n\n"
"以下为背景知识:\n"
"{context_str}"
"\n"
"请根据以上背景知识, 回答这个问题:{question}。"
" [/INST]"
)
if __name__ == '__main__':
load_type = torch.float16
if not torch.cuda.is_available():
raise RuntimeError("No CUDA GPUs are available.")
loader = TextLoader(file_path)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=600, chunk_overlap=100)
texts = text_splitter.split_documents(documents)
print("Loading the embedding model...")
embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
docsearch = FAISS.from_documents(texts, embeddings)
print("loading LLM...")
model = HuggingFacePipeline.from_model_id(model_id=model_path,
task="text-generation",
device=0,
pipeline_kwargs={
"max_new_tokens": 400,
"do_sample": True,
"temperature": 0.2,
"top_k": 40,
"top_p": 0.9,
"repetition_penalty": 1.1},
model_kwargs={
"torch_dtype": load_type,
"low_cpu_mem_usage": True,
"trust_remote_code": True}
)
if args.chain_type == "stuff":
PROMPT = PromptTemplate(
template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(
llm=model,
chain_type="stuff",
retriever=docsearch.as_retriever(search_kwargs={"k": 1}),
chain_type_kwargs=chain_type_kwargs)
elif args.chain_type == "refine":
refine_prompt = PromptTemplate(
input_variables=["question", "existing_answer", "context_str"],
template=refine_prompt_template,
)
initial_qa_prompt = PromptTemplate(
input_variables=["context_str", "question"],
template=initial_qa_template,
)
chain_type_kwargs = {"question_prompt": initial_qa_prompt, "refine_prompt": refine_prompt}
qa = RetrievalQA.from_chain_type(
llm=model, chain_type="refine",
retriever=docsearch.as_retriever(search_kwargs={"k": 1}),
chain_type_kwargs=chain_type_kwargs)
while True:
query = input("请输入问题:")
if len(query.strip())==0:
break
print(qa.run(query))
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument('--file_path', required=True, type=str)
parser.add_argument('--model_path', required=True, type=str)
parser.add_argument('--gpu_id', default="0", type=str)
parser.add_argument('--chain_type', default="refine", type=str)
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
file_path = args.file_path
model_path = args.model_path
import torch
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
prompt_template = (
"[INST] <<SYS>>\n"
"You are a helpful assistant. 你是一个乐于助人的助手。\n"
"<</SYS>>\n\n"
"请为以下文字写一段摘要:\n{text} [/INST]"
)
refine_template = (
"[INST] <<SYS>>\n"
"You are a helpful assistant. 你是一个乐于助人的助手。\n"
"<</SYS>>\n\n"
"已有一段摘要:{existing_answer}\n"
"现在还有一些文字,(如果有需要)你可以根据它们完善现有的摘要。"
"\n"
"{text}\n"
"\n"
"如果这段文字没有用,返回原来的摘要即可。请你生成一个最终的摘要。"
" [/INST]"
)
if __name__ == '__main__':
load_type = torch.float16
if not torch.cuda.is_available():
raise RuntimeError("No CUDA GPUs are available.")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100, length_function=len)
with open(file_path) as f:
text = f.read()
docs = text_splitter.create_documents([text])
print("loading LLM...")
model = HuggingFacePipeline.from_model_id(model_id=model_path,
task="text-generation",
device=0,
pipeline_kwargs={
"max_new_tokens": 400,
"do_sample": True,
"temperature": 0.2,
"top_k": 40,
"top_p": 0.9,
"repetition_penalty": 1.1},
model_kwargs={
"torch_dtype" : load_type,
"low_cpu_mem_usage" : True,
"trust_remote_code": True}
)
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
REFINE_PROMPT = PromptTemplate(
template=refine_template,input_variables=["existing_answer", "text"],
)
if args.chain_type == "stuff":
chain = load_summarize_chain(model, chain_type="stuff", prompt=PROMPT)
elif args.chain_type == "refine":
chain = load_summarize_chain(model, chain_type="refine", question_prompt=PROMPT, refine_prompt=REFINE_PROMPT)
print(chain.run(docs))
## llama.cpp相关示例脚本
具体使用方法参考:https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/llamacpp_zh
Detailed usage: https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/llamacpp_en
### chat.sh
用于与Alpaca-2系列模型进行对话交流。
Chat with Alpaca-2 models.
### server_curl_example.sh
架设server后使用curl调用示例。
An example to use curl for API calls after setting up server.
#!/bin/bash
# temporary script to chat with Chinese Alpaca-2 model
# usage: ./chat.sh alpaca2-ggml-model-path your-first-instruction
SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。'
# SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。请你提供专业、有逻辑、内容真实、有价值的详细回复。' # Try this one, if you prefer longer response.
MODEL_PATH=$1
FIRST_INSTRUCTION=$2
./main -m "$MODEL_PATH" \
--color -i -c 4096 -t 8 --temp 0.5 --top_k 40 --top_p 0.9 --repeat_penalty 1.1 \
--in-prefix-bos --in-prefix ' [INST] ' --in-suffix ' [/INST]' -p \
"[INST] <<SYS>>
$SYSTEM_PROMPT
<</SYS>>
$FIRST_INSTRUCTION [/INST]"
#!/bin/bash
# NOTE: start the server first before running this script.
# usage: ./server_curl_example.sh your-instruction
SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。'
# SYSTEM_PROMPT='You are a helpful assistant. 你是一个乐于助人的助手。请你提供专业、有逻辑、内容真实、有价值的详细回复。' # Try this one, if you prefer longer response.
INSTRUCTION=$1
ALL_PROMPT="[INST] <<SYS>>\n$SYSTEM_PROMPT\n<</SYS>>\n\n$INSTRUCTION [/INST]"
CURL_DATA="{\"prompt\": \"$ALL_PROMPT\",\"n_predict\": 128}"
curl --request POST \
--url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data "$CURL_DATA"
{
"narrativeqa": 128,
"qasper": 128,
"multifieldqa_en": 64,
"multifieldqa_zh": 64,
"hotpotqa": 32,
"2wikimqa": 32,
"musique": 32,
"dureader": 128,
"gov_report": 512,
"qmsum": 512,
"multi_news": 512,
"vcsum": 512,
"trec": 64,
"triviaqa": 32,
"samsum": 128,
"lsht": 64,
"passage_count": 32,
"passage_retrieval_en": 32,
"passage_retrieval_zh": 32,
"lcc": 64,
"repobench-p": 64
}
{
"narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"qasper": "You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
"hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
"gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
"qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
"multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
"vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
"trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
"triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
"samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
"lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
"passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
"passage_retrieval_en": "Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like \"Paragraph 1\", \"Paragraph 2\", etc.\n\nThe answer is: ",
"passage_retrieval_zh": "以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是\"段落1\"\"段落2\"等格式\n\n答案是:",
"lcc": "Please complete the code given below. \n{context}Next line of code:\n",
"repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n"
}
# The script is from https://github.com/THUDM/LongBench
import os
import json
import argparse
import numpy as np
from metrics import (
qa_f1_score,
rouge_zh_score,
qa_f1_zh_score,
rouge_score,
classification_score,
retrieval_score,
retrieval_zh_score,
count_score,
code_sim_score,
)
dataset2metric = {
"narrativeqa": qa_f1_score,
"qasper": qa_f1_score,
"multifieldqa_en": qa_f1_score,
"multifieldqa_zh": qa_f1_zh_score,
"hotpotqa": qa_f1_score,
"2wikimqa": qa_f1_score,
"musique": qa_f1_score,
"dureader": rouge_zh_score,
"gov_report": rouge_score,
"qmsum": rouge_score,
"multi_news": rouge_score,
"vcsum": rouge_zh_score,
"trec": classification_score,
"triviaqa": qa_f1_score,
"samsum": rouge_score,
"lsht": classification_score,
"passage_retrieval_en": retrieval_score,
"passage_count": count_score,
"passage_retrieval_zh": retrieval_zh_score,
"lcc": code_sim_score,
"repobench-p": code_sim_score,
}
def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--output_dir')
parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
return parser.parse_args(args)
def scorer_e(dataset, predictions, answers, lengths, all_classes):
scores = {"0-4k": [], "4-8k": [], "8k+": []}
for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
score = 0.
if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
prediction = prediction.lstrip('\n').split('\n')[0]
for ground_truth in ground_truths:
score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
if length < 4000:
scores["0-4k"].append(score)
elif length < 8000:
scores["4-8k"].append(score)
else:
scores["8k+"].append(score)
for key in scores.keys():
scores[key] = round(100 * np.mean(scores[key]), 2)
return scores
def scorer(dataset, predictions, answers, all_classes):
total_score = 0.
for (prediction, ground_truths) in zip(predictions, answers):
score = 0.
if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
prediction = prediction.lstrip('\n').split('\n')[0]
for ground_truth in ground_truths:
score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
total_score += score
return round(100 * total_score / len(predictions), 2)
if __name__ == '__main__':
args = parse_args()
scores = dict()
if args.e:
path = f"{args.output_dir}/pred_e/"
else:
path = f"{args.output_dir}/pred/"
all_files = os.listdir(path)
print("Evaluating on:", all_files)
for filename in all_files:
if not filename.endswith("jsonl"):
continue
predictions, answers, lengths = [], [], []
dataset = filename.split('.')[0]
with open(f"{path}{filename}", "r", encoding="utf-8") as f:
print(filename)
for line in f:
data = json.loads(line)
predictions.append(data["pred"])
answers.append(data["answers"])
all_classes = data["all_classes"]
if "length" in data:
lengths.append(data["length"])
if args.e:
score = scorer_e(dataset, predictions, answers, lengths, all_classes)
else:
score = scorer(dataset, predictions, answers, all_classes)
scores[dataset] = score
if args.e:
out_path = f"{args.output_dir}/pred_e/result.json"
else:
out_path = f"{args.output_dir}/pred/result.json"
with open(out_path, "w") as f:
json.dump(scores, f, ensure_ascii=False, indent=4)
# The script is from https://github.com/THUDM/LongBench
import re
import string
import jieba
from fuzzywuzzy import fuzz
import difflib
from collections import Counter
from rouge import Rouge
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def normalize_zh_answer(s):
"""Lower text and remove punctuation, extra whitespace."""
def white_space_fix(text):
return "".join(text.split())
def remove_punc(text):
cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
all_punctuation = set(string.punctuation + cn_punctuation)
return "".join(ch for ch in text if ch not in all_punctuation)
def lower(text):
return text.lower()
return white_space_fix(remove_punc(lower(s)))
def count_score(prediction, ground_truth, **kwargs):
numbers = re.findall(r"\d+", prediction)
right_num = 0
for number in numbers:
if str(number) == str(ground_truth):
right_num += 1
final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
return float(final_score)
def retrieval_score(prediction, ground_truth, **kwargs):
pattern = r'Paragraph (\d+)'
matches = re.findall(pattern, ground_truth)
ground_truth_id = matches[0]
numbers = re.findall(r"\d+", prediction)
right_num = 0
for number in numbers:
if str(number) == str(ground_truth_id):
right_num += 1
final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
return float(final_score)
def retrieval_zh_score(prediction, ground_truth, **kwargs):
pattern = r'段落(\d+)'
matches = re.findall(pattern, ground_truth)
ground_truth_id = matches[0]
numbers = re.findall(r"\d+", prediction)
right_num = 0
for number in numbers:
if str(number) == str(ground_truth_id):
right_num += 1
final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
return float(final_score)
def code_sim_score(prediction, ground_truth, **kwargs):
all_lines = prediction.lstrip('\n').split('\n')
prediction = ""
for line in all_lines:
if ('`' not in line) and ('#' not in line) and ('//' not in line):
prediction = line
break
return (fuzz.ratio(prediction, ground_truth) / 100)
def classification_score(prediction, ground_truth, **kwargs):
em_match_list = []
all_classes = kwargs["all_classes"]
for class_name in all_classes:
if class_name in prediction:
em_match_list.append(class_name)
for match_term in list(em_match_list):  # iterate over a copy since items may be removed
if match_term in ground_truth and match_term != ground_truth:
em_match_list.remove(match_term)
if len(em_match_list) != 0:
if ground_truth in em_match_list:
score = (1.0 / len(em_match_list))
else:
score = 0.0
else:
best_match = None
highest_similarity = 0
for string in all_classes:
similarity = difflib.SequenceMatcher(None, string, prediction).ratio()
if similarity > highest_similarity:
highest_similarity = similarity
best_match = string
score = float(best_match == ground_truth)
return score
def rouge_score(prediction, ground_truth, **kwargs):
rouge = Rouge()
try:
scores = rouge.get_scores([prediction], [ground_truth], avg=True)
except Exception:
return 0.0
return scores["rouge-l"]["f"]
def rouge_zh_score(prediction, ground_truth, **kwargs):
prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
score = rouge_score(prediction, ground_truth)
return score
def f1_score(prediction, ground_truth, **kwargs):
common = Counter(prediction) & Counter(ground_truth)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(ground_truth)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def qa_f1_score(prediction, ground_truth, **kwargs):
normalized_prediction = normalize_answer(prediction)
normalized_ground_truth = normalize_answer(ground_truth)
prediction_tokens = normalized_prediction.split()
ground_truth_tokens = normalized_ground_truth.split()
return f1_score(prediction_tokens, ground_truth_tokens)
def qa_f1_zh_score(prediction, ground_truth, **kwargs):
prediction_tokens = list(jieba.cut(prediction, cut_all=False))
ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
return f1_score(prediction_tokens, ground_truth_tokens)
# The script is modified from https://github.com/THUDM/LongBench/blob/main/pred.py
from datasets import load_dataset
import torch
import random
import numpy as np
import json
from transformers import LlamaTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from tqdm import tqdm
import os
import argparse
import sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from attn_and_long_ctx_patches import apply_attention_patch, apply_ntk_scaling_patch
dir_path = os.path.dirname(os.path.realpath(__file__))
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
TEMPLATE = (
"[INST] <<SYS>>\n"
"{system_prompt}\n"
"<</SYS>>\n\n"
"{instruction} [/INST]"
)
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str)
parser.add_argument('--load_in_4bit',action='store_true')
parser.add_argument('--load_in_8bit',action='store_true')
parser.add_argument('--predict_on',type=str, default='zh')
parser.add_argument('--output_dir',type=str, default='pred')
parser.add_argument('--gpus',type=str, default=None)
parser.add_argument('--max_length',type=int, default=4096-512)
parser.add_argument('--alpha', type=str, default="auto", help="The scaling factor of NTK method, can be a float or 'auto'. ")
parser.add_argument('--with_inst', choices=['true','false','auto'], default = 'false',
help="Whether use the system prompt and template of Chinese-Alpaca-2 when constructing the instructions.")
parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
parser.add_argument('--use_flash_attention_2', action='store_true', help="Use flash attention to replace the LLaMA attention")
parser.add_argument('--use_ntk', action='store_true', help="Use dynamic-ntk to extend context window")
args = parser.parse_args()
model_path = args.model_path
load_in_4bit = args.load_in_4bit
load_in_8bit = args.load_in_8bit
predict_on = args.predict_on
output_dir = args.output_dir
gpus=args.gpus
max_length = args.max_length
alpha = args.alpha
DO_SAMPLE =True
TEMPERATURE = 0.2
REPETITION_PENALTY = 1.1
TOP_P = 0.95
TOP_K = 40
if gpus is not None:
os.environ["CUDA_VISIBLE_DEVICES"] = gpus
apply_attention_patch(use_memory_efficient_attention=True)
if args.use_ntk:
apply_ntk_scaling_patch(args.alpha)
def fill_llama2_prompt_template(instruction, with_inst = True, with_system_prompt = True, system_prompt = DEFAULT_SYSTEM_PROMPT):
if with_inst is False:
return instruction
if with_system_prompt is True:
return TEMPLATE.format_map({'instruction': instruction,'system_prompt': system_prompt})
else:
return "[INST] {instruction} [/INST]"
def get_pred(model, tokenizer, data, max_length, max_gen, prompt_format, dataset, device):
preds = []
for json_obj in tqdm(data):
prompt = prompt_format.format(**json_obj)
# truncate to fit max_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions)
tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
if len(tokenized_prompt) > max_length:
half = int(max_length/2)
prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
if args.with_inst == 'auto':
if dataset not in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]: # chat models are better off without the built chat prompt on these tasks
prompt = fill_llama2_prompt_template(instruction=prompt)
elif args.with_inst == 'true':
prompt = fill_llama2_prompt_template(instruction=prompt, with_inst = True)
else:
prompt = fill_llama2_prompt_template(instruction=prompt, with_inst = False)
input_data = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
context_length = input_data.input_ids.shape[-1]
if dataset == "samsum": # prevent illegal output on samsum (model endlessly repeat "\nDialogue"), might be a prompting issue
output = model.generate(
**input_data,
max_new_tokens=max_gen,
num_beams=1,
do_sample=DO_SAMPLE,
repetition_penalty = REPETITION_PENALTY,
top_p = TOP_P,
top_k = TOP_K,
temperature=TEMPERATURE,
min_length=context_length+1,
eos_token_id=[tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
)[0]
else:
output = model.generate(
**input_data,
max_new_tokens=max_gen,
num_beams=1,
do_sample=DO_SAMPLE,
repetition_penalty = REPETITION_PENALTY,
top_p = TOP_P,
top_k = TOP_K,
temperature=TEMPERATURE
)[0]
pred = tokenizer.decode(output[context_length:], skip_special_tokens=True)
#print(pred)
preds.append({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]})
return preds
def seed_everything(seed):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed_all(seed)
if __name__ == '__main__':
seed_everything(42)
load_type = torch.float16
if torch.cuda.is_available():
device = torch.device(0)
else:
device = torch.device('cpu')
if args.e:
en_datasets = [ "hotpotqa","2wikimqa",
"qasper", "multifieldqa_en", "gov_report",
"trec", "samsum", "triviaqa",
"passage_count", "passage_retrieval_en", "multi_news"]
zh_datasets = []
code_datasets = [ "lcc", "repobench-p" ]
if not os.path.exists(f"{output_dir}/pred_e"):
os.makedirs(f"{output_dir}/pred_e")
else:
en_datasets = [ "hotpotqa","2wikimqa", "musique", "narrativeqa",
"qasper", "multifieldqa_en", "gov_report",
"qmsum", "trec", "samsum", "triviaqa",
"passage_count", "passage_retrieval_en", "multi_news"]
zh_datasets = [ "dureader", "multifieldqa_zh",
"vcsum","lsht", "passage_retrieval_zh"]
code_datasets = [ "lcc", "repobench-p" ]
if not os.path.exists(f"{output_dir}/pred"):
os.makedirs(f"{output_dir}/pred")
datasets = []
for data_type in predict_on.split(','):
if data_type == 'zh':
datasets += zh_datasets
elif data_type == 'en':
datasets += en_datasets
elif data_type == 'code':
datasets += code_datasets
print(datasets)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True)
model = None
if args.load_in_4bit or args.load_in_8bit:
quantization_config = BitsAndBytesConfig(
load_in_4bit=args.load_in_4bit,
load_in_8bit=args.load_in_8bit,
bnb_4bit_compute_dtype=load_type,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=load_type,
low_cpu_mem_usage=True,
device_map='auto',
quantization_config=quantization_config if (args.load_in_4bit or args.load_in_8bit) else None,
use_flash_attention_2=args.use_flash_attention_2,
trust_remote_code=True
)
model = model.eval()
model_vocab_size = model.get_input_embeddings().weight.size(0)
print(f"Vocab of the base model: {model_vocab_size}")
tokenizer_vocab_size = len(tokenizer)
print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
# we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output
dataset2prompt = json.load(open(dir_path + "/config/dataset2prompt.json", "r"))
dataset2maxlen = json.load(open(dir_path + "/config/dataset2maxlen.json", "r"))
# predict on each dataset
for dataset in datasets:
print(f"Loading dataset {dataset}")
if args.e:
data = load_dataset('THUDM/LongBench', dataset+'_e', split='test')
output_path = f"{output_dir}/pred_e/{dataset}.jsonl"
else:
data = load_dataset('THUDM/LongBench', dataset, split='test')
output_path = f"{output_dir}/pred/{dataset}.jsonl"
prompt_format = dataset2prompt[dataset]
max_gen = dataset2maxlen[dataset]
preds = get_pred(model, tokenizer, data, max_length, max_gen, prompt_format, dataset, device)
with open(output_path, "w", encoding="utf-8") as f:
for pred in preds:
json.dump(pred, f, ensure_ascii=False)
f.write('\n')
datasets
tqdm
rouge
jieba
fuzzywuzzy
torch
transformers==4.35.0
einops
"""
Usage:
python merge_llama2_with_chinese_lora_low_mem.py \
--base_model path/to/llama2-hf-model \
--lora_model path/to/chinese-llama2-or-alpaca2-lora \
--output_type [huggingface|pth] \
--output_dir path/to/output-dir
"""
import argparse
import json
import os
import gc
import torch
import peft
from transformers import LlamaTokenizer
from transformers.modeling_utils import dtype_byte_size
from huggingface_hub import snapshot_download
import re
import shutil
parser = argparse.ArgumentParser(description='Script to merge Llama-2-hf with Chinese LLaMA-2 or Alpaca-2 LoRA weights')
parser.add_argument('--base_model', default=None, required=True,
type=str, help="Base model path (basically Llama-2-hf)")
parser.add_argument('--lora_model', default=None, required=True,
type=str, help="LoRA model path (Chinese-LLaMA-2-LoRA, Chinese-Alpaca-2-LoRA)")
parser.add_argument('--output_type', default='huggingface',choices=['huggingface', 'pth'],
type=str, help="Output model type can be 'huggingface' (default) or 'pth' format")
parser.add_argument('--output_dir', default='./merged_model',
type=str, help="Output path for the merged model")
parser.add_argument('--verbose', default=False, action='store_true',
help="Show detailed debugging messages")
layers_to_model_size = {
4 : '1.3B',
32 : '7B',
40 : '13B',
80 : '70B',
}
num_shards_of_models = {'1.3B': 1, '7B': 1, '13B': 2, '70B': 8}
params_of_models = {
'1.3B':
{
"dim": 4096,
"multiple_of": 256,
"n_heads": 32,
"n_layers": 4,
"norm_eps": 1e-05,
"vocab_size": -1,
},
'7B':
{
"dim": 4096,
"multiple_of": 256,
"n_heads": 32,
"n_layers": 32,
"norm_eps": 1e-05,
"vocab_size": -1,
},
'13B':
{
"dim": 5120,
"multiple_of": 256,
"n_heads": 40,
"n_layers": 40,
"norm_eps": 1e-05,
"vocab_size": -1,
},
'70B':
{
"dim": 8192,
"multiple_of": 4096,
"ffn_dim_multiplier": 1.3,
"n_heads": 64,
"n_kv_heads": 8,
"n_layers": 80,
"norm_eps": 1e-05,
"vocab_size": -1,
},
}
def transpose(weight, fan_in_fan_out):
return weight.T if fan_in_fan_out else weight
def jsonload(filename):
with open(filename, "r") as file:
d = json.load(file)
return d
# Borrowed and modified from https://github.com/tloen/alpaca-lora
def translate_state_dict_key(k):
k = k.replace("base_model.model.", "")
if k == "model.embed_tokens.weight":
return "tok_embeddings.weight"
elif k == "model.norm.weight":
return "norm.weight"
elif k == "lm_head.weight":
return "output.weight"
elif k.startswith("model.layers."):
layer = k.split(".")[2]
if k.endswith(".self_attn.q_proj.weight"):
return f"layers.{layer}.attention.wq.weight"
elif k.endswith(".self_attn.k_proj.weight"):
return f"layers.{layer}.attention.wk.weight"
elif k.endswith(".self_attn.v_proj.weight"):
return f"layers.{layer}.attention.wv.weight"
elif k.endswith(".self_attn.o_proj.weight"):
return f"layers.{layer}.attention.wo.weight"
elif k.endswith(".mlp.gate_proj.weight"):
return f"layers.{layer}.feed_forward.w1.weight"
elif k.endswith(".mlp.down_proj.weight"):
return f"layers.{layer}.feed_forward.w2.weight"
elif k.endswith(".mlp.up_proj.weight"):
return f"layers.{layer}.feed_forward.w3.weight"
elif k.endswith(".input_layernorm.weight"):
return f"layers.{layer}.attention_norm.weight"
elif k.endswith(".post_attention_layernorm.weight"):
return f"layers.{layer}.ffn_norm.weight"
elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
return None
else:
print(layer, k)
raise NotImplementedError
else:
print(k)
raise NotImplementedError
def unpermute(w):
return (
w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
)
def save_shards(model_sd, num_shards: int, prefix="", verbose=False):
"""
Convert and save the HF format weights to PTH format weights
"""
with torch.no_grad():
if num_shards == 1:
new_state_dict = {}
for k, v in model_sd.items():
new_k = translate_state_dict_key(k)
if new_k is not None:
if "wq" in new_k or "wk" in new_k:
new_state_dict[new_k] = unpermute(v)
else:
new_state_dict[new_k] = v
os.makedirs(output_dir, exist_ok=True)
print(f"Saving shard 1 of {num_shards} into {output_dir}/{prefix}consolidated.00.pth")
torch.save(new_state_dict, output_dir + f"/{prefix}consolidated.00.pth")
else:
new_state_dicts = [dict() for _ in range(num_shards)]
for k in list(model_sd.keys()):
v = model_sd[k]
new_k = translate_state_dict_key(k)
if new_k is not None:
if new_k=='tok_embeddings.weight':
assert v.size(1)%num_shards==0
splits = v.split(v.size(1)//num_shards,dim=1)
elif new_k=='output.weight':
if v.size(0)%num_shards==0:
splits = v.split(v.size(0)//num_shards,dim=0)
else:
size_list = [v.size(0)//num_shards] * num_shards
size_list[-1] += v.size(0)%num_shards
splits = v.split(size_list, dim=0) # 13B: size_list == [24976,24977]
elif new_k=='norm.weight':
splits = [v] * num_shards
elif 'ffn_norm.weight' in new_k:
splits = [v] * num_shards
elif 'attention_norm.weight' in new_k:
splits = [v] * num_shards
elif 'w1.weight' in new_k:
splits = v.split(v.size(0)//num_shards,dim=0)
elif 'w2.weight' in new_k:
splits = v.split(v.size(1)//num_shards,dim=1)
elif 'w3.weight' in new_k:
splits = v.split(v.size(0)//num_shards,dim=0)
elif 'wo.weight' in new_k:
splits = v.split(v.size(1)//num_shards,dim=1)
elif 'wv.weight' in new_k:
splits = v.split(v.size(0)//num_shards,dim=0)
elif "wq.weight" in new_k or "wk.weight" in new_k:
v = unpermute(v)
splits = v.split(v.size(0)//num_shards,dim=0)
else:
print(f"Unexpected key {new_k}")
raise ValueError
if verbose:
print(f"Processing {new_k}")
for sd,split in zip(new_state_dicts,splits):
sd[new_k] = split.clone()
del split
del splits
del model_sd[k],v
gc.collect() # Effectively enforce garbage collection
os.makedirs(output_dir, exist_ok=True)
for i,new_state_dict in enumerate(new_state_dicts):
print(f"Saving shard {i+1} of {num_shards} into {output_dir}/{prefix}consolidated.0{i}.pth")
torch.save(new_state_dict, output_dir + f"/{prefix}consolidated.0{i}.pth")
def merge_shards(output_dir, num_shards: int):
ckpt_filenames = sorted([f for f in os.listdir(output_dir) if re.match(r'L(\d+)-consolidated.(\d+).pth',f)])
for i in range(num_shards):
shards_filenames = sorted([f for f in ckpt_filenames if re.match(rf'L(\d+)-consolidated.0{i}.pth',f)])
print(f"Loading {shards_filenames} ...")
shards_dicts = [torch.load(os.path.join(output_dir,fn)) for fn in shards_filenames]
shards_merged = {}
for d in shards_dicts:
shards_merged |= d
print(f"Saving the merged shard to " + os.path.join(output_dir, f"consolidated.0{i}.pth"))
torch.save(shards_merged, os.path.join(output_dir, f"consolidated.0{i}.pth"))
print("Cleaning up...")
del shards_merged
for d in shards_dicts:
del d
del shards_dicts
gc.collect() # Effectively enforce garbage collection
for fn in shards_filenames:
os.remove(os.path.join(output_dir,fn))
if __name__=='__main__':
args = parser.parse_args()
base_model_path = args.base_model
lora_model_path = args.lora_model
output_dir = args.output_dir
output_type = args.output_type
os.makedirs(output_dir, exist_ok=True)
print(f"="*80)
print(f"Base model: {base_model_path}")
print(f"LoRA model: {lora_model_path}")
tokenizers_and_loras = []
print(f"Loading {lora_model_path}")
if not os.path.exists(lora_model_path):
print("Cannot find lora model on the disk. Downloading lora model from hub...")
lora_model_path = snapshot_download(repo_id=lora_model_path)
tokenizer = LlamaTokenizer.from_pretrained(lora_model_path, legacy=True)
lora_config = peft.LoraConfig.from_pretrained(lora_model_path)
lora_state_dict = torch.load(os.path.join(lora_model_path,'adapter_model.bin'),map_location='cpu')
if 'base_model.model.model.embed_tokens.weight' in lora_state_dict:
lora_vocab_size = lora_state_dict['base_model.model.model.embed_tokens.weight'].shape[0]
assert lora_vocab_size == len(tokenizer), \
(f"The vocab size of the tokenizer {len(tokenizer)} does not match the vocab size of the LoRA weight {lora_vocab_size}!\n")
tokenizers_and_loras.append(
{
"tokenizer" :tokenizer,
"state_dict" :lora_state_dict,
"config": lora_config,
"scaling": lora_config.lora_alpha / lora_config.r,
"fan_in_fan_out" : lora_config.fan_in_fan_out,
})
if not os.path.exists(base_model_path):
print("Cannot find lora model on the disk. Downloading lora model from hub...")
base_model_path = snapshot_download(repo_id=base_model_path)
if os.path.exists(os.path.join(base_model_path, "pytorch_model.bin")):
ckpt_filenames = ["pytorch_model.bin"]
else:
ckpt_filenames = sorted([f for f in os.listdir(base_model_path) if re.match(r'pytorch_model-(\d+)-of-(\d+).bin',f)])
if len(ckpt_filenames) == 0:
raise FileNotFoundError(f"Cannot find base model checkpoints in {base_model_path}. Please make sure the checkpoints are saved in the HF format.")
layers = jsonload(os.path.join(base_model_path, "config.json"))["num_hidden_layers"]
model_size = None
total_size = 0
for index, filename in enumerate(ckpt_filenames):
print(f"Loading ckpt {filename}")
state_dict = torch.load(os.path.join(base_model_path,filename), map_location='cpu')
if index == 0:
model_size = layers_to_model_size[layers]
if output_type == 'pth':
params = params_of_models[model_size]
num_shards = num_shards_of_models[model_size]
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
print("Merging...")
for k in state_dict:
for tl_idx, t_and_l in enumerate(tokenizers_and_loras):
saved_key = 'base_model.model.'+k
lora_key_A = saved_key.replace('.weight','.lora_A.weight')
if saved_key in t_and_l['state_dict']:
if args.verbose:
print(f"copying {saved_key} from {tl_idx}-th LoRA weight to {k}")
state_dict[k] = t_and_l['state_dict'][saved_key].half().clone() # do we need half()?
if lora_key_A in t_and_l['state_dict']:
lora_key_B = lora_key_A.replace('lora_A.weight','lora_B.weight')
if args.verbose:
print(f"merging {lora_key_A} and lora_B.weight form {tl_idx}-th LoRA weight to {k}")
state_dict[k] += (
transpose(
t_and_l['state_dict'][lora_key_B].float()
@ t_and_l['state_dict'][lora_key_A].float(), t_and_l['fan_in_fan_out']) * t_and_l['scaling']
)
weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype)
total_size += weight_size
if output_type == 'huggingface':
print(f"Saving ckpt {filename} to {output_dir} in HF format...")
torch.save(state_dict,os.path.join(output_dir, filename))
elif output_type == 'pth':
print(f"Converting to pth format...")
save_shards(model_sd=state_dict, num_shards=num_shards,prefix=f"L{index+1}-", verbose=args.verbose)
del state_dict
gc.collect() # Effectively enforce garbage collection
print(f"Saving tokenizer")
tokenizers_and_loras[-1]['tokenizer'].save_pretrained(output_dir)
if output_type == 'pth':
with open(output_dir + "/params.json", "w") as f:
print(f"Saving params.json into {output_dir}/params.json")
json.dump(params, f)
merge_shards(output_dir, num_shards=num_shards)
if output_type=='huggingface':
configs = ('config.json', 'generation_config.json', 'pytorch_model.bin.index.json')
if model_size == "1.3B":
configs = ('config.json', 'generation_config.json')
for config in configs:
if os.path.exists(os.path.join(lora_model_path, config)):
print(f"Saving {config} from {lora_model_path}")
with open(os.path.join(lora_model_path, config),'r') as f:
obj = json.load(f)
else:
print(f"Saving {config} from {base_model_path}")
with open(os.path.join(base_model_path, config),'r') as f:
obj = json.load(f)
if config == 'config.json':
obj['vocab_size'] = len(tokenizers_and_loras[-1]['tokenizer'])
if config == 'pytorch_model.bin.index.json':
obj['metadata']['total_size'] = total_size
with open(os.path.join(output_dir, config), 'w') as f:
json.dump(obj, f, indent=2)
for f in os.listdir(lora_model_path):
if re.match("(.*).py", f):
shutil.copy2(os.path.join(lora_model_path, f), output_dir)
print("Done.")
print(f"Check output dir: {output_dir}")
# OPENAI API DEMO
> More detailed OPENAI API information: <https://platform.openai.com/docs/api-reference>
This is a simple OPENAI-API-style server DEMO implemented with fastapi. You can use this API DEMO to quickly build personal websites based on Chinese large language models, as well as other interesting WEB DEMOs.
## Deployment
Install dependencies
``` shell
pip install fastapi uvicorn shortuuid sse_starlette
```
Start the server
``` shell
python scripts/openai_server_demo/openai_api_server.py --base_model /path/to/base_model --lora_model /path/to/lora_model --gpus 0,1
```
### Arguments
`--base_model {base_model}`: Directory that holds the LLaMA-2 model weights and configuration files in HF format. It can be a merged Chinese Alpaca-2 model (in which case `--lora_model` is not required), or the original LLaMA-2 model converted to HF format (in which case `--lora_model` must be provided).
`--lora_model {lora_model}`: Directory of the extracted Chinese Alpaca-2 LoRA files, or a 🤗Model Hub model name. If this argument is omitted, only the model specified by `--base_model` is loaded.
`--tokenizer_path {tokenizer_path}`: Directory that holds the corresponding tokenizer. If omitted, it defaults to `--lora_model`; if `--lora_model` is also omitted, it defaults to `--base_model`.
`--only_cpu`: Run inference on CPU only.
`--gpus {gpu_ids}`: IDs of the GPUs to use, default 0. For multiple GPUs, separate the IDs with commas, e.g. 0,1,2.
`--load_in_8bit`: Run inference with the model in 8-bit mode, which saves GPU memory but may affect output quality.
`--alpha {alpha}`: Coefficient of the NTK method for extending the context length, which increases the maximum input length that can be handled. Default is 1. If unsure, keep the default or set it to `"auto"`.
`--use_ntk`: Extend the context length with the NTK method. Only applies to the base and 16K versions; the 64K version does not need this flag.
`--use_flash_attention_2`: Use flash-attention2 to speed up inference.
## API Documentation
### Completion
> On the Chinese translation of "completion": Professor Hung-yi Lee (李宏毅) renders it as 文字接龙 ("word chain") <https://www.youtube.com/watch?v=yiY4nPOzJEg>
The most basic API endpoint: given a prompt, it returns the language model's completion of that prompt.
The API DEMO has a built-in prompt template; the prompt is inserted into the instruction template, so the prompt entered here should read more like an instruction than a chat message. A minimal sketch of this wrapping is shown below.
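As a rough illustration only (not the server's actual code), the wrapping is equivalent to the following sketch, which reuses the Alpaca-2 `TEMPLATE` that appears in the other scripts of this project:
``` python
# Minimal sketch of how a raw prompt may be wrapped into the Alpaca-2 instruction
# template before being fed to the model. It mirrors the TEMPLATE used elsewhere
# in this repo; the server-side implementation may differ in details.
DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant. 你是一个乐于助人的助手。"
TEMPLATE = (
    "[INST] <<SYS>>\n"
    "{system_prompt}\n"
    "<</SYS>>\n\n"
    "{instruction} [/INST]"
)

def wrap_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    # The user-supplied prompt is treated as an instruction, not as free chat text.
    return TEMPLATE.format_map({"instruction": prompt, "system_prompt": system_prompt})

print(wrap_prompt("告诉我中国的首都在哪里"))
```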
#### Quick start with the completion endpoint
Request command:
``` shell
curl http://localhost:19327/v1/completions \
-H "Content-Type: application/json" \
-d '{
"prompt": "告诉我中国的首都在哪里"
}'
```
JSON response:
``` json
{
"id": "cmpl-3watqWsbmYgbWXupsSik7s",
"object": "text_completion",
"created": 1686067311,
"model": "chinese-llama-alpaca-2",
"choices": [
{
"index": 0,
"text": "中国的首都是北京。"
}
]
}
```
#### Advanced parameters for the completion endpoint
Request command:
``` shell
curl http://localhost:19327/v1/completions \
-H "Content-Type: application/json" \
-d '{
"prompt": "告诉我中国和美国分别各有哪些优点缺点",
"max_tokens": 90,
"temperature": 0.7,
"num_beams": 4,
"top_k": 40
}'
```
JSON response:
``` json
{
"id": "cmpl-PvVwfMq2MVWHCBKiyYJfKM",
"object": "text_completion",
"created": 1686149471,
"model": "chinese-llama-alpaca-2",
"choices": [
{
"index": 0,
"text": "中国的优点是拥有丰富的文化和历史,而美国的优点是拥有先进的科技和经济体系。"
}
]
}
```
#### Description of the advanced completion parameters
> For more details on decoding strategies, see <https://towardsdatascience.com/the-three-decoding-methods-for-nlp-23ca59cb1e9d>, which explains the three decoding strategies used with LLaMA: Greedy Decoding, Random Sampling and Beam Search. These strategies underlie advanced parameters such as top_k, top_p, temperature and num_beams. A simplified sketch of how these parameters interact during sampling is given below.
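For intuition only, the sketch below shows in simplified form how `temperature`, `top_k` and `top_p` are typically combined when sampling the next token from a logits vector; it is illustrative and not the decoding code used by the server:
``` python
import torch

def sample_next_token(logits: torch.Tensor, temperature=1.0, top_k=0, top_p=1.0) -> int:
    # logits: 1-D tensor of vocabulary scores for the next position (modified in place).
    logits = logits / max(temperature, 1e-5)          # temperature scaling
    if top_k > 0:                                     # keep only the k highest-scoring tokens
        kth = torch.topk(logits, top_k).values[-1]
        logits[logits < kth] = float("-inf")
    if top_p < 1.0:                                   # nucleus (top-p) filtering
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        to_remove = cum_probs > top_p
        to_remove[1:] = to_remove[:-1].clone()        # keep the token that crosses top_p
        to_remove[0] = False                          # always keep the most likely token
        logits[sorted_idx[to_remove]] = float("-inf")
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()   # random sampling

# Greedy decoding (num_beams=1, do_sample=false) is simply: int(torch.argmax(logits))
```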
`prompt`: The prompt for which to generate a completion.
`max_tokens`: The maximum number of newly generated tokens.
`temperature`: Sampling temperature between 0 and 2. Higher values such as 0.8 make the output more random, while lower values such as 0.2 make it more deterministic. The higher the temperature, the more likely random sampling is used for decoding.
`num_beams`: When the search strategy is beam search, this is the number of beams used; with num_beams=1 it degenerates to greedy decoding.
`top_k`: With random sampling, the top_k highest-probability tokens are kept as candidates to sample from.
`top_p`: With random sampling, the smallest set of highest-probability tokens whose cumulative probability exceeds top_p is kept as candidates; the lower the value, the less random the output. For example, with top_p set to 0.6 and the five most probable tokens having probabilities {0.23, 0.20, 0.18, 0.11, 0.10}, the cumulative probability of the first three tokens is 0.61, so the fourth token is filtered out and only the first three remain as candidates.
`repetition_penalty`: Repetition penalty; see this paper for details: <https://arxiv.org/pdf/1909.05858.pdf>
`do_sample`: Enable random sampling. Default is true. (An equivalent Python request using these parameters is sketched below.)
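For readers who prefer Python over curl, here is an equivalent sketch of a completion request using these parameters; it assumes the demo server is running locally on port 19327 (as above) and that the `requests` package is installed:
``` python
import requests

# Sketch of a /v1/completions request with advanced decoding parameters.
payload = {
    "prompt": "告诉我中国和美国分别各有哪些优点缺点",
    "max_tokens": 128,
    "temperature": 0.7,
    "top_k": 40,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "do_sample": True,
}
resp = requests.post("http://localhost:19327/v1/completions", json=payload)
print(resp.json()["choices"][0]["text"])
```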
### Chat (chat completion)
The chat endpoint supports multi-turn conversations.
#### Quick start with the chat endpoint
Request command:
``` shell
curl http://localhost:19327/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user","content": "给我讲一些有关杭州的故事吧"}
],
"repetition_penalty": 1.0
}'
```
JSON response:
``` json
{
"id": "chatcmpl-5L99pYoW2ov5ra44Ghwupt",
"object": "chat.completion",
"created": 1686143170,
"model": "chinese-llama-alpaca-2",
"choices": [
{
"index": 0,
"message": {
"role": "user",
"content": "给我讲一些有关杭州的故事吧"
}
},
{
"index": 1,
"message": {
"role": "assistant",
"content": "好的,请问您对杭州有什么特别的偏好吗?"
}
}
]
}
```
#### Multi-turn conversation
Request command:
``` shell
curl http://localhost:19327/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user","content": "给我讲一些有关杭州的故事吧"},
{"role": "assistant","content": "好的,请问您对杭州有什么特别的偏好吗?"},
{"role": "user","content": "我比较喜欢和西湖,可以给我讲一下西湖吗"}
],
"repetition_penalty": 1.0
}'
```
JSON response:
``` json
{
"id": "chatcmpl-hmvrQNPGYTcLtmYruPJbv6",
"object": "chat.completion",
"created": 1686143439,
"model": "chinese-llama-alpaca-2",
"choices": [
{
"index": 0,
"message": {
"role": "user",
"content": "给我讲一些有关杭州的故事吧"
}
},
{
"index": 1,
"message": {
"role": "assistant",
"content": "好的,请问您对杭州有什么特别的偏好吗?"
}
},
{
"index": 2,
"message": {
"role": "user",
"content": "我比较喜欢和西湖,可以给我讲一下西湖吗"
}
},
{
"index": 3,
"message": {
"role": "assistant",
"content": "是的,西湖是杭州最著名的景点之一,它被誉为“人间天堂”。 <\\s>"
}
}
]
}
```
#### Description of the advanced chat parameters
`prompt`: The prompt for which to generate a completion.
`max_tokens`: The maximum number of newly generated tokens.
`temperature`: Sampling temperature between 0 and 2. Higher values such as 0.8 make the output more random, while lower values such as 0.2 make it more deterministic. The higher the temperature, the more likely random sampling is used for decoding.
`num_beams`: When the search strategy is beam search, this is the number of beams used; with num_beams=1 it degenerates to greedy decoding.
`top_k`: With random sampling, the top_k highest-probability tokens are kept as candidates to sample from.
`top_p`: With random sampling, the smallest set of highest-probability tokens whose cumulative probability exceeds top_p is kept as candidates; the lower the value, the less random the output. For example, with top_p set to 0.6 and the five most probable tokens having probabilities [0.23, 0.20, 0.18, 0.11, 0.10], the cumulative probability of the first three tokens is 0.61, so the fourth token is filtered out and only the first three remain as candidates.
`repetition_penalty`: Repetition penalty; see this paper for details: <https://arxiv.org/pdf/1909.05858.pdf>
`do_sample`: Enable random sampling. Default is true.
`stream`: OpenAI-style streaming output. Default is false; when set to true, responses are streamed in the OpenAI format, so the server can serve as a drop-in backend for any ChatGPT-based application. (A Python sketch of a multi-turn request follows below.)
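A Python sketch of a multi-turn, non-streaming chat request, assuming the same local server and the `requests` package (the response format follows the JSON examples above):
``` python
import requests

# Sketch of a multi-turn /v1/chat/completions request (non-streaming).
payload = {
    "messages": [
        {"role": "user", "content": "给我讲一些有关杭州的故事吧"},
        {"role": "assistant", "content": "好的,请问您对杭州有什么特别的偏好吗?"},
        {"role": "user", "content": "我比较喜欢和西湖,可以给我讲一下西湖吗"},
    ],
    "repetition_penalty": 1.0,
    "stream": False,
}
resp = requests.post("http://localhost:19327/v1/chat/completions", json=payload)
# Per the JSON examples above, the newly generated assistant message is the last choice.
print(resp.json()["choices"][-1]["message"]["content"])
```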
### Text embedding
Text embeddings have many uses, including but not limited to question answering over large documents, summarizing the contents of a book, and finding the stored memory closest to the current user input for a large language model.
Request command:
``` shell
curl http://localhost:19327/v1/embeddings \
-H "Content-Type: application/json" \
-d '{
"input": "今天天气真不错"
}'
```
JSON response:
``` json
{
"object": "list",
"data": [
{
"object": "embedding",
"embedding": [
0.003643923671916127,
-0.0072653163224458694,
0.0075545101426541805,
....,
0.0045851171016693115
],
"index": 0
}
],
"model": "chinese-llama-alpaca-2"
}
```
The length of the embedding vector equals the hidden size of the model in use; for example, with the 7B model the embedding length is 4096. A small usage sketch follows below.
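As a small usage sketch (assuming the same local server and the `requests` and `numpy` packages), embeddings returned by this endpoint can be compared with cosine similarity, for example to rank candidate texts against a query:
``` python
import numpy as np
import requests

def get_embedding(text: str) -> np.ndarray:
    # Query the /v1/embeddings endpoint of the local demo server.
    resp = requests.post("http://localhost:19327/v1/embeddings", json={"input": text})
    return np.array(resp.json()["data"][0]["embedding"])

a = get_embedding("今天天气真不错")
b = get_embedding("今天的天气很好")
cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity: {cosine:.4f}")
```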