Commit 7630c1fc authored by wanglch's avatar wanglch
Browse files

Initial commit

parents
Pipeline #1296 canceled with stages
#!/bin/bash
# LoRA SFT fine-tuning of Tongyi-Finance-14B-Chat on the fingpt_sentiment
# dataset, launched via DeepSpeed on the four local GPUs. A random free-ish
# port is picked so several jobs can coexist on one host.

MASTER_PORT=$(shuf -n 1 -i 10000-65535)

deepspeed --master_port "${MASTER_PORT}" --include="localhost:0,1,2,3" ./LLaMA-Factory/src/train_bash.py \
    --deepspeed ./LLaMA-Factory/deepspeed.json \
    --stage sft \
    --do_train \
    --model_name_or_path ./Tongyi-Finance-14B-Chat \
    --dataset fingpt_sentiment \
    --dataset_dir ./LLaMA-Factory/data \
    --template qwen \
    --finetuning_type lora \
    --lora_target all \
    --output_dir ./saves/Tongyi-Finance-14B-Chat/lora_multi_dtk/sft \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 10 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 1.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --ddp_timeout 180000000 \
    --plot_loss True \
    --fp16
import dashscope
from dashscope import TextEmbedding
import numpy as np
from typing import Union, List
import logging
import os
import yaml
# Module-wide logging: timestamped INFO-level records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Load DashScope connection settings (api_key, base_http_api_url) from
# configs/emb_config.yaml next to this module; read once at import time.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs/emb_config.yaml"), 'r') as file:
    ds_config = yaml.safe_load(file).get("dashscope_config")
def generate_embedding(text, embedding_model="dashscope", **kwargs):
    """Embed `text` (a string or a list of strings) via the DashScope API.

    Returns a 1-D numpy array for a single string, a 2-D array for a list,
    or None when the service call fails.
    """
    # todo: support more embedding model in the future, e.g., m3e model.
    dashscope.api_key = ds_config["api_key"]
    dashscope.base_http_api_url = ds_config['base_http_api_url']
    try:
        rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1, input=text)
        # On a failed request rsp.output is None (TypeError below); KeyError
        # additionally guards against an unexpected response schema.
        embeddings = np.array([record['embedding'] for record in rsp.output['embeddings']])
        if isinstance(text, str):
            embeddings = embeddings[0]
    except (TypeError, KeyError) as e:
        logger.warning(f"Request dashscope embedding service failed, error info {e}")
        embeddings = None
    return embeddings
def cosine_distance(a, b):
    """
    Only support `a` is an embedding vector, `b` is a vector or matrix.
    """
    numerator = np.dot(a, b.T)
    denominator = np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1)
    return numerator / denominator
def l2_distance(a, b):
    """Euclidean distance between `a` and `b` along the last axis."""
    return np.linalg.norm(a - b, axis=-1)
def similarity_match(query: str, corpus: Union[str, List], dist_type="cosine"):
    """Embed `query` and `corpus`, then return their pairwise distance(s).

    An unrecognized `dist_type` falls back to cosine; returns None when
    either embedding request fails.
    """
    if dist_type not in ("cosine", "l2"):
        logger.warning(f"invalid input distance type, {dist_type}, setting to cosine distance")
        dist_type = "cosine"
    query_emb = generate_embedding(query)
    corpus_emb = generate_embedding(corpus)
    if query_emb is None or corpus_emb is None:
        return None
    if dist_type == "l2":
        return l2_distance(query_emb, corpus_emb)
    return cosine_distance(query_emb, corpus_emb)
if __name__ == "__main__":
    # Smoke test: needs valid DashScope credentials in configs/emb_config.yaml
    # and network access; prints the query/corpus distance scores.
    queries = "请问贵州茅台最近股价如何"
    context = ["完美世界近期市场波动较大", "茅台和五粮液作为消费龙头, 2020年整体表现优于沪深300指数"]
    print(similarity_match(queries, context))
# coding=utf-8
import json
import re
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import Levenshtein
import jsonlines
import os
import sys
from tqdm import tqdm
# from tokenizer import MsTokenizer, JiebaTokenizer
import jieba
from .embedding_utils import similarity_match
# Error-code -> message table used by the failure reports (sample set).
error_msg = {
    1: "Bad input file",
    2: "Wrong input file format",
    3: "Duplicate ids in the submit files",
    4: "Not find valid submit files.",
    5: "Unequal size between the submit files and question files.",
    6: "Unaligned id information.",
    7: "None or empty answer in the submit files."
}
def report_error_msg(detail, showMsg, out_p):
    """Write a zero-score failure report to `out_p`."""
    error_dict = {
        'errorDetail': detail,
        'errorMsg': showMsg,
        'score': 0,
        'scoreJson': {},
        'success': False,
    }
    dump_2_json(error_dict, out_p)
def report_score(score, out_p):
    """Write a success report with the overall and per-category scores to `out_p`."""
    # The 'score' key inside scoreJson must be kept; extra keys may be added,
    # e.g. result['scoreJson'] = {'score': score, 'aaaa': 0.1}
    result = {
        'success': True,
        'score': score["score"],
        'scoreJson': {
            'score': score["score"],
            "data_query": score.get("数据查询", -1),
            "text_comprehension": score.get("文本理解", -1),
        },
    }
    dump_2_json(result, out_p)
def read_jsonl(path):
    """Read a JSON-lines file into a list of dicts, skipping invalid rows."""
    with jsonlines.open(path, "r") as reader:
        return [record for record in reader.iter(type=dict, skip_invalid=True)]
def dump_2_json(info, path):
    """Serialize `info` to `path` as pretty-printed, non-ASCII-escaped JSON."""
    # Explicit UTF-8: ensure_ascii=False emits raw Chinese characters, which
    # fails on platforms whose default text encoding is not UTF-8.
    with open(path, 'w', encoding='utf-8') as output_json_file:
        json.dump(info, output_json_file, ensure_ascii=False, indent=4)
def tokenize(text, method="qwen"):
    """Segment `text` into a list of tokens with jieba.

    `method` is currently unused: the switch between the Qwen tokenizer and
    jieba (via MsTokenizer/JiebaTokenizer) is disabled.
    """
    return jieba.lcut(text)
def calculate_bleu_score(reference_sentence, hypothesis_sentence):
    """
    N-gram precision metric.
    """
    reference = [tokenize(reference_sentence)]
    hypothesis = tokenize(hypothesis_sentence)
    # method1 smoothing avoids zero scores when higher-order n-grams are absent.
    smoother = SmoothingFunction().method1
    return sentence_bleu(reference, hypothesis,
                         smoothing_function=smoother, auto_reweigh=True)
def calculate_t2v_score(reference_sentence, hypothesis_sentence):
    """Semantic similarity between the two sentences.

    Uses the local text2vec model when one was loaded, otherwise falls back
    to the remote embedding service.
    """
    # Bug fix: `sim_model` is only bound inside the __main__ block, so a
    # plain reference raises NameError when this module is imported; look it
    # up defensively instead.
    model = globals().get("sim_model")
    if model is not None:
        return model.get_score(reference_sentence, hypothesis_sentence)
    return similarity_match(reference_sentence, hypothesis_sentence)
def calculate_leven_score(reference_sentence, hypothesis_sentence):
    """Edit (Levenshtein) distance between the raw sentences; lower is closer."""
    return Levenshtein.distance(reference_sentence, hypothesis_sentence)
def calculate_f1(reference_sentence, hypothesis_sentence):
    """
    Set F1 score.
    """
    ref_tokens = set(tokenize(reference_sentence))
    hyp_tokens = set(tokenize(hypothesis_sentence))
    # An empty side or an empty overlap both yield F1 = 0.
    if not ref_tokens or not hyp_tokens:
        return 0
    common = ref_tokens & hyp_tokens
    if not common:
        return 0
    precision = len(common) / len(hyp_tokens)
    recall = len(common) / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
def calculate_scores(reference_sentence, hypothesis_sentence):
    """Combined metric: 0.6 * semantic similarity + 0.4 * token-set F1."""
    t2v = calculate_t2v_score(reference_sentence, hypothesis_sentence)
    f1 = calculate_f1(reference_sentence, hypothesis_sentence)
    return {
        "text2vec": t2v,
        "f1_score": f1,
        "score": 0.6 * t2v + 0.4 * f1,
    }
def evaluate_answer(reference_data, reference_answer, user_answer):
    """Score a user's answer against the reference (new scoring standard).

    Combines an exact-match rate over the `reference_data` key terms
    (weight 0.6) with a semantic similarity score (weight 0.4); when no
    reference data is given, the semantic score carries full weight.
    """
    score = 0.0
    if user_answer is None or user_answer == "":
        return score
    # Normalize date spellings so e.g. "2021年3月5日" and "2021-03-05" compare equal.
    user_answer = standardize_extended_date_formats(user_answer)
    reference_answer = standardize_extended_date_formats(reference_answer)
    # Bug fix: the original normalized `reference_data` BEFORE the None/empty
    # guard, so a None reference crashed the comprehension. Guard first.
    if reference_data is not None and len(reference_data) > 0:
        reference_data = [standardize_extended_date_formats(i) for i in reference_data]
        score_weight = (0.6, 0.4)
        matched_data_count = sum(1.0 for data in reference_data if data in user_answer)
        score += score_weight[0] * (matched_data_count / len(reference_data))
    else:
        # No reference data: evaluate by semantic matching only.
        score_weight = (0, 1.0)
    semantic_scores = calculate_scores(reference_answer, user_answer)
    if semantic_scores:
        score += score_weight[1] * semantic_scores["score"]
    return score
def standardize_extended_date_formats(text):
    """Rewrite the date spellings found in `text` to compact YYYYMMDD form."""
    # (regex, format string) pairs; each format receives the captured groups
    # converted to int, so months/days are zero-padded via {n:02}.
    patterns = [
        (r'(\d{4})年(\d{1,2})月(\d{1,2})[日号]', "{0}{1:02}{2:02}"),  # YYYY年MM月DD日
        (r'(\d{4})/(\d{1,2})/(\d{1,2})', "{0}{1:02}{2:02}"),  # YYYY/MM/DD
        (r'(\d{4})-(\d{1,2})-(\d{1,2})', "{0}{1:02}{2:02}"),  # YYYY-MM-DD
        (r'(\d{4})\.(\d{1,2})\.(\d{1,2})', "{0}{1:02}{2:02}"),  # YYYY.MM.DD
        (r'(\d{1,2})月(\d{1,2})[日号][,,](\d{4})年', "{2}{0:02}{1:02}"),  # MM月DD日,YYYY年
        (r'(\d{1,2})[日号](\d{1,2})月[,,](\d{4})年', "{2}{1:02}{0:02}"),  # DD日MM月,YYYY年
        # Quarter spellings map to the quarter-end date.
        (r'(\d{4})年?一季度', "{0}0331"),
        (r'(\d{4})年?第一季度', "{0}0331"),
        (r'(\d{4})年?Q1', "{0}0331"),
        (r'(\d{4})年?二季度', "{0}0630"),
        (r'(\d{4})年?第二季度', "{0}0630"),
        (r'(\d{4})年?Q2', "{0}0630"),
        (r'(\d{4})年?三季度', "{0}0930"),
        (r'(\d{4})年?第三季度', "{0}0930"),
        (r'(\d{4})年?Q3', "{0}0930"),
        (r'(\d{4})年?四季度', "{0}1231"),
        (r'(\d{4})年?第四季度', "{0}1231"),
        (r'(\d{4})年?Q4', "{0}1231"),
        (r'(\d{6})日期?', "{0}"),
    ]
    # Apply each pattern in order; later patterns see already-rewritten text.
    for pattern, fmt in patterns:
        text = re.sub(
            pattern,
            lambda m, f=fmt: f.format(*(int(g) for g in m.groups())),
            text,
        )
    return text
def evaluate(correct_data, user_data):
    """Score every (reference, submission) pair and aggregate the results.

    Writes a per-question detail file next to the working directory and
    returns a dict with the overall mean score plus the mean per question
    type, all as percentages rounded to 2 decimals.
    """
    data = []
    paired_data = list(zip(correct_data, user_data))
    # Bug fix: tqdm's first positional argument is an iterable; the intended
    # total must be passed as the `total=` keyword for manual update() use.
    pbar = tqdm(total=len(paired_data), desc="Processing")
    for i, (correct_answer, user_answer) in enumerate(paired_data):
        score = evaluate_answer(
            correct_answer["answer_term"],
            correct_answer["answer"],
            user_answer.get("answer", "")
        )
        data.append({
            'id': correct_answer['id'],
            'type': correct_answer['type'],
            'question': correct_answer['question'],
            'refer_answer': correct_answer['answer'],
            'refer_answer_term': correct_answer['answer_term'],
            'user_answer': user_answer.get('answer', ""),
            'score': score
        })
        # Refresh the bar every 50 questions, showing the latest item's score.
        if (i + 1) % 50 == 0:
            pbar.set_postfix({"score": round(score, 4)})
            pbar.update(50)
    pbar.close()
    dump_2_json(data, "./evaluate_result_detail.jsonl")
    data_df = pd.DataFrame(data)
    # Final scores are percentage means rounded to two decimals.
    total_score = round(data_df["score"].mean() * 100.0, 2)
    score_dict = np.round(data_df.groupby("type")["score"].mean() * 100.0, 2).to_dict()
    score_dict["score"] = total_score
    print(f"Scores: {score_dict}", flush=True)
    return score_dict
if __name__ == "__main__":
    '''
    online evaluation
    '''
    # argv[1]: path to the input-parameter JSON; argv[2]: output report path;
    # argv[3] (optional): explicit submit-file path overriding the config.
    in_param_path = sys.argv[1]
    out_path = sys.argv[2]
    try:
        submit_path = sys.argv[3]
    except IndexError:
        submit_path = None
    # read submit and answer file from first parameter
    with open(in_param_path, 'r') as load_f:
        input_params = json.load(load_f)
    # Path of the standard (reference) answer file.
    standard_path = input_params["fileData"]["standardFilePath"]
    print("Read standard from %s" % standard_path)
    # Path of the contestant's submitted result file.
    if submit_path is None:
        submit_path = input_params["fileData"]["userFilePath"]
    print("Read user submit file from %s" % submit_path)
    # Local text2vec similarity model assumed to be pre-downloaded
    # (available on Hugging Face); falls back to None when absent, in which
    # case the remote embedding service is used for semantic scoring.
    sim_model_path = "shibing624/text2vec-base-chinese"
    # sim_model_path = None
    if isinstance(sim_model_path, str) and os.path.exists(sim_model_path):
        from text2vec import Similarity
        sim_model = Similarity(max_seq_length=256)
    else:
        sim_model = None
    # Validate the submission before scoring; any failure below writes an
    # error report (score 0) and exits.
    if not os.path.exists(submit_path):
        report_error_msg(error_msg[4], f"Error message: {error_msg[4]}", out_path)
        sys.exit()
    try:
        standard_labels = read_jsonl(standard_path)
        submit_preds = read_jsonl(submit_path)
    except json.JSONDecodeError as e:
        report_error_msg(e.msg, f"Error message: {error_msg[2]}", out_path)
        sys.exit()
    if len(standard_labels) != len(submit_preds):
        report_error_msg(error_msg[5], f"Error message: {error_msg[5]}", out_path)
        sys.exit()
    # Align submissions with the reference ordering by id before comparison.
    submit_preds = sorted(submit_preds, key=lambda s: s['id'])
    label_ids = [s["id"] for s in standard_labels]
    pred_ids = [s["id"] for s in submit_preds]
    if label_ids != pred_ids:
        report_error_msg(error_msg[6], f"Error message: {error_msg[6]}", out_path)
        sys.exit()
    for s in submit_preds:
        ans = s.get("answer", "")
        if ans is None:
            report_error_msg(error_msg[7], f"Error message: {error_msg[7]}", out_path)
            sys.exit()
    # Top-level boundary: any scoring failure is reported, not raised.
    try:
        eval_score = evaluate(standard_labels, submit_preds)
        report_score(eval_score, out_path)
    except Exception as e:
        report_error_msg(f"{e}", f"Error message: {e}", out_path)
{
"fileData": {
"evaluatorDir": "",
"evaluatorPath": "",
"standardFileDir": "",
"standardFilePath": "standard_answer.jsonl",
"userFileDir": "",
"userFilePath": "submit_result.jsonl"
}
}
"""
Tokenizer to support Chinese
"""
import jieba
from abc import abstractmethod
from modelscope import AutoTokenizer
def singleton(cls):
    """Class decorator: every call returns the same cached instance.

    The first call constructs the instance with the given arguments; later
    calls ignore their arguments and return the cached object.
    """
    _instances = {}

    def get_instance(*args, **kwargs):
        if cls not in _instances:
            _instances[cls] = cls(*args, **kwargs)
        return _instances[cls]

    return get_instance
class Tokenizer(object):
    """Abstract tokenizer interface; subclasses implement tokenize()."""
    @abstractmethod
    def tokenize(self, text):
        # Subclasses return a list of tokens for `text`.
        pass
@singleton
class JiebaTokenizer(Tokenizer):
    """Tokenizer backed by jieba word segmentation (process-wide singleton)."""

    def __init__(self, cut_all=False):
        # Load jieba's dictionary eagerly rather than on the first cut.
        jieba.initialize()
        self.cut_all = cut_all

    def tokenize(self, text):
        """Segment `text`; `cut_all` toggles jieba's full-cut mode."""
        return list(jieba.cut(text, cut_all=self.cut_all))
@singleton
class MsTokenizer(Tokenizer):
    """Tokenizer backed by a ModelScope pretrained tokenizer (singleton)."""

    def __init__(self, name="TongyiFinance/Tongyi-Finance-14B"):
        self.tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)

    def tokenize(self, text):
        """Return the token-id sequence for `text` ([] when absent)."""
        encoded = self.tokenizer(text)
        return encoded.get("input_ids", [])
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import GenerationConfig
import os

# Expose all four local GPUs to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
model_dir = '/home/wanglch/projects/FinQwen/Tongyi-Finance-14B-Chat'
# Note: The default behavior now has injection attack prevention off.
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# use bf16
# model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="cuda:0", trust_remote_code=True, bf16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="cpu", trust_remote_code=True).eval()
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
# device_map='cuda:0' would pin the model to one card; device_map='auto'
# spreads it across all visible GPUs.
# Specify hyperparameters for generation
model.generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
response, history = model.chat(tokenizer, "请解释一下资产负债率", history=None)
print(response)
# Sample output: the debt-to-asset ratio is a financial ratio measuring a
# firm's leverage — total liabilities divided by total assets, expressed as a
# percentage — reflecting whether its assets can cover its debts.
\ No newline at end of file
# 模型唯一标识
modelCode = 734
# 模型名称
modelName=finqwen_pytorch
# 模型描述
modelDescription=金融对话大模型
# 应用场景
appScenario=推理,训练,金融,教育
# 框架类型
frameType=pytorch
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment