Commit 92c75df1 authored by sunzhq2

yidong infer init
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model, base version, CDial-GPT variant
# Project: https://github.com/thu-coai/CDial-GPT
# Reference project: https://github.com/bojone/CDial-GPT-tf
# The weights must be converted before loading; see the convert_script folder for the conversion script
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base/bert4torch_vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
speakers = [tokenizer.token_to_id('[speaker1]'), tokenizer.token_to_id('[speaker2]')]

# The config sets shared_segment_embeddings=True, so the segment embeddings reuse the word embedding weights
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt',
).to(device)  # build the model and load the weights
class ChatBot(AutoRegressiveDecoder):
    """Chit-chat responses via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        curr_segment_ids = torch.zeros_like(output_ids) + token_ids[0, -1]
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, curr_segment_ids], 1)
        logits = model.predict([token_ids, segment_ids])
        return logits[:, -1, :]

    def response(self, texts, n=1, topk=5):
        token_ids = [tokenizer._token_start_id, speakers[0]]
        segment_ids = [tokenizer._token_start_id, speakers[0]]
        for i, text in enumerate(texts):
            ids = tokenizer.encode(text)[0][1:-1] + [speakers[(i + 1) % 2]]
            token_ids.extend(ids)
            segment_ids.extend([speakers[i % 2]] * len(ids))
            segment_ids[-1] = speakers[(i + 1) % 2]
        results = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return tokenizer.decode(results[0].cpu().numpy())

chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32, device=device)
print(chatbot.response([u'别爱我没结果', u'你这样会失去我的', u'失去了又能怎样']))
"""
Replies are random, e.g.: 你还有我 | 那就不要爱我 | 你是不是傻 | etc.
"""
#! -*- coding: utf-8 -*-
# Basic test: MLM prediction with GAU_alpha; the results match the bert4keras version
# In testing, medium and long texts work noticeably better than short ones
# Blog post: https://kexue.fm/archives/9052
# Weight conversion script: ./convert_script/convert_GAU_alpha.py
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace the paths with your own
config_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/vocab.txt'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='gau_alpha', with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("近期正是上市公司财报密集披露的时间,但有多家龙头公司的业绩令投资者失望")
token_ids[5] = token_ids[6] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 5:7], dim=-1).numpy()
    print(tokenizer.decode(result))
# Test the prediction quality of the BART language model
# bert4torch needs converted weights; see the convert folder
from transformers import BertTokenizer, BartForConditionalGeneration

tokenizer = BertTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/")
model = BartForConditionalGeneration.from_pretrained("F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/")
input_ids = tokenizer.encode("北京是[MASK]的首都", return_tensors='pt')
pred_ids = model.generate(input_ids, num_beams=4, max_length=20)
print('transformers output: ', tokenizer.convert_ids_to_tokens(pred_ids[0]))
# Output: ['[SEP]', '[CLS]', '北', '京', '是', '中', '国', '的', '首', '都', '[SEP]']

from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch

# BART configuration
config_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bart/[FudanNLP_torch_base]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='bart', segment_vocab_size=0).to(device)

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last step

    def generate(self, text, topk=4):
        token_ids, _ = tokenizer.encode(text, maxlen=128)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=102, end_id=tokenizer._token_end_id, maxlen=32, device=device)
print('bert4torch output: ', autotitle.generate("北京是[MASK]的首都"))
#! -*- coding: utf-8 -*-
# Basic test: MLM prediction
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace the paths with your own
root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
#! -*- coding: utf-8 -*-
# Basic test: Tsinghua's open-source Chinese GPT2 model (2.6B parameters, CPM)
# Project: https://github.com/TsinghuaAI/CPM-Generate
# Blog introduction: https://kexue.fm/archives/7912
# The weights must be converted before loading; see the convert_script folder for the conversion script
import numpy as np
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import SpTokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch
import jieba
jieba.initialize()

# Model paths
config_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/bert4torch_pytorch_model.bin'
spm_path = 'F:/Projects/pretrain_ckpt/gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b/chinese_vocab.model'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def pre_tokenize(text):
    """Pre-tokenization: map spaces to '▂' (U+2582) and newlines to '▃' (U+2583)
    """
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]
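
# Illustrative example of the function above (the exact segmentation depends on
# jieba's dictionary and version, so this output is an assumption):
#   pre_tokenize('今天 天气') -> ['今天', '▂', '天气']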
tokenizer = SpTokenizer(
    spm_path,
    token_start=None,
    token_end=None,
    pre_tokenize=pre_tokenize,
    token_translate={u'\u2583': '<cls>'}  # map '▃' (i.e. '\n') to <cls>
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2', segment_vocab_size=0
).to(device)  # build the model and load the weights
class TextExpansion(AutoRegressiveDecoder):
    """Text continuation via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        logits = model.predict([token_ids])
        return logits[:, -1, :]

    def generate(self, text, n=1, topp=0.95, temperature=1):
        """The output has some randomness; if you only care about few-shot
        performance, consider switching the decoding to beam search.
        """
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp, temperature=temperature)  # random sampling
        results = [token_ids + [int(i) for i in ids.cpu().numpy()] for ids in results]
        texts = [tokenizer.decode(ids) for ids in results]
        return [self.post_replace(text) for text in texts]

    def post_replace(self, text):
        for s, t in [(' ', ''), (u'\u2582', ' '), (u'\u2583', '\n')]:
            text = text.replace(s, t)
        return text
text_expansion = TextExpansion(
    start_id=None,
    end_id=3,  # 3 is <cls>, which also serves as the newline token
    maxlen=16,
    device=device
)

# Commonsense reasoning
# Expected output here: 北京
query = u"""
美国的首都是华盛顿
法国的首都是巴黎
日本的首都是东京
中国的首都是
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Word translation
# Expected output here: bird
query = u"""
狗 dog
猫 cat
猪 pig
鸟
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Subject extraction
# Expected output here: 杨振宁
query = u"""
从1931年起,华罗庚在清华大学边学习边工作 华罗庚
在一间简陋的房间里,陈景润攻克了“哥德巴赫猜想” 陈景润
在这里,丘成桐得到IBM奖学金 丘成桐
杨振宁在粒子物理学、统计力学和凝聚态物理等领域作出里程碑性贡献
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Triple extraction
# Expected output here: 张红,体重,140斤
query = u"""
姚明的身高是211cm,是很多人心目中的偶像。 ->姚明,身高,211cm
毛泽东是绍兴人,早年在长沙读书。->毛泽东,出生地,绍兴
虽然周杰伦在欧洲办的婚礼,但是他是土生土长的中国人->周杰伦,国籍,中国
小明出生于武汉,但是却不喜欢在武汉生成,长大后去了北京。->小明,出生地,武汉
吴亦凡是很多人的偶像,但是他却是加拿大人,另很多人失望->吴亦凡,国籍,加拿大
武耀的生日在5月8号,这一天,大家都为他庆祝了生日->武耀,生日,5月8号
《青花瓷》是周杰伦最得意的一首歌。->周杰伦,作品,《青花瓷》
北京是中国的首都。->中国,首都,北京
蒋碧的家乡在盘龙城,毕业后去了深圳工作。->蒋碧,籍贯,盘龙城
上周我们和王立一起去了他的家乡云南玩昨天才回到了武汉。->王立,籍贯,云南
昨天11月17号,我和朋友一起去了海底捞,期间服务员为我的朋友刘章庆祝了生日。->刘章,生日,11月17号
张红的体重达到了140斤,她很苦恼。->
"""
print(text_expansion.generate(query[1:-1], 1)[0])
#! -*- coding: utf-8 -*-
# Basic test: ERNIE model
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

# Load the model; replace the paths with your own
root_model_path = "F:/Projects/pretrain_ckpt/ernie/[baidu_torch_base]--ernie-1-base-zh"
# root_model_path = "F:/Projects/pretrain_ckpt/ernie/[baidu_torch_base]--ernie-3-base-zh"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model='ERNIE', with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("科学技术是第一生产力")
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, probas = model([tokens_ids_tensor, segment_ids_tensor])
    result = torch.argmax(probas[0, 3:5], dim=-1).numpy()
    print(tokenizer.decode(result))
#! -*- coding: utf-8 -*-
# Basic test: gpt2-ml
# Project (TF version): https://github.com/imcaspar/gpt2-ml
# The weights must be converted before loading; see the convert_script folder for the conversion script
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/bert4torch_pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/gpt2/[gpt2-ml_torch_15g]/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, token_start=None, token_end=None, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml', segment_vocab_size=0).to(device)  # build the model and load the weights

class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        logits = model.predict([token_ids])
        return logits[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]

article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    device=device
)

for text in [u'今天天气不错', u'双十一', u'科学空间']:
    print(article_completion.generate(text))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model, base version, open-sourced by Huawei
# Weights: https://pan.baidu.com/s/1-FB0yl1uxYDCGIRvU1XNzQ extraction code: xynn; the PyTorch-converted model files are used here
# Reference project: https://github.com/bojone/chinese-gen
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder

config_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/bert/[huawei_noah_tf_base]--chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    segment_vocab_size=0,  # drop the segment_ids input
    application='lm',
).to(device)  # build the model and load the weights
class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = torch.cat([inputs[0], output_ids], 1)
        _, mlm_scores = model.predict([token_ids])
        return mlm_scores[:, -1, :]

    def generate(self, text, n=1, topp=0.95):
        token_ids = tokenizer.encode(text)[0][:-1]
        results = self.random_sample([token_ids], n, topp=topp)  # random sampling
        return [text + tokenizer.decode(ids.cpu().numpy()) for ids in results]

article_completion = ArticleCompletion(
    start_id=None,
    end_id=511,  # 511 is the Chinese full stop
    maxlen=256,
    minlen=128,
    device=device
)

print(article_completion.generate(u'今天天气不错'))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
#! -*- coding: utf-8 -*-
# NEZHA model for chit-chat; only the inference script is provided here
# Source project: https://github.com/bojone/nezha_gpt_dialog
# Weight conversion script: https://github.com/Tongjilibo/bert4torch/blob/master/examples/convert_script/convert_nezha_gpt_dialog.py
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
from bert4torch.snippets import AutoRegressiveDecoder
import torch

# NEZHA configuration
config_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/nezha/[sushen_tf_base]--nezha_gpt_dialog/vocab.txt'

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Build the model and load the weights
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='nezha',
    application='lm',
)

class ChatBot(AutoRegressiveDecoder):
    """Dialogue bot via random sampling
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        curr_segment_ids = torch.ones_like(output_ids) - segment_ids[0, -1]
        segment_ids = torch.cat([segment_ids, curr_segment_ids], 1)
        return model.predict([token_ids, segment_ids])[-1][:, -1]

    def response(self, texts, topk=5):
        token_ids, segment_ids = [tokenizer._token_start_id], [0]
        for i, text in enumerate(texts):
            ids = tokenizer.encode(text)[0][1:]
            token_ids.extend(ids)
            segment_ids.extend([i % 2] * len(ids))
        results = self.random_sample([token_ids, segment_ids], 1, topk)
        return tokenizer.decode(results[0].cpu().numpy())

chatbot = ChatBot(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
print(chatbot.response([u'别爱我没结果', u'你这样会失去我的', u'失去了又能怎样']))
"""
Replies are random, e.g.: 那你还爱我吗 | 不知道 | 爱情是不是不能因为一点小事就否定了 | 我会一直爱你,你一个人会很辛苦 | etc.
"""
#! -*- coding: utf-8 -*-
# Basic test: MLM with the roformer and roformer_v2 models
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer
import torch

choice = 'roformer_v2'  # roformer | roformer_v2
if choice == 'roformer':
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v1_base/"
    args_model = 'roformer'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/roformer/[sushen_torch_base]--roformer_v2_char_base/"
    args_model = 'roformer_v2'

# Load the model; replace the paths with your own
root_model_path = args_model_path
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path, model=args_model, with_mlm='softmax')  # build the model and load the weights

token_ids, segments_ids = tokenizer.encode("今天M很好,我M去公园玩。")
token_ids[3] = token_ids[8] = tokenizer._token_mask_id
print(''.join(tokenizer.ids_to_tokens(token_ids)))

tokens_ids_tensor = torch.tensor([token_ids])
segment_ids_tensor = torch.tensor([segments_ids])

# The with_mlm argument must be passed for the MLM head
model.eval()
with torch.no_grad():
    _, logits = model([tokens_ids_tensor, segment_ids_tensor])

pred_str = 'Predict: '
for i, logit in enumerate(logits[0]):
    if token_ids[i] == tokenizer._token_mask_id:
        pred_str += tokenizer.id_to_token(torch.argmax(logit, dim=-1).item())
    else:
        pred_str += tokenizer.id_to_token(token_ids[i])
print(pred_str)
#! -*- coding: utf-8 -*-
# SimBERT/RoFormer-Sim: test similar-question generation and sentence similarity
# Official project: https://github.com/ZhuiyiTechnology/simbert
# Official project: https://github.com/ZhuiyiTechnology/roformer-sim
import torch
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, AutoRegressiveDecoder, get_pool_emb
from bert4torch.tokenizers import Tokenizer, load_vocab

# Basic settings
maxlen = 32
choice = 'simbert_v2'  # simbert | simbert_v2
if choice == 'simbert':
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--simbert_chinese_base"
    args_model = 'bert'
else:
    args_model_path = "F:/Projects/pretrain_ckpt/simbert/[sushen_torch_base]--roformer_chinese_sim_char_base"
    args_model = 'roformer'

# Load the simbert (or roformer-sim) weights
root_model_path = args_model_path
dict_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# Build and load the model
class Model(BaseModel):
    def __init__(self, pool_method='cls'):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, with_pool='linear', model=args_model,
                                            application='unilm', keep_tokens=keep_tokens)
        self.pool_method = pool_method

    def forward(self, token_ids, segment_ids):
        hidden_state, pooler, seq_logit = self.bert([token_ids, segment_ids])
        sen_emb = get_pool_emb(hidden_state, pooler, token_ids.gt(0).long(), self.pool_method)
        return seq_logit, sen_emb

model = Model(pool_method='cls').to(device)
class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps('logits')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = torch.cat([token_ids, output_ids], 1)
        segment_ids = torch.cat([segment_ids, torch.ones_like(output_ids, device=device)], 1)
        seq_logit, _ = model.predict([token_ids, segment_ids])
        return seq_logit[:, -1, :]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)  # random sampling
        return [tokenizer.decode(ids.cpu().numpy()) for ids in output_ids]

synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen, device=device)

def cal_sen_emb(text_list):
    '''Compute sentence embeddings for a list of texts
    '''
    X, S = [], []
    for t in text_list:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = torch.tensor(sequence_padding(X), dtype=torch.long, device=device)
    S = torch.tensor(sequence_padding(S), dtype=torch.long, device=device)
    _, Z = model.predict([X, S])
    return Z
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
效果:
>>> gen_synonyms(u'微信和支付宝哪个好?')
[
u'微信和支付宝,哪个好?',
u'微信和支付宝哪个好',
u'支付宝和微信哪个好',
u'支付宝和微信哪个好啊',
u'微信和支付宝那个好用?',
u'微信和支付宝哪个好用',
u'支付宝和微信那个更好',
u'支付宝和微信哪个好用',
u'微信和支付宝用起来哪个好?',
u'微信和支付宝选哪个好',
]
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text] # 不和原文相同
r = [text] + r
Z = cal_sen_emb(r)
Z /= (Z**2).sum(dim=1, keepdims=True)**0.5
argsort = torch.matmul(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
if __name__ == '__main__':
    choice = 'generate'  # generate | similarity
    if choice == 'generate':
        print(gen_synonyms('我想去北京玩玩可以吗', 10, 10))
    elif choice == 'similarity':
        target_text = '我想去首都北京玩玩'
        text_list = ['我想去北京玩', '北京有啥好玩的吗?我想去看看', '好渴望去北京游玩啊']
        Z = cal_sen_emb([target_text] + text_list)
        Z /= (Z**2).sum(dim=1, keepdim=True)**0.5
        similarity = torch.matmul(Z[1:], Z[0])
        for i, line in enumerate(text_list):
            print(f'cos_sim: {similarity[i].item():.4f}, tgt_text: "{target_text}", cal_text: "{line}"')
    else:
        model.load_weights('./best_model.pt')
#! -*- coding: utf-8 -*-
# Run T5 PEGASUS, which uses a BertTokenizer-style vocabulary
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import AutoRegressiveDecoder
import jieba
jieba.initialize()

# Model configuration
# pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small/'
pretrain_model = 'F:/Projects/pretrain_ckpt/t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base/'
config_path = pretrain_model + 'config.json'
checkpoint_path = pretrain_model + 'pytorch_model.bin'
dict_path = pretrain_model + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the tokenizer, using jieba word segmentation as the pre-tokenizer
tokenizer = Tokenizer(
    dict_path,
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False)
)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='mt5.1.1',
    segment_vocab_size=0
).to(device)
class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        # inputs contains [decoder_ids, encoder_hidden_state, encoder_attention_mask]
        return model.decoder.predict([output_ids] + inputs)[-1][:, -1, :]  # keep only the last step

    def generate(self, text, topk=1):
        token_ids, _ = tokenizer.encode(text, maxlen=256)
        token_ids = torch.tensor([token_ids], device=device)
        encoder_output = model.encoder.predict([token_ids])
        output_ids = self.beam_search(encoder_output, topk=topk)  # beam search
        return tokenizer.decode([int(i) for i in output_ids.cpu().numpy()])

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=tokenizer._token_end_id, maxlen=32, device=device)

if __name__ == '__main__':
    print(autotitle.generate('今天天气不错啊'))
    # small version output: 我是个女的,我想知道我是怎么想的
    # base version output: 请问明天的天气怎么样啊?
#! -*- coding: utf-8 -*-
# Run the transformer_xl model; it is not widely used and no Chinese pretrained weights were found,
# so the English pretrained model from transformers is used to verify correctness.
# last_hidden_state was inspected by debugging into the transformers package; it matches this framework.
# Conversion script: convert_script/convert_transformer_xl.py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

pretrained_model = "F:/Projects/pretrain_ckpt/transformer_xl/[english_hugging_face_torch]--transfo-xl-wt103"

# ----------------------transformers package----------------------
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModelForCausalLM.from_pretrained(pretrained_model)
model.eval()

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    # the intermediate states can only be inspected with a breakpoint here
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.losses
print('transformers loss: ', loss)

# ----------------------bert4torch----------------------
from bert4torch.models import build_transformer_model
config_path = f'{pretrained_model}/bert4torch_config.json'
checkpoint_path = f'{pretrained_model}/bert4torch_pytorch_model.bin'

model = build_transformer_model(
    config_path,
    checkpoint_path=checkpoint_path,
    model='transformer_xl',
)
print('bert4torch last_hidden_state: ', model.predict([inputs['input_ids']]))
# tensor([[[ 0.1027,  0.0604, -0.2585,  ...,  0.3137, -0.2679,  0.1036],
#          [ 0.3482, -0.0458, -0.4582,  ...,  0.0242, -0.0721,  0.2311],
#          [ 0.3426, -0.1353, -0.4145,  ...,  0.1123,  0.1374,  0.1313],
#          [ 0.0038, -0.0978, -0.5570,  ...,  0.0487, -0.1891, -0.0608],
#          [-0.2155, -0.1388, -0.5549,  ..., -0.1458,  0.0774,  0.0419],
#          [ 0.0967, -0.1781, -0.4328,  ..., -0.1831, -0.0808,  0.0890]]])
#! -*- coding: utf-8 -*-
# Run the pretrained t5-chinese model for direct prediction; uses a BertTokenizer-style vocabulary
# This T5 uses the t5.1.0 architecture
import torch
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
from bert4torch.snippets import AutoRegressiveDecoder

# Model configuration
config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/bert4torch_config.json'
checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/pytorch_model.bin'
dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall/vocab.txt'
# config_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/bert4torch_config.json'
# checkpoint_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/pytorch_model.bin'
# dict_path = 'F:/Projects/pretrain_ckpt/t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the vocabulary and build the tokenizer
token_dict = load_vocab(
    dict_path=dict_path,
    simplified=False,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='t5.1.0',
    segment_vocab_size=0
).to(device)

class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder
    """
    @AutoRegressiveDecoder.wraps(default_rtype='logits')
    def predict(self, inputs, output_ids, states):
        token_ids = inputs[0]
        return model.predict([[token_ids], [output_ids]])[-1][:, -1, :]  # keep only the last step

    def generate(self, text, topk=1, topp=0.95):
        token_ids, _ = tokenizer.encode(text, maxlen=256)
        output_ids = self.beam_search([token_ids], topk=topk)  # beam search
        return tokenizer.decode(output_ids.cpu().numpy())

autotitle = AutoTitle(start_id=tokenizer._token_start_id, end_id=1, maxlen=32, device=device)  # end_id could be set to tokenizer._token_end_id for shorter outputs

if __name__ == '__main__':
    print(autotitle.generate('中国的首都是extra0京'))
# Compare XLNet last_hidden_state between transformers and bert4torch
from transformers import XLNetTokenizer, XLNetModel
import torch

pretrained_model = "F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base"
tokenizer = XLNetTokenizer.from_pretrained(pretrained_model)
model = XLNetModel.from_pretrained(pretrained_model)

inputs = tokenizer(["你好啊,我叫张三", "天气不错啊"], padding=True, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print('--------transformers last_hidden_state--------\n', last_hidden_states)

# ----------------------bert4torch----------------------
from bert4torch.models import build_transformer_model
config_path = f'{pretrained_model}/bert4torch_config.json'
checkpoint_path = f'{pretrained_model}/pytorch_model.bin'

model = build_transformer_model(
    config_path,
    checkpoint_path=checkpoint_path,
    model='xlnet',
    # with_lm=True
    token_pad_ids=tokenizer.pad_token_id,
)
print('--------bert4torch last_hidden_state--------\n', model.predict([inputs['input_ids'], inputs['token_type_ids']]))
#! -*- coding: utf-8 -*-
# Give a lowercase-only model the ability to distinguish case by simply modifying its vocabulary
# Basic idea: add uppercased variants of the English tokens to the vocabulary and extend the model's Embedding layer accordingly
from bert4torch.models import build_transformer_model
from bert4torch.tokenizers import Tokenizer, load_vocab
import torch

root_model_path = "F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12"
vocab_path = root_model_path + "/vocab.txt"
config_path = root_model_path + "/bert_config.json"
checkpoint_path = root_model_path + '/pytorch_model.bin'

token_dict = load_vocab(vocab_path)
new_token_dict = token_dict.copy()
compound_tokens = []

for t, i in sorted(token_dict.items(), key=lambda s: s[1]):
    # Two cases are handled here: 1. first letter capitalized; 2. entire word uppercased.
    # Under Python 2 this adds 5594 tokens; under Python 3 it adds 5596.
    tokens = []
    if t.isalpha():
        tokens.extend([t[:1].upper() + t[1:], t.upper()])
    elif t[:2] == '##' and t[2:].isalpha():
        tokens.append(t.upper())
    for token in tokens:
        if token not in new_token_dict:
            compound_tokens.append([i])
            new_token_dict[token] = len(new_token_dict)

tokenizer = Tokenizer(new_token_dict, do_lower_case=False)
model = build_transformer_model(
    config_path,
    checkpoint_path,
    compound_tokens=compound_tokens,  # new tokens, initialized from the average of the old tokens they map to
)

text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
输出:['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""
token_ids, segment_ids = tokenizer.encode(text)
token_ids, segment_ids = torch.tensor([token_ids]), torch.tensor([segment_ids])
model.eval()
with torch.no_grad():
    print(model([token_ids, segment_ids])[0])
"""
Output:
[[[-1.4999904e-01 1.9651388e-01 -1.7924258e-01 ... 7.8269649e-01
2.2241375e-01 1.1325148e-01]
[-4.5268752e-02 5.5090344e-01 7.4699545e-01 ... -4.7773960e-01
-1.7562288e-01 4.1265407e-01]
[ 7.0158571e-02 1.7816302e-01 3.6949167e-01 ... 9.6258509e-01
-8.4678203e-01 6.3776302e-01]
...
[ 9.3637377e-01 3.0232478e-02 8.1411439e-01 ... 7.9186147e-01
7.5704646e-01 -8.3475001e-04]
[ 2.3699696e-01 2.9953337e-01 8.1962071e-02 ... -1.3776925e-01
3.8681498e-01 3.2553676e-01]
[ 1.9728680e-01 7.7782705e-02 5.2951699e-01 ... 8.9622810e-02
-2.3932748e-02 6.9600858e-02]]]
"""
# Multiprocess/multithread parallel_apply test
from tqdm import tqdm
from bert4torch.tokenizers import Tokenizer
import torch
import numpy as np
from bert4torch.snippets import parallel_apply
import time

dict_path = 'F:/Projects/pretrain_ckpt/bert/[huggingface_torch_base]--bert-base-chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
categories = {'LOC': 2, 'PER': 3, 'ORG': 4}

# Relative-distance buckets: distance d maps to roughly 1 + floor(log2(d)),
# i.e. dis2idx[1] = 1, dis2idx[2:4] = 2, dis2idx[4:8] = 3, ..., dis2idx[256:] = 9
dis2idx = np.zeros((1000), dtype='int64')
dis2idx[1] = 1
dis2idx[2:] = 2
dis2idx[4:] = 3
dis2idx[8:] = 4
dis2idx[16:] = 5
dis2idx[32:] = 6
dis2idx[64:] = 7
dis2idx[128:] = 8
dis2idx[256:] = 9
# Small helper functions
def convert_index_to_text(index, type):
    """Encode a list of positions plus an entity type as a single string key"""
    text = "-".join([str(i) for i in index])
    text = text + "-#-{}".format(type)
    return text

def convert_text_to_index(text):
    """Inverse of convert_index_to_text"""
    index, type = text.split("-#-")
    index = [int(x) for x in index.split("-")]
    return index, int(type)
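
# A quick round trip through the two helpers above (illustrative values):
#   convert_index_to_text([3, 4, 5], categories['PER'])  -> '3-4-5-#-3'
#   convert_text_to_index('3-4-5-#-3')                   -> ([3, 4, 5], 3)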
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
maxlen = 256

def load_data(filename):
    """Load BIO-tagged NER data as (sentence, [[start, end, label], ...]) pairs"""
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in tqdm(f.split('\n\n'), desc='Load data'):
            if not l:
                continue
            sentence, d = [], []
            for i, c in enumerate(l.split('\n')):
                char, flag = c.split(' ')
                sentence += char
                if flag[0] == 'B':
                    d.append([i, i, flag[2:]])
                elif flag[0] == 'I':
                    d[-1][1] = i
            if len(sentence) > maxlen - 2:
                continue
            D.append((sentence, d))
    return D
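
# Expected input format (one "char label" pair per line, sentences separated by
# a blank line), illustrated with hypothetical rows:
#   中 B-LOC
#   国 I-LOC
#   很 O
#   大 O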
def func(inputs):
    sentence, d = inputs
    tokens = [tokenizer.tokenize(word)[1:-1] for word in sentence[:maxlen-2]]
    pieces = [piece for pieces in tokens for piece in pieces]
    tokens_ids = [tokenizer._token_start_id] + tokenizer.tokens_to_ids(pieces) + [tokenizer._token_end_id]
    assert len(tokens_ids) <= maxlen
    length = len(tokens)

    # piece-to-word mapping; for Chinese the two coincide, apart from [CLS] and [SEP]
    _pieces2word = np.zeros((length, len(tokens_ids)), dtype=bool)
    e_start = 0
    for i, pieces in enumerate(tokens):
        if len(pieces) == 0:
            continue
        pieces = list(range(e_start, e_start + len(pieces)))
        _pieces2word[i, pieces[0] + 1:pieces[-1] + 2] = 1
        e_start += len(pieces)

    # relative distances
    _dist_inputs = np.zeros((length, length), dtype=int)
    for k in range(length):
        _dist_inputs[k, :] += k
        _dist_inputs[:, k] -= k
    for i in range(length):
        for j in range(length):
            if _dist_inputs[i, j] < 0:
                _dist_inputs[i, j] = dis2idx[-_dist_inputs[i, j]] + 9
            else:
                _dist_inputs[i, j] = dis2idx[_dist_inputs[i, j]]
    _dist_inputs[_dist_inputs == 0] = 19

    # gold labels
    _grid_labels = np.zeros((length, length), dtype=int)
    _grid_mask2d = np.ones((length, length), dtype=bool)
    for entity in d:
        e_start, e_end, e_type = entity[0], entity[1] + 1, entity[-1]
        if e_end >= maxlen - 2:
            continue
        index = list(range(e_start, e_end))
        for i in range(len(index)):
            if i + 1 >= len(index):
                break
            _grid_labels[index[i], index[i + 1]] = 1
        _grid_labels[index[-1], index[0]] = categories[e_type]
    _entity_text = set([convert_index_to_text(list(range(e[0], e[1]+1)), categories[e[-1]]) for e in d])
    return tokens_ids, _pieces2word, _dist_inputs, _grid_labels, _grid_mask2d, _entity_text
corpus = load_data('F:/Projects/data/corpus/ner/china-people-daily-ner-corpus/example.train')

start = time.time()
train_samples = parallel_apply(
    func=func,
    iterable=corpus,
    workers=8,
    max_queue_size=2000,
    dummy=False,  # on Windows set this to True (multiprocessing.dummy, i.e. threads)
    callback=None,
    unordered=False
)
print(time.time() - start)
# Test whether the bert4torch tokenizers match the transformers tokenizers; after testing, they match
from transformers import BertTokenizer, XLNetTokenizer, XLNetTokenizerFast
from bert4torch.tokenizers import Tokenizer, SpTokenizer
from tqdm import tqdm

choice = 1
if choice:
    print('Test BertTokenizer')
    tokenizer_transformers = BertTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12")
    tokenizer_bert4torch = Tokenizer('F:/Projects/pretrain_ckpt/bert/[google_tf_base]--chinese_L-12_H-768_A-12/vocab.txt', do_lower_case=True, do_tokenize_unk=True)
else:
    print('Test SpTokenizer')
    tokenizer_transformers = XLNetTokenizer.from_pretrained("F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base")
    # tokenizer_transformers = XLNetTokenizerFast.from_pretrained("F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base")  # the fast version differs slightly
    tokenizer_bert4torch = tokenizer = SpTokenizer('F:/Projects/pretrain_ckpt/xlnet/[hit_torch_base]--chinese-xlnet-base/spiece.model', token_start=None, token_end=None)

with open('F:/Projects/data/corpus/sentence_classification/sentiment/sentiment.train.data', 'r', encoding='utf-8') as f:
    for l in tqdm(f):
        l = l.split('\t')[0].strip()
        tokens1 = tokenizer_transformers.tokenize(l)
        tokens2 = tokenizer_bert4torch.tokenize(l)
        tokens2 = tokens2[1:-1] if choice == 1 else tokens2
        if tokens1 != tokens2:
            print(''.join(tokens1))
            print(''.join(tokens2))
            print('------------------------------')
# Pretrained weight configs
Configs that need to be supplied separately for bert4torch; some of the weights can also be inspected via the conversion scripts
----
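These configs are consumed as the `config_path` argument of `build_transformer_model`, as the example scripts above do. A minimal sketch (the paths are placeholders; it assumes one of the JSON blocks below has been saved next to the converted weights):
```python
from bert4torch.models import build_transformer_model

config_path = '/path/to/bert4torch_config.json'            # one of the JSON blocks below
checkpoint_path = '/path/to/bert4torch_pytorch_model.bin'  # converted weights
model = build_transformer_model(config_path, checkpoint_path, model='xlnet')  # model matches the config's architecture
```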
- xlnet/[hit_torch_base]--chinese-xlnet-base
```json
{
"architectures": [
"XLNetLMHeadModel"
],
"attn_type": "bi",
"bi_data": false,
"bos_token_id": 1,
"clamp_len": -1,
"intermediate_size": 3072,
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"end_n_top": 5,
"eos_token_id": 2,
"hidden_act": "relu",
"initializer_range": 0.02,
"layer_norm_eps": 1e-12,
"mem_len": null,
"model_type": "xlnet",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"output_past": true,
"pad_token_id": 5,
"reuse_len": null,
"same_length": false,
"start_n_top": 5,
"summary_activation": "tanh",
"summary_last_hidden_dropout_prob": 0.1,
"summary_type": "last",
"summary_use_proj": true,
"untie_r": true,
"vocab_size": 32000
}
```
- gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768
```json
{
"hidden_act": "swish",
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"num_attention_heads": 1,
"attention_key_size": 128,
"intermediate_size": 1536,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 12000
}
```
- gpt/[thu-coai_torch_base]--CDial-GPT-LCCC-base
```json
{
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 513,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"vocab_size": 13088,
"type_vocab_size": 3,
"shared_segment_embeddings": true
}
```
- gpt2/[cpm_gpt2_torch]--cpm_lm_2.6b
```json
{
"vocab_size": 30000,
"hidden_size": 2560,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 10240,
"max_position_embeddings": 1024,
"num_attention_heads": 32,
"num_hidden_layers": 32
}
```
- gpt2/[gpt2-ml_torch_15g]
```json
{
"vocab_size": 21130,
"hidden_size": 1536,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 6144,
"max_position_embeddings": 1024,
"num_attention_heads": 24,
"num_hidden_layers": 48
}
```
- t5/[google_mt5_torch_base]
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu_new",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 250112,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[uer_t5_torch_base]--t5-base-chinese-cluecorpussmall
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "relu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 21228,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[uer_t5_torch_small]--t5-small-chinese-cluecorpussmall
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "relu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_attention_heads": 8,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 21228,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[sushen_t5_pegasus_torch_small]--chinese_t5_pegasus_small
```json
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"num_attention_heads": 6,
"attention_head_size": 64,
"num_hidden_layers": 8,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- t5/[sushen_t5_pegasus_torch_base]--chinese_t5_pegasus_base
```json
{
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 2048,
"num_attention_heads": 12,
"attention_head_size": 64,
"num_hidden_layers": 12,
"vocab_size": 50000,
"relative_attention_num_buckets": 32,
"attention_scale": false,
"is_dropout": true
}
```
- bart/[FudanNLP_torch_base]
```json
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 21128
}
```
\ No newline at end of file
# TensorFlow weights: https://github.com/ZhuiyiTechnology/GAU-alpha
# The variables are mapped directly onto the GAU_alpha structure here, so no mapping table is needed
import torch
import tensorflow as tf

tf_path = 'F:/Projects/pretrain_ckpt/gau/[sushen-tf]--chinese_GAU-alpha-char_L-24_H-768/bert_model.ckpt'
torch_state_dict = {}

ts = tf.train.load_variable(tf_path, 'bert/embeddings/word_embeddings')
torch_state_dict['embeddings.word_embeddings.weight'] = torch.from_numpy(ts)
torch_state_dict['mlmDecoder.weight'] = torch.from_numpy(ts)
ts = tf.train.load_variable(tf_path, 'bert/embeddings/token_type_embeddings')
torch_state_dict['embeddings.segment_embeddings.weight'] = torch.from_numpy(ts)

for i in range(24):
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/i_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.i_dense.weight'] = torch.from_numpy(ts.T)
    ts = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/o_dense/kernel')
    torch_state_dict[f'encoderLayer.{i}.gau.o_dense.weight'] = torch.from_numpy(ts.T)
    ts1 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/q_scaleoffset/gamma')
    ts2 = tf.train.load_variable(tf_path, f'GAU_alpha/encoder/layer_{i}/gau/k_scaleoffset/gamma')
    ts = torch.stack([torch.from_numpy(ts1), torch.from_numpy(ts2)], dim=0)
    torch_state_dict[f'encoderLayer.{i}.gau.offsetscale.gamma'] = ts

torch.save(torch_state_dict, 'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin')
# config file
'''
{
"hidden_act": "swish",
"hidden_size": 768,
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"num_attention_heads": 1,
"attention_key_size": 128,
"intermediate_size": 1536,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 12000
}
'''
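
# Optional sanity check: a sketch, assuming the weights saved above and the config
# stored as bert_config.json (mirrors the GAU-alpha MLM example in this commit):
# from bert4torch.models import build_transformer_model
# model = build_transformer_model(
#     'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/bert_config.json',
#     'F:/Projects/pretrain_ckpt/gau/[sushen-torch]--chinese_GAU-alpha-char_L-24_H-768/pytorch_model.bin',
#     model='gau_alpha', with_mlm='softmax'
# )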