Commit 0fc002df authored by huchen

init the dlexamples new

parent 0e04b692
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# bert4keras
- Our lightweight reimplementation of BERT for Keras
- A cleaner, more lightweight Keras version of BERT
- Online documentation: http://bert4keras.spaces.ac.cn/ (still under construction)
## Features
Implemented so far:
- Loading the pre-trained weights of bert/roberta/albert for fine-tuning;
- The attention masks needed for language models and seq2seq;
- A rich set of examples;
- Pre-training code from scratch (supports TPU and multi-GPU; see pretraining);
- Compatibility with both keras and tf.keras
## Usage
Install the stable version:
```shell
pip install bert4keras
```
Install the latest version:
```shell
pip install git+https://www.github.com/bojone/bert4keras.git
```
For usage examples, see the examples directory.
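Below is a minimal sketch of loading a pre-trained model (the three paths are placeholders for a downloaded BERT checkpoint):
```python
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer

config_path = '/path/to/bert_config.json'
checkpoint_path = '/path/to/bert_model.ckpt'
dict_path = '/path/to/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build BERT and load weights

token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
```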
In principle it is compatible with Python 2 and Python 3, and with tensorflow 1.14+ as well as tensorflow 2.x; the test environment is Python 2.7, Tensorflow 1.14+ and Keras 2.3.1 (tested under 2.2.4, 2.3.0, 2.3.1 and tf.keras).
**For the best experience, we recommend the combination of Tensorflow 1.14 and Keras 2.3.1.**
<blockquote><strong>On environment combinations</strong>
- Both tf+keras and tf+tf.keras are supported; the latter requires setting the environment variable TF_KERAS=1 beforehand.
- With tf+keras, we recommend 2.2.4 <= keras <= 2.3.1 and 1.14 <= tf <= 2.2; tf 2.3+ cannot be used.
- keras 2.4+ works, but keras 2.4.x is in practice essentially equivalent to tf.keras, so if you want keras 2.4+ you might as well use tf.keras directly.
</blockquote>
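For example, to run on tf.keras (a sketch; TF_KERAS is read at import time, so it must be set before importing bert4keras):
```python
import os
os.environ['TF_KERAS'] = '1'  # must happen before the bert4keras import
import bert4keras
```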
## Weights
Weights that can currently be loaded:
- <strong>Google's original BERT</strong>: https://github.com/google-research/bert
- <strong>brightmart's RoBERTa</strong>: https://github.com/brightmart/roberta_zh
- <strong>HIT's RoBERTa</strong>: https://github.com/ymcui/Chinese-BERT-wwm
- <strong>Google's original ALBERT</strong><sup><a href="https://github.com/bojone/bert4keras/issues/29#issuecomment-552188981">[example]</a></sup>: https://github.com/google-research/ALBERT
- <strong>brightmart's ALBERT</strong>: https://github.com/brightmart/albert_zh
- <strong>Converted ALBERT</strong>: https://github.com/bojone/albert_zh
- <strong>Huawei's NEZHA</strong>: https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-TensorFlow
- <strong>Huawei's NEZHA-GEN</strong>: https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-Gen-TensorFlow
- <strong>Our in-house language models</strong>: https://github.com/ZhuiyiTechnology/pretrained-models
- <strong>T5</strong>: https://github.com/google-research/text-to-text-transfer-transformer
- <strong>GPT_OpenAI</strong>: https://github.com/bojone/CDial-GPT-tf
- <strong>GPT2_ML</strong>: https://github.com/imcaspar/gpt2-ml
- <strong>Google's original ELECTRA</strong>: https://github.com/google-research/electra
- <strong>HIT's ELECTRA</strong>: https://github.com/ymcui/Chinese-ELECTRA
- <strong>CLUE's ELECTRA</strong>: https://github.com/CLUEbenchmark/ELECTRA
- <strong>LaBSE (multilingual BERT)</strong>: https://github.com/bojone/labse
- <strong>Models from the Chinese-GEN project</strong>: https://github.com/bojone/chinese-gen
- <strong>T5.1.1</strong>: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/released_checkpoints.md#t511
- <strong>Multilingual T5</strong>: https://github.com/google-research/multilingual-t5/
<strong>Notes</strong>
- Note 1: brightmart's ALBERT was open-sourced earlier than Google's ALBERT, so the weights of early brightmart releases are not fully consistent with Google's; in other words, the two cannot be swapped for each other directly. To reduce code redundancy, bert4keras 0.2.4 and later only support loading the <u>Google version</u> and those brightmart weights <u>marked with "Google"</u>. To load early-version weights, use <a href="https://github.com/bojone/bert4keras/releases/tag/v0.2.3">version 0.2.3</a>, or consider the author's converted <a href="https://github.com/bojone/albert_zh">albert_zh</a>
- Note 2: if downloaded ELECTRA weights come without a json config file, follow <a href="https://github.com/ymcui/Chinese-ELECTRA/issues/3">this issue</a> and write one yourself (you need to add a `type_vocab_size` field).
#! -*- coding: utf-8 -*-
__version__ = '0.10.5'
# -*- coding: utf-8 -*-
# Backend utilities, separated out mainly to support both native keras and tf.keras
# Set the environment variable TF_KERAS=1 to switch to tf.keras
import os, sys
from distutils.util import strtobool
import numpy as np
import tensorflow as tf
from tensorflow.python.util import nest, tf_inspect
from tensorflow.python.eager import tape
from tensorflow.python.ops.custom_gradient import _graph_mode_decorator
# Flag indicating whether we are on tf.keras or plain keras
is_tf_keras = strtobool(os.environ.get('TF_KERAS', '0'))
if is_tf_keras:
import tensorflow.keras as keras
import tensorflow.keras.backend as K
sys.modules['keras'] = keras
else:
import keras
import keras.backend as K
# Whether to enable gradient recomputation (trading time for memory)
do_recompute = strtobool(os.environ.get('RECOMPUTE', '0'))
def gelu_erf(x):
"""基于Erf直接计算的gelu函数
"""
return 0.5 * x * (1.0 + tf.math.erf(x / np.sqrt(2.0)))
def gelu_tanh(x):
"""基于Tanh近似计算的gelu函数
"""
cdf = 0.5 * (
1.0 + K.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3))))
)
return x * cdf
def set_gelu(version):
"""设置gelu版本
"""
version = version.lower()
assert version in ['erf', 'tanh'], 'gelu version must be erf or tanh'
if version == 'erf':
keras.utils.get_custom_objects()['gelu'] = gelu_erf
else:
keras.utils.get_custom_objects()['gelu'] = gelu_tanh
def piecewise_linear(t, schedule):
"""分段线性函数
其中schedule是形如{1000: 1, 2000: 0.1}的字典,
表示 t ∈ [0, 1000]时,输出从0均匀增加至1,而
t ∈ [1000, 2000]时,输出从1均匀降低到0.1,最后
t > 2000时,保持0.1不变。
"""
schedule = sorted(schedule.items())
if schedule[0][0] != 0:
schedule = [(0, 0.0)] + schedule
x = K.constant(schedule[0][1], dtype=K.floatx())
t = K.cast(t, K.floatx())
for i in range(len(schedule)):
t_begin = schedule[i][0]
x_begin = x
if i != len(schedule) - 1:
dx = schedule[i + 1][1] - schedule[i][1]
dt = schedule[i + 1][0] - schedule[i][0]
slope = 1.0 * dx / dt
x = schedule[i][1] + slope * (t - t_begin)
else:
x = K.constant(schedule[i][1], dtype=K.floatx())
x = K.switch(t >= t_begin, x, x_begin)
return x
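# Usage sketch (illustrative, not part of the library): with
# schedule = {1000: 1, 2000: 0.1}, piecewise_linear(t, schedule) evaluates to
# 0.5 at t=500, 1.0 at t=1000, 0.55 at t=1500 and 0.1 for all t >= 2000, e.g.
#     K.eval(piecewise_linear(K.constant(1500), {1000: 1, 2000: 0.1}))  # ~0.55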
def search_layer(inputs, name, exclude_from=None):
"""根据inputs和name来搜索层
说明:inputs为某个层或某个层的输出;name为目标层的名字。
实现:根据inputs一直往上递归搜索,直到发现名字为name的层为止;
如果找不到,那就返回None。
"""
if exclude_from is None:
exclude_from = set()
if isinstance(inputs, keras.layers.Layer):
layer = inputs
else:
layer = inputs._keras_history[0]
if layer.name == name:
return layer
elif layer in exclude_from:
return None
else:
exclude_from.add(layer)
if isinstance(layer, keras.models.Model):
model = layer
for layer in model.layers:
if layer.name == name:
return layer
inbound_layers = layer._inbound_nodes[0].inbound_layers
if not isinstance(inbound_layers, list):
inbound_layers = [inbound_layers]
if len(inbound_layers) > 0:
for layer in inbound_layers:
layer = search_layer(layer, name, exclude_from)
if layer is not None:
return layer
def sequence_masking(x, mask, value=0.0, axis=None):
"""为序列条件mask的函数
mask: 形如(batch_size, seq_len)的0-1矩阵;
value: mask部分要被替换成的值,可以是'-inf'或'inf';
axis: 序列所在轴,默认为1;
"""
if mask is None:
return x
else:
if K.dtype(mask) != K.dtype(x):
mask = K.cast(mask, K.dtype(x))
if value == '-inf':
value = -1e12
elif value == 'inf':
value = 1e12
if axis is None:
axis = 1
elif axis < 0:
axis = K.ndim(x) + axis
assert axis > 0, 'axis must be greater than 0'
for _ in range(axis - 1):
mask = K.expand_dims(mask, 1)
for _ in range(K.ndim(x) - K.ndim(mask)):
mask = K.expand_dims(mask, K.ndim(mask))
return x * mask + value * (1 - mask)
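# Usage sketch (illustrative): for x of shape (batch_size, seq_len, dim) and
# mask of shape (batch_size, seq_len), sequence_masking(x, mask, '-inf', 1)
# broadcasts the mask over the feature axis and fills padded steps with a
# large negative value, which a subsequent softmax maps to ~0 weight.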
def batch_gather(params, indices):
"""同tf旧版本的batch_gather
"""
if K.dtype(indices)[:3] != 'int':
indices = K.cast(indices, 'int32')
try:
return tf.gather(params, indices, batch_dims=K.ndim(indices) - 1)
except Exception as e1:
try:
return tf.batch_gather(params, indices)
except Exception as e2:
            raise ValueError('%s\n%s\n' % (e1, e2))
def pool1d(
x,
pool_size,
strides=1,
padding='valid',
data_format=None,
pool_mode='max'
):
"""向量序列的pool函数
"""
x = K.expand_dims(x, 1)
x = K.pool2d(
x,
pool_size=(1, pool_size),
strides=(1, strides),
padding=padding,
data_format=data_format,
pool_mode=pool_mode
)
return x[:, 0]
def divisible_temporal_padding(x, n):
"""将一维向量序列右padding到长度能被n整除
"""
r_len = K.shape(x)[1] % n
p_len = K.switch(r_len > 0, n - r_len, 0)
return K.temporal_padding(x, (0, p_len))
def swish(x):
"""swish函数(这样封装过后才有 __name__ 属性)
"""
return tf.nn.swish(x)
def leaky_relu(x, alpha=0.2):
"""leaky relu函数(这样封装过后才有 __name__ 属性)
"""
return tf.nn.leaky_relu(x, alpha=alpha)
class Sinusoidal(keras.initializers.Initializer):
"""Sin-Cos位置向量初始化器
来自:https://arxiv.org/abs/1706.03762
"""
def __call__(self, shape, dtype=None):
"""Sin-Cos形式的位置向量
"""
vocab_size, depth = shape
embeddings = np.zeros(shape)
for pos in range(vocab_size):
for i in range(depth // 2):
theta = pos / np.power(10000, 2. * i / depth)
embeddings[pos, 2 * i] = np.sin(theta)
embeddings[pos, 2 * i + 1] = np.cos(theta)
return embeddings
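# The initializer above fills row pos with the standard Transformer encoding:
# PE[pos, 2i] = sin(pos / 10000^(2i/depth)), PE[pos, 2i+1] = cos(pos / 10000^(2i/depth)).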
def symbolic(f):
"""恒等装饰器(兼容旧版本keras用)
"""
return f
def graph_mode_decorator(f, *args, **kwargs):
"""tf 2.1与之前版本的传参方式不一样,这里做个同步
"""
if tf.__version__ < '2.1':
return _graph_mode_decorator(f, *args, **kwargs)
else:
return _graph_mode_decorator(f, args, kwargs)
def recompute_grad(call):
"""重计算装饰器(用来装饰Keras层的call函数)
关于重计算,请参考:https://arxiv.org/abs/1604.06174
"""
if not do_recompute:
return call
def inner(self, inputs, **kwargs):
"""定义需要求梯度的函数以及重新定义求梯度过程
(参考自官方自带的tf.recompute_grad函数)
"""
flat_inputs = nest.flatten(inputs)
call_args = tf_inspect.getfullargspec(call).args
for key in ['mask', 'training']:
if key not in call_args and key in kwargs:
del kwargs[key]
def kernel_call():
"""定义前向计算
"""
return call(self, inputs, **kwargs)
def call_and_grad(*inputs):
"""定义前向计算和反向计算
"""
if is_tf_keras:
with tape.stop_recording():
outputs = kernel_call()
outputs = tf.identity(outputs)
else:
outputs = kernel_call()
def grad_fn(doutputs, variables=None):
watches = list(inputs)
if variables is not None:
watches += list(variables)
with tf.GradientTape() as t:
t.watch(watches)
with tf.control_dependencies([doutputs]):
outputs = kernel_call()
grads = t.gradient(
outputs, watches, output_gradients=[doutputs]
)
del t
return grads[:len(inputs)], grads[len(inputs):]
return outputs, grad_fn
        if is_tf_keras:  # only available with tf >= 2.0
outputs, grad_fn = call_and_grad(*flat_inputs)
flat_outputs = nest.flatten(outputs)
def actual_grad_fn(*doutputs):
grads = grad_fn(*doutputs, variables=self.trainable_weights)
return grads[0] + grads[1]
watches = flat_inputs + self.trainable_weights
watches = [tf.convert_to_tensor(x) for x in watches]
tape.record_operation(
call.__name__, flat_outputs, watches, actual_grad_fn
)
return outputs
        else:  # works for keras + tf >= 1.14
return graph_mode_decorator(call_and_grad, *flat_inputs)
return inner
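# Usage sketch (illustrative): export RECOMPUTE=1 before importing bert4keras,
# then decorate a layer's call to trade compute for memory:
#
#     class MyLayer(keras.layers.Layer):
#         @recompute_grad
#         def call(self, inputs):
#             return inputs * 2
#
# Forward activations are then recomputed during backprop instead of stored.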
# Add the symbolic method (decorator) to older versions of keras,
# for compatibility with the code in optimizers.py
K.symbolic = getattr(K, 'symbolic', None) or symbolic
custom_objects = {
'gelu_erf': gelu_erf,
'gelu_tanh': gelu_tanh,
'gelu': gelu_erf,
'swish': swish,
'leaky_relu': leaky_relu,
'Sinusoidal': Sinusoidal,
}
keras.utils.get_custom_objects().update(custom_objects)
#! -*- coding: utf-8 -*-
# Custom layers
import numpy as np
import tensorflow as tf
from bert4keras.backend import keras, K, is_tf_keras
from bert4keras.backend import sequence_masking
from bert4keras.backend import recompute_grad
from keras import initializers, activations
from keras.layers import *
def integerize_shape(func):
"""装饰器,保证input_shape一定是int或None
"""
def convert(item):
if hasattr(item, '__iter__'):
return [convert(i) for i in item]
elif hasattr(item, 'value'):
return item.value
else:
return item
def new_func(self, input_shape):
input_shape = convert(input_shape)
return func(self, input_shape)
return new_func
if (not is_tf_keras) and keras.__version__ < '2.3':
class Layer(keras.layers.Layer):
"""重新定义Layer,赋予“层中层”功能
(仅keras 2.3以下版本需要)
"""
def __init__(self, **kwargs):
super(Layer, self).__init__(**kwargs)
            self.supports_masking = True  # all custom layers in this project support masking
def __setattr__(self, name, value):
if isinstance(value, keras.layers.Layer):
if not hasattr(self, '_layers'):
self._layers = []
if value not in self._layers:
self._layers.append(value)
super(Layer, self).__setattr__(name, value)
@property
def trainable_weights(self):
trainable = getattr(self, 'trainable', True)
if trainable:
trainable_weights = super(Layer, self).trainable_weights[:]
for l in getattr(self, '_layers', []):
trainable_weights += l.trainable_weights
return trainable_weights
else:
return []
@property
def non_trainable_weights(self):
trainable = getattr(self, 'trainable', True)
non_trainable_weights = super(Layer, self).non_trainable_weights[:]
for l in getattr(self, '_layers', []):
if trainable:
non_trainable_weights += l.non_trainable_weights
else:
non_trainable_weights += l.weights
return non_trainable_weights
if keras.__version__ < '2.2.5':
import inspect
class Model(keras.models.Model):
"""重新定义Model,整合fit和fit_generator
"""
def fit(self, x=None, *args, **kwargs):
if inspect.isgenerator(x):
return self.fit_generator(x, *args, **kwargs)
else:
return super(Model, self).fit(x, *args, **kwargs)
keras.models.Model = Model
else:
class Layer(keras.layers.Layer):
def __init__(self, **kwargs):
super(Layer, self).__init__(**kwargs)
            self.supports_masking = True  # all custom layers in this project support masking
if (not is_tf_keras) or tf.__version__ < '1.15':
if not is_tf_keras:
NodeBase = keras.engine.base_layer.Node
else:
from tensorflow.python.keras.engine import base_layer
NodeBase = base_layer.Node
class Node(NodeBase):
"""修改Node来修复keras下孪生网络的bug
注意:这是keras的bug,并不是bert4keras的bug,但keras已经不更新了,
所以只好在这里进行修改。tf 1.15+自带的keras已经修改了这个
bug。
"""
@property
def arguments(self):
return self._arguments.copy()
@arguments.setter
def arguments(self, value):
self._arguments = value or {}
if not is_tf_keras:
keras.engine.base_layer.Node = Node
else:
base_layer.Node = Node
class GlobalAveragePooling1D(keras.layers.GlobalAveragePooling1D):
"""重新定义GlobalAveragePooling1D,支持序列长度为None
"""
def call(self, inputs, mask=None):
axis = 1 if self.data_format == 'channels_last' else 2
if mask is not None:
mask = K.cast(mask, K.floatx())
mask = mask[..., None] if axis == 1 else mask[:, None]
return K.sum(inputs * mask, axis=axis) / K.sum(mask, axis=axis)
else:
return K.mean(inputs, axis=axis)
class GlobalMaxPooling1D(keras.layers.GlobalMaxPooling1D):
"""重新定义GlobalMaxPooling1D,支持mask
"""
def __init__(self, data_format='channels_last', **kwargs):
super(GlobalMaxPooling1D, self).__init__(data_format, **kwargs)
self.supports_masking = True
def call(self, inputs, mask=None):
axis = 1 if self.data_format == 'channels_last' else 2
inputs = sequence_masking(inputs, mask, '-inf', axis)
return K.max(inputs, axis=axis)
def compute_mask(self, inputs, mask=None):
return None
# Overwrite the original objects directly
keras.layers.GlobalAveragePooling1D = GlobalAveragePooling1D
keras.layers.GlobalMaxPooling1D = GlobalMaxPooling1D
class Embedding(keras.layers.Embedding):
"""拓展Embedding层
"""
def compute_mask(self, inputs, mask=None):
"""为了适配T5,保证第一个token不被mask
"""
if K.ndim(inputs) == 2:
mask = super(Embedding, self).compute_mask(inputs, mask)
if mask is not None:
mask1 = K.ones_like(mask[:, :1], dtype='bool')
mask2 = mask[:, 1:]
return K.concatenate([mask1, mask2], 1)
else:
return mask
def call(self, inputs, mode='embedding'):
"""新增mode参数,可以为embedding或dense。如果为embedding,
则等价于普通Embedding层;如果为dense,则等价于无bias的Dense层。
"""
if mode == 'embedding':
return super(Embedding, self).call(inputs)
else:
kernel = K.transpose(self.embeddings)
return K.dot(inputs, kernel)
def compute_output_shape(self, input_shape):
"""关于判据,本来是通过缓存call时的mode参数来判断的,但是后来发现
Keras在使用compute_output_shape的时候不一定配套调用了call函数,
所以缓存的mode可能是不准的,因此只能出此下策。
"""
if len(input_shape) == 2:
return super(Embedding, self).compute_output_shape(input_shape)
else:
return input_shape[:2] + (K.int_shape(self.embeddings)[0],)
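# Usage sketch (illustrative): the 'dense' mode enables tying the input
# embedding with the output projection, as in BERT's MLM head:
#
#     embedding = Embedding(vocab_size, hidden_size)
#     h = embedding(token_ids)             # (batch_size, seq_len, hidden_size)
#     logits = embedding(h, mode='dense')  # (batch_size, seq_len, vocab_size)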
class BiasAdd(Layer):
"""加上偏置项
"""
@integerize_shape
def build(self, input_shape):
super(BiasAdd, self).build(input_shape)
output_dim = input_shape[-1]
self.bias = self.add_weight(
name='bias',
shape=(output_dim,),
initializer='zeros',
trainable=True
)
def call(self, inputs):
return K.bias_add(inputs, self.bias)
class Concatenate1D(Layer):
"""1维序列拼接层
说明:本来该功能可以直接通过Concatenate层来实现,无奈Keras
自带的Concatenate层的compute_mask写得不合理,导致一个
带mask的序列与一个不带mask的序列拼接会报错,因此干脆
自己重写一个好了。
"""
def call(self, inputs):
return K.concatenate(inputs, axis=1)
def compute_mask(self, inputs, mask=None):
if mask is not None:
masks = []
for i, m in enumerate(mask):
if m is None:
m = K.ones_like(inputs[i][..., 0], dtype='bool')
masks.append(m)
return K.concatenate(masks, axis=1)
def compute_output_shape(self, input_shape):
if all([shape[1] for shape in input_shape]):
seq_len = sum([shape[1] for shape in input_shape])
return (input_shape[0][0], seq_len, input_shape[0][2])
else:
return (input_shape[0][0], None, input_shape[0][2])
class MultiHeadAttention(Layer):
"""多头注意力机制
"""
def __init__(
self,
heads,
head_size,
out_dim=None,
key_size=None,
use_bias=True,
attention_scale=True,
return_attention_scores=False,
kernel_initializer='glorot_uniform',
**kwargs
):
super(MultiHeadAttention, self).__init__(**kwargs)
self.heads = heads
self.head_size = head_size
self.out_dim = out_dim or heads * head_size
self.key_size = key_size or head_size
self.use_bias = use_bias
self.attention_scale = attention_scale
self.return_attention_scores = return_attention_scores
self.kernel_initializer = initializers.get(kernel_initializer)
def build(self, input_shape):
super(MultiHeadAttention, self).build(input_shape)
self.q_dense = Dense(
units=self.key_size * self.heads,
use_bias=self.use_bias,
kernel_initializer=self.kernel_initializer
)
self.k_dense = Dense(
units=self.key_size * self.heads,
use_bias=self.use_bias,
kernel_initializer=self.kernel_initializer
)
self.v_dense = Dense(
units=self.head_size * self.heads,
use_bias=self.use_bias,
kernel_initializer=self.kernel_initializer
)
self.o_dense = Dense(
units=self.out_dim,
use_bias=self.use_bias,
kernel_initializer=self.kernel_initializer
)
@recompute_grad
def call(self, inputs, mask=None, **kwargs):
"""实现多头注意力
q_mask: 对输入的query序列的mask。
主要是将输出结果的padding部分置0。
v_mask: 对输入的value序列的mask。
主要是防止attention读取到padding信息。
"""
q, k, v = inputs[:3]
q_mask, v_mask = None, None
if mask is not None:
q_mask, v_mask = mask[0], mask[2]
        # Linear projections
qw = self.q_dense(q)
kw = self.k_dense(k)
vw = self.v_dense(v)
        # Reshape
qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
# Attention
qkv_inputs = [qw, kw, vw] + inputs[3:]
qv_masks = [q_mask, v_mask]
o, a = self.pay_attention_to(qkv_inputs, qv_masks, **kwargs)
        # Produce the output
o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads))
o = self.o_dense(o)
        # Return results
if self.return_attention_scores:
return [o, a]
else:
return o
def pay_attention_to(self, inputs, mask=None, **kwargs):
"""实现标准的乘性多头注意力
a_bias: 对attention矩阵的bias。
不同的attention bias对应不同的应用。
p_bias: 在attention里的位置偏置。
一般用来指定相对位置编码的种类。
说明: 这里单独分离出pay_attention_to函数,是为了方便
继承此类来定义不同形式的atttention;此处要求
返回o.shape=(batch_size, seq_len, heads, head_size)。
"""
(qw, kw, vw), n = inputs[:3], 3
q_mask, v_mask = mask
a_bias, p_bias = kwargs.get('a_bias'), kwargs.get('p_bias')
if a_bias:
a_bias = inputs[n]
n += 1
if p_bias == 'rotary':
cos_pos = K.repeat_elements(inputs[n][..., None, 1::2], 2, -1)
sin_pos = K.repeat_elements(inputs[n][..., None, ::2], 2, -1)
qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 4)
qw2 = K.reshape(qw2, K.shape(qw))
qw = qw * cos_pos + qw2 * sin_pos
kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 4)
kw2 = K.reshape(kw2, K.shape(kw))
kw = kw * cos_pos + kw2 * sin_pos
# Attention
a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
        # Handle position encodings
if p_bias == 'typical_relative':
position_bias = inputs[n]
a = a + tf.einsum('bjhd,jkd->bhjk', qw, position_bias)
elif p_bias == 't5_relative':
position_bias = K.permute_dimensions(inputs[n], (2, 0, 1))
a = a + K.expand_dims(position_bias, 0)
        # Attention (continued)
if self.attention_scale:
a = a / self.key_size**0.5
if a_bias is not None:
a = a + a_bias
a = sequence_masking(a, v_mask, '-inf', -1)
A = K.softmax(a)
        # Produce the output
o = tf.einsum('bhjk,bkhd->bjhd', A, vw)
if p_bias == 'typical_relative':
o = o + tf.einsum('bhjk,jkd->bjhd', A, position_bias)
return o, a
def compute_output_shape(self, input_shape):
o_shape = (input_shape[0][0], input_shape[0][1], self.out_dim)
if self.return_attention_scores:
a_shape = (
input_shape[0][0], self.heads, input_shape[0][1],
input_shape[1][1]
)
return [o_shape, a_shape]
else:
return o_shape
def compute_mask(self, inputs, mask=None):
if mask is not None:
if self.return_attention_scores:
return [mask[0], None]
else:
return mask[0]
def get_config(self):
config = {
'heads': self.heads,
'head_size': self.head_size,
'out_dim': self.out_dim,
'key_size': self.key_size,
'use_bias': self.use_bias,
'attention_scale': self.attention_scale,
'return_attention_scores': self.return_attention_scores,
'kernel_initializer':
initializers.serialize(self.kernel_initializer),
}
base_config = super(MultiHeadAttention, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
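# Usage sketch (illustrative): self-attention, passing the same tensor as
# query, key and value; out_dim defaults to heads * head_size:
#
#     x = keras.layers.Input(shape=(None, 768))
#     y = MultiHeadAttention(heads=12, head_size=64)([x, x, x])  # (batch, len, 768)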
class LayerNormalization(Layer):
"""(Conditional) Layer Normalization
    The hidden_* arguments are only used with a conditional input (conditional=True)
    """
def __init__(
self,
center=True,
scale=True,
epsilon=None,
conditional=False,
hidden_units=None,
hidden_activation='linear',
hidden_initializer='glorot_uniform',
**kwargs
):
super(LayerNormalization, self).__init__(**kwargs)
self.center = center
self.scale = scale
self.conditional = conditional
self.hidden_units = hidden_units
self.hidden_activation = activations.get(hidden_activation)
self.hidden_initializer = initializers.get(hidden_initializer)
self.epsilon = epsilon or 1e-12
def compute_mask(self, inputs, mask=None):
if self.conditional:
masks = mask if mask is not None else []
masks = [m[None] for m in masks if m is not None]
if len(masks) == 0:
return None
else:
return K.all(K.concatenate(masks, axis=0), axis=0)
else:
return mask
def build(self, input_shape):
super(LayerNormalization, self).build(input_shape)
if self.conditional:
shape = (input_shape[0][-1],)
else:
shape = (input_shape[-1],)
if self.center:
self.beta = self.add_weight(
shape=shape, initializer='zeros', name='beta'
)
if self.scale:
self.gamma = self.add_weight(
shape=shape, initializer='ones', name='gamma'
)
if self.conditional:
if self.hidden_units is not None:
self.hidden_dense = Dense(
units=self.hidden_units,
activation=self.hidden_activation,
use_bias=False,
kernel_initializer=self.hidden_initializer
)
if self.center:
self.beta_dense = Dense(
units=shape[0], use_bias=False, kernel_initializer='zeros'
)
if self.scale:
self.gamma_dense = Dense(
units=shape[0], use_bias=False, kernel_initializer='zeros'
)
@recompute_grad
def call(self, inputs):
"""如果是条件Layer Norm,则默认以list为输入,第二个是condition
"""
if self.conditional:
inputs, cond = inputs
if self.hidden_units is not None:
cond = self.hidden_dense(cond)
for _ in range(K.ndim(inputs) - K.ndim(cond)):
cond = K.expand_dims(cond, 1)
if self.center:
beta = self.beta_dense(cond) + self.beta
if self.scale:
gamma = self.gamma_dense(cond) + self.gamma
else:
if self.center:
beta = self.beta
if self.scale:
gamma = self.gamma
outputs = inputs
if self.center:
mean = K.mean(outputs, axis=-1, keepdims=True)
outputs = outputs - mean
if self.scale:
variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
std = K.sqrt(variance + self.epsilon)
outputs = outputs / std * gamma
if self.center:
outputs = outputs + beta
return outputs
def compute_output_shape(self, input_shape):
if self.conditional:
return input_shape[0]
else:
return input_shape
def get_config(self):
config = {
'center': self.center,
'scale': self.scale,
'epsilon': self.epsilon,
'conditional': self.conditional,
'hidden_units': self.hidden_units,
'hidden_activation': activations.serialize(self.hidden_activation),
'hidden_initializer':
initializers.serialize(self.hidden_initializer),
}
base_config = super(LayerNormalization, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
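# Usage sketch (illustrative): conditional Layer Norm takes [x, condition];
# beta and gamma are shifted by zero-initialized projections of the condition:
#
#     x = keras.layers.Input(shape=(None, 768))
#     c = keras.layers.Input(shape=(128,))
#     y = LayerNormalization(conditional=True, hidden_units=64)([x, c])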
class PositionEmbedding(Layer):
"""定义可训练的位置Embedding
"""
def __init__(
self,
input_dim,
output_dim,
merge_mode='add',
hierarchical=None,
embeddings_initializer='zeros',
custom_position_ids=False,
**kwargs
):
super(PositionEmbedding, self).__init__(**kwargs)
self.input_dim = input_dim
self.output_dim = output_dim
self.merge_mode = merge_mode
self.hierarchical = hierarchical
self.embeddings_initializer = initializers.get(embeddings_initializer)
self.custom_position_ids = custom_position_ids
def build(self, input_shape):
super(PositionEmbedding, self).build(input_shape)
self.embeddings = self.add_weight(
name='embeddings',
shape=(self.input_dim, self.output_dim),
initializer=self.embeddings_initializer
)
def call(self, inputs):
"""如果custom_position_ids,那么第二个输入为自定义的位置id
"""
if self.custom_position_ids:
inputs, position_ids = inputs
if 'int' not in K.dtype(position_ids):
position_ids = K.cast(position_ids, 'int32')
else:
input_shape = K.shape(inputs)
batch_size, seq_len = input_shape[0], input_shape[1]
position_ids = K.arange(0, seq_len, dtype='int32')[None]
if self.hierarchical:
alpha = 0.4 if self.hierarchical is True else self.hierarchical
embeddings = self.embeddings - alpha * self.embeddings[:1]
embeddings = embeddings / (1 - alpha)
embeddings_x = K.gather(embeddings, position_ids // self.input_dim)
embeddings_y = K.gather(embeddings, position_ids % self.input_dim)
embeddings = alpha * embeddings_x + (1 - alpha) * embeddings_y
else:
if self.custom_position_ids:
embeddings = K.gather(self.embeddings, position_ids)
else:
embeddings = self.embeddings[None, :seq_len]
if self.merge_mode == 'add':
return inputs + embeddings
elif self.merge_mode == 'mul':
return inputs * (embeddings + 1.0)
elif self.merge_mode == 'zero':
return embeddings
else:
if not self.custom_position_ids:
embeddings = K.tile(embeddings, [batch_size, 1, 1])
return K.concatenate([inputs, embeddings])
def compute_output_shape(self, input_shape):
if self.custom_position_ids:
input_shape = input_shape[0]
if self.merge_mode in ['add', 'mul', 'zero']:
return input_shape[:2] + (self.output_dim,)
else:
return input_shape[:2] + (input_shape[2] + self.output_dim,)
def get_config(self):
config = {
'input_dim': self.input_dim,
'output_dim': self.output_dim,
'merge_mode': self.merge_mode,
'hierarchical': self.hierarchical,
'embeddings_initializer':
initializers.serialize(self.embeddings_initializer),
'custom_position_ids': self.custom_position_ids,
}
base_config = super(PositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class SinusoidalPositionEmbedding(Layer):
"""定义Sin-Cos位置Embedding
"""
def __init__(
self, output_dim, merge_mode='add', custom_position_ids=False, **kwargs
):
super(SinusoidalPositionEmbedding, self).__init__(**kwargs)
self.output_dim = output_dim
self.merge_mode = merge_mode
self.custom_position_ids = custom_position_ids
def call(self, inputs):
"""如果custom_position_ids,那么第二个输入为自定义的位置id
"""
if self.custom_position_ids:
seq_len = K.shape(inputs)[1]
inputs, position_ids = inputs
if 'float' not in K.dtype(position_ids):
position_ids = K.cast(position_ids, K.floatx())
else:
input_shape = K.shape(inputs)
batch_size, seq_len = input_shape[0], input_shape[1]
position_ids = K.arange(0, seq_len, dtype=K.floatx())[None]
indices = K.arange(0, self.output_dim // 2, dtype=K.floatx())
indices = K.pow(10000.0, -2 * indices / self.output_dim)
embeddings = tf.einsum('bn,d->bnd', position_ids, indices)
embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1)
embeddings = K.reshape(embeddings, (-1, seq_len, self.output_dim))
if self.merge_mode == 'add':
return inputs + embeddings
elif self.merge_mode == 'mul':
return inputs * (embeddings + 1.0)
elif self.merge_mode == 'zero':
return embeddings
else:
if not self.custom_position_ids:
embeddings = K.tile(embeddings, [batch_size, 1, 1])
return K.concatenate([inputs, embeddings])
def compute_output_shape(self, input_shape):
if self.custom_position_ids:
input_shape = input_shape[0]
if self.merge_mode in ['add', 'mul', 'zero']:
return input_shape[:2] + (self.output_dim,)
else:
return input_shape[:2] + (input_shape[2] + self.output_dim,)
def get_config(self):
config = {
'output_dim': self.output_dim,
'merge_mode': self.merge_mode,
'custom_position_ids': self.custom_position_ids,
}
base_config = super(SinusoidalPositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class RelativePositionEmbedding(Layer):
"""相对位置编码
来自论文:https://arxiv.org/abs/1803.02155
"""
def __init__(
self, input_dim, output_dim, embeddings_initializer='zeros', **kwargs
):
super(RelativePositionEmbedding, self).__init__(**kwargs)
self.input_dim = input_dim
self.output_dim = output_dim
self.embeddings_initializer = initializers.get(embeddings_initializer)
def build(self, input_shape):
super(RelativePositionEmbedding, self).build(input_shape)
self.embeddings = self.add_weight(
name='embeddings',
shape=(self.input_dim, self.output_dim),
initializer=self.embeddings_initializer,
)
def call(self, inputs):
pos_ids = self.compute_position_ids(inputs)
return K.gather(self.embeddings, pos_ids)
def compute_position_ids(self, inputs):
q, v = inputs
        # Compute position differences
q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
q_idxs = K.expand_dims(q_idxs, 1)
v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
v_idxs = K.expand_dims(v_idxs, 0)
pos_ids = v_idxs - q_idxs
        # Post-processing
max_position = (self.input_dim - 1) // 2
pos_ids = K.clip(pos_ids, -max_position, max_position)
pos_ids = pos_ids + max_position
return pos_ids
def compute_output_shape(self, input_shape):
return (None, None, self.output_dim)
def compute_mask(self, inputs, mask):
return mask[0]
def get_config(self):
config = {
'input_dim': self.input_dim,
'output_dim': self.output_dim,
'embeddings_initializer':
initializers.serialize(self.embeddings_initializer),
}
base_config = super(RelativePositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class RelativePositionEmbeddingT5(RelativePositionEmbedding):
"""Google T5的相对位置编码
来自论文:https://arxiv.org/abs/1910.10683
"""
def __init__(
self,
input_dim,
output_dim,
max_distance=128,
bidirectional=True,
embeddings_initializer='zeros',
**kwargs
):
super(RelativePositionEmbeddingT5,
self).__init__(input_dim, output_dim, **kwargs)
self.max_distance = max_distance
self.bidirectional = bidirectional
def compute_position_ids(self, inputs):
"""T5的相对位置分桶(直接翻译自官方T5源码)
"""
q, v = inputs
        # Compute position differences
q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
q_idxs = K.expand_dims(q_idxs, 1)
v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
v_idxs = K.expand_dims(v_idxs, 0)
pos_ids = v_idxs - q_idxs
        # Post-processing
num_buckets, max_distance = self.input_dim, self.max_distance
ret = 0
n = -pos_ids
if self.bidirectional:
num_buckets //= 2
ret += K.cast(K.less(n, 0), 'int32') * num_buckets
n = K.abs(n)
else:
n = K.maximum(n, 0)
# now n is in the range [0, inf)
max_exact = num_buckets // 2
is_small = K.less(n, max_exact)
val_if_large = max_exact + K.cast(
K.log(K.cast(n, K.floatx()) / max_exact) /
np.log(max_distance / max_exact) * (num_buckets - max_exact),
'int32',
)
val_if_large = K.minimum(val_if_large, num_buckets - 1)
ret += K.switch(is_small, n, val_if_large)
return ret
def get_config(self):
config = {
'max_distance': self.max_distance,
'bidirectional': self.bidirectional,
}
base_config = super(RelativePositionEmbeddingT5, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
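# Note (illustrative): called on [q, v], the layer above returns a
# (q_len, v_len, output_dim) tensor of bucketed relative position embeddings.
# With bidirectional=True, half of the input_dim buckets cover each direction;
# offsets up to max_exact get their own bucket and larger offsets share
# logarithmically spaced buckets up to max_distance.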
class FeedForward(Layer):
"""FeedForward层
如果activation不是一个list,那么它就是两个Dense层的叠加;如果activation是
一个list,那么第一个Dense层将会被替换成门控线性单元(Gated Linear Unit)。
参考论文: https://arxiv.org/abs/2002.05202
"""
def __init__(
self,
units,
activation='relu',
use_bias=True,
kernel_initializer='glorot_uniform',
**kwargs
):
super(FeedForward, self).__init__(**kwargs)
self.units = units
if not isinstance(activation, list):
activation = [activation]
self.activation = [activations.get(act) for act in activation]
self.use_bias = use_bias
self.kernel_initializer = initializers.get(kernel_initializer)
@integerize_shape
def build(self, input_shape):
super(FeedForward, self).build(input_shape)
output_dim = input_shape[-1]
for i, activation in enumerate(self.activation):
i_dense = Dense(
units=self.units,
activation=activation,
use_bias=self.use_bias,
kernel_initializer=self.kernel_initializer
)
setattr(self, 'i%s_dense' % i, i_dense)
self.o_dense = Dense(
units=output_dim,
use_bias=self.use_bias,
kernel_initializer=self.kernel_initializer
)
@recompute_grad
def call(self, inputs):
x = self.i0_dense(inputs)
for i in range(1, len(self.activation)):
x = x * getattr(self, 'i%s_dense' % i)(inputs)
x = self.o_dense(x)
return x
def get_config(self):
config = {
'units': self.units,
'activation': [
activations.serialize(act) for act in self.activation
],
'use_bias': self.use_bias,
'kernel_initializer':
initializers.serialize(self.kernel_initializer),
}
base_config = super(FeedForward, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
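# Usage sketch (illustrative): passing a list of activations turns the first
# Dense layer into a gated linear unit; ['gelu', 'linear'] corresponds to the
# GEGLU variant of the paper cited above:
#
#     ffn = FeedForward(units=3072, activation=['gelu', 'linear'])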
class ConditionalRandomField(Layer):
"""纯Keras实现CRF层
CRF层本质上是一个带训练参数的loss计算层。
"""
def __init__(self, lr_multiplier=1, **kwargs):
super(ConditionalRandomField, self).__init__(**kwargs)
        self.lr_multiplier = lr_multiplier  # learning-rate multiplier for this layer
@integerize_shape
def build(self, input_shape):
super(ConditionalRandomField, self).build(input_shape)
output_dim = input_shape[-1]
self._trans = self.add_weight(
name='trans',
shape=(output_dim, output_dim),
initializer='glorot_uniform',
trainable=True
)
if self.lr_multiplier != 1:
K.set_value(self._trans, K.eval(self._trans) / self.lr_multiplier)
@property
def trans(self):
if self.lr_multiplier != 1:
return self.lr_multiplier * self._trans
else:
return self._trans
def compute_mask(self, inputs, mask=None):
return None
def call(self, inputs, mask=None):
return sequence_masking(inputs, mask, '-inf', 1)
def target_score(self, y_true, y_pred):
"""计算目标路径的相对概率(还没有归一化)
要点:逐标签得分,加上转移概率得分。
"""
point_score = tf.einsum('bni,bni->b', y_true, y_pred) # 逐标签得分
trans_score = tf.einsum(
'bni,ij,bnj->b', y_true[:, :-1], self.trans, y_true[:, 1:]
        )  # label-transition scores
return point_score + trans_score
def log_norm_step(self, inputs, states):
"""递归计算归一化因子
要点:1、递归计算;2、用logsumexp避免溢出。
"""
inputs, mask = inputs[:, :-1], inputs[:, -1:]
states = K.expand_dims(states[0], 2) # (batch_size, output_dim, 1)
trans = K.expand_dims(self.trans, 0) # (1, output_dim, output_dim)
outputs = tf.reduce_logsumexp(
states + trans, 1
) # (batch_size, output_dim)
outputs = outputs + inputs
outputs = mask * outputs + (1 - mask) * states[:, :, 0]
return outputs, [outputs]
def dense_loss(self, y_true, y_pred):
"""y_true需要是one hot形式
"""
# 导出mask并转换数据类型
mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
mask = K.cast(mask, K.floatx())
        # Compute the target score
y_true, y_pred = y_true * mask, y_pred * mask
target_score = self.target_score(y_true, y_pred)
        # Recursively compute log Z
init_states = [y_pred[:, 0]]
y_pred = K.concatenate([y_pred, mask], axis=2)
input_length = K.int_shape(y_pred[:, 1:])[1]
log_norm, _, _ = K.rnn(
self.log_norm_step,
y_pred[:, 1:],
init_states,
input_length=input_length
        )  # log Z vector at the final step
        log_norm = tf.reduce_logsumexp(log_norm, 1)  # logsumexp down to a scalar
        # Compute the loss, -log p
return log_norm - target_score
def sparse_loss(self, y_true, y_pred):
"""y_true需要是整数形式(非one hot)
"""
# y_true需要重新明确一下shape和dtype
y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
y_true = K.cast(y_true, 'int32')
        # Convert to one-hot
y_true = K.one_hot(y_true, K.shape(self.trans)[0])
return self.dense_loss(y_true, y_pred)
def dense_accuracy(self, y_true, y_pred):
"""训练过程中显示逐帧准确率的函数,排除了mask的影响
此处y_true需要是one hot形式
"""
y_true = K.argmax(y_true, 2)
return self.sparse_accuracy(y_true, y_pred)
def sparse_accuracy(self, y_true, y_pred):
"""训练过程中显示逐帧准确率的函数,排除了mask的影响
此处y_true需要是整数形式(非one hot)
"""
# 导出mask并转换数据类型
mask = K.all(K.greater(y_pred, -1e6), axis=2)
mask = K.cast(mask, K.floatx())
        # Re-pin y_true's shape and dtype
y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
y_true = K.cast(y_true, 'int32')
        # Take the per-label argmax as a rough measure of training progress
y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
return K.sum(isequal * mask) / K.sum(mask)
def get_config(self):
config = {
'lr_multiplier': self.lr_multiplier,
}
base_config = super(ConditionalRandomField, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
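# Usage sketch (illustrative): append the CRF to a sequence-labeling model and
# let it supply both the loss and a per-frame accuracy metric:
#
#     crf = ConditionalRandomField(lr_multiplier=100)
#     output = crf(output)  # output: (batch_size, seq_len, num_labels) scores
#     model = keras.models.Model(inputs, output)
#     model.compile(loss=crf.sparse_loss, optimizer='adam',
#                   metrics=[crf.sparse_accuracy])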
class MaximumEntropyMarkovModel(Layer):
"""(双向)最大熵隐马尔可夫模型
作用和用法都类似CRF,但是比CRF更快更简单。
"""
def __init__(self, lr_multiplier=1, hidden_dim=None, **kwargs):
super(MaximumEntropyMarkovModel, self).__init__(**kwargs)
        self.lr_multiplier = lr_multiplier  # learning-rate multiplier for this layer
        self.hidden_dim = hidden_dim  # if not None, use a low-rank factorization of the transition matrix
@integerize_shape
def build(self, input_shape):
super(MaximumEntropyMarkovModel, self).build(input_shape)
output_dim = input_shape[-1]
if self.hidden_dim is None:
self._trans = self.add_weight(
name='trans',
shape=(output_dim, output_dim),
initializer='glorot_uniform',
trainable=True
)
if self.lr_multiplier != 1:
K.set_value(
self._trans,
K.eval(self._trans) / self.lr_multiplier
)
else:
self._l_trans = self.add_weight(
name='l_trans',
shape=(output_dim, self.hidden_dim),
initializer='glorot_uniform',
trainable=True
)
self._r_trans = self.add_weight(
name='r_trans',
shape=(output_dim, self.hidden_dim),
initializer='glorot_uniform',
trainable=True
)
if self.lr_multiplier != 1:
K.set_value(
self._l_trans,
K.eval(self._l_trans) / self.lr_multiplier
)
K.set_value(
self._r_trans,
K.eval(self._r_trans) / self.lr_multiplier
)
@property
def trans(self):
if self.lr_multiplier != 1:
return self.lr_multiplier * self._trans
else:
return self._trans
@property
def l_trans(self):
if self.lr_multiplier != 1:
return self.lr_multiplier * self._l_trans
else:
return self._l_trans
@property
def r_trans(self):
if self.lr_multiplier != 1:
return self.lr_multiplier * self._r_trans
else:
return self._r_trans
def compute_mask(self, inputs, mask=None):
return None
def call(self, inputs, mask=None):
return sequence_masking(inputs, mask, '-inf', 1)
def reverse_sequence(self, inputs, mask=None):
if mask is None:
return [x[:, ::-1] for x in inputs]
else:
length = K.cast(K.sum(mask, 1), 'int32')
return [tf.reverse_sequence(x, length, seq_axis=1) for x in inputs]
    def basic_loss(self, y_true, y_pred, go_backwards=False):
        """y_true must be integer labels (not one-hot)
        """
        # Derive the mask and convert dtypes
        mask = K.all(K.greater(y_pred, -1e6), axis=2)
        mask = K.cast(mask, K.floatx())
        # Re-pin y_true's shape and dtype
        y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
        y_true = K.cast(y_true, 'int32')
        # Reversal-related handling
        if self.hidden_dim is None:
            if go_backwards:  # whether to reverse the sequence
                y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
                trans = K.transpose(self.trans)
            else:
                trans = self.trans
            history = K.gather(trans, y_true)
        else:
            if go_backwards:  # whether to reverse the sequence
                y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
                r_trans, l_trans = self.l_trans, self.r_trans
            else:
                l_trans, r_trans = self.l_trans, self.r_trans
            history = K.gather(l_trans, y_true)
            history = tf.einsum('bnd,kd->bnk', history, r_trans)
        # Compute the loss
        history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1)
        y_pred = (y_pred + history) / 2
        loss = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        return K.sum(loss * mask) / K.sum(mask)
def sparse_loss(self, y_true, y_pred):
"""y_true需要是整数形式(非one hot)
"""
loss = self.basic_loss(y_true, y_pred, False)
loss = loss + self.basic_loss(y_true, y_pred, True)
return loss / 2
def dense_loss(self, y_true, y_pred):
"""y_true需要是one hot形式
"""
y_true = K.argmax(y_true, 2)
return self.sparse_loss(y_true, y_pred)
    def basic_accuracy(self, y_true, y_pred, go_backwards=False):
        """Per-frame accuracy shown during training, with the mask excluded
        Here y_true must be integer labels (not one-hot)
        """
        # Derive the mask and convert dtypes
        mask = K.all(K.greater(y_pred, -1e6), axis=2)
        mask = K.cast(mask, K.floatx())
        # Re-pin y_true's shape and dtype
        y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
        y_true = K.cast(y_true, 'int32')
        # Reversal-related handling
        if self.hidden_dim is None:
            if go_backwards:  # whether to reverse the sequence
                y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
                trans = K.transpose(self.trans)
            else:
                trans = self.trans
            history = K.gather(trans, y_true)
        else:
            if go_backwards:  # whether to reverse the sequence
                y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
                r_trans, l_trans = self.l_trans, self.r_trans
            else:
                l_trans, r_trans = self.l_trans, self.r_trans
            history = K.gather(l_trans, y_true)
            history = tf.einsum('bnd,kd->bnk', history, r_trans)
        # Per-label accuracy
        history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1)
        y_pred = (y_pred + history) / 2
        y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
        isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
        return K.sum(isequal * mask) / K.sum(mask)
def sparse_accuracy(self, y_true, y_pred):
"""训练过程中显示逐帧准确率的函数,排除了mask的影响
此处y_true需要是整数形式(非one hot)
"""
accuracy = self.basic_accuracy(y_true, y_pred, False)
accuracy = accuracy + self.basic_accuracy(y_true, y_pred, True)
return accuracy / 2
def dense_accuracy(self, y_true, y_pred):
"""训练过程中显示逐帧准确率的函数,排除了mask的影响
此处y_true需要是one hot形式
"""
y_true = K.argmax(y_true, 2)
return self.sparse_accuracy(y_true, y_pred)
def get_config(self):
config = {
'lr_multiplier': self.lr_multiplier,
'hidden_dim': self.hidden_dim,
}
base_config = super(MaximumEntropyMarkovModel, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class Loss(Layer):
"""特殊的层,用来定义复杂loss
"""
def __init__(self, output_axis=None, **kwargs):
super(Loss, self).__init__(**kwargs)
self.output_axis = output_axis
def call(self, inputs, mask=None):
loss = self.compute_loss(inputs, mask)
self.add_loss(loss, inputs=inputs)
if self.output_axis is None:
return inputs
elif isinstance(self.output_axis, list):
return [inputs[i] for i in self.output_axis]
else:
return inputs[self.output_axis]
def compute_loss(self, inputs, mask=None):
raise NotImplementedError
def compute_output_shape(self, input_shape):
if self.output_axis is None:
return input_shape
elif isinstance(self.output_axis, list):
return [input_shape[i] for i in self.output_axis]
else:
return input_shape[self.output_axis]
def compute_mask(self, inputs, mask):
if mask is not None:
if self.output_axis is None:
return mask
elif isinstance(self.output_axis, list):
return [mask[i] for i in self.output_axis]
else:
return mask[self.output_axis]
def get_config(self):
config = {
'output_axis': self.output_axis,
}
base_config = super(Loss, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
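# Usage sketch (illustrative): subclass Loss, implement compute_loss, and wire
# the layer into the graph; it registers the loss via add_loss and passes the
# selected input(s) through:
#
#     class CrossEntropy(Loss):
#         def compute_loss(self, inputs, mask=None):
#             y_true, y_pred = inputs
#             return K.mean(K.sparse_categorical_crossentropy(y_true, y_pred))
#
#     output = CrossEntropy(output_axis=1)([y_in, model.output])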
custom_objects = {
'Embedding': Embedding,
'BiasAdd': BiasAdd,
'Concatenate1D': Concatenate1D,
'MultiHeadAttention': MultiHeadAttention,
'LayerNormalization': LayerNormalization,
'PositionEmbedding': PositionEmbedding,
'SinusoidalPositionEmbedding': SinusoidalPositionEmbedding,
'RelativePositionEmbedding': RelativePositionEmbedding,
'RelativePositionEmbeddingT5': RelativePositionEmbeddingT5,
'FeedForward': FeedForward,
'ConditionalRandomField': ConditionalRandomField,
'MaximumEntropyMarkovModel': MaximumEntropyMarkovModel,
'Loss': Loss,
}
keras.utils.get_custom_objects().update(custom_objects)
#! -*- coding: utf-8 -*-
# Main models
import numpy as np
from bert4keras.layers import *
from bert4keras.snippets import insert_arguments
from bert4keras.snippets import delete_arguments
from bert4keras.snippets import is_string, is_one_of
from keras.models import Model
import json
class Transformer(object):
"""模型基类
"""
def __init__(
self,
        vocab_size,  # vocabulary size
        hidden_size,  # hidden (encoding) dimension
        num_hidden_layers,  # total number of Transformer layers
        num_attention_heads,  # number of attention heads
        intermediate_size,  # hidden dimension of the FeedForward layer
        hidden_act,  # activation of the FeedForward hidden layer
        dropout_rate=None,  # dropout rate
        embedding_size=None,  # optionally specify the embedding size
        attention_head_size=None,  # head_size of V in attention
        attention_key_size=None,  # head_size of Q and K in attention
        sequence_length=None,  # optionally fix the sequence length
        keep_tokens=None,  # list of token IDs to keep
        compound_tokens=None,  # extended embeddings for compound tokens
        residual_attention_scores=False,  # add residuals to the attention matrices
        ignore_invalid_weights=False,  # allow skipping weights that do not exist
        layers=None,  # externally supplied Keras layers
        prefix=None,  # layer-name prefix
        name=None,  # model name
**kwargs
):
if keep_tokens is not None:
vocab_size = len(keep_tokens)
if compound_tokens is not None:
vocab_size += len(compound_tokens)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.attention_head_size = attention_head_size or hidden_size // num_attention_heads
self.attention_key_size = attention_key_size or self.attention_head_size
self.intermediate_size = intermediate_size
self.dropout_rate = dropout_rate or 0
self.hidden_act = hidden_act
self.embedding_size = embedding_size or hidden_size
self.sequence_length = sequence_length
self.keep_tokens = keep_tokens
self.compound_tokens = compound_tokens
self.attention_bias = None
self.position_bias = None
self.attention_scores = None
self.residual_attention_scores = residual_attention_scores
self.ignore_invalid_weights = ignore_invalid_weights
self.layers = {} if layers is None else layers
self.prefix = prefix or ''
self.name = name
self.built = False
def build(
self,
attention_caches=None,
layer_norm_cond=None,
layer_norm_cond_hidden_size=None,
layer_norm_cond_hidden_act=None,
additional_input_layers=None,
**kwargs
):
"""模型构建函数
attention_caches:为Attention的K,V的缓存序列字典,格式为
{Attention层名: [K缓存, V缓存]};
layer_norm_*系列参数:实现Conditional Layer Normalization时使用,
用来实现以“固定长度向量”为条件的条件Bert。
"""
if self.built:
return None
# Input
inputs = self.get_inputs()
self.set_inputs(inputs, additional_input_layers)
# Other
self.attention_caches = attention_caches or {}
self.layer_norm_conds = [
layer_norm_cond,
layer_norm_cond_hidden_size,
layer_norm_cond_hidden_act or 'linear',
]
# Call
outputs = self.call(inputs)
self.set_outputs(outputs)
# Model
self.model = Model(self.inputs, self.outputs, name=self.name)
self.built = True
def call(self, inputs):
"""定义模型的执行流程
"""
# Embedding
outputs = self.apply_embeddings(inputs)
# Main
for i in range(self.num_hidden_layers):
outputs = self.apply_main_layers(outputs, i)
# Final
outputs = self.apply_final_layers(outputs)
return outputs
def prefixed(self, name):
"""给名字加前缀
"""
if name is not None:
return self.prefix + name
def apply(self, inputs=None, layer=None, arguments=None, **kwargs):
"""通过apply调用层会自动重用同名层
inputs: 上一层的输出;
layer: 要调用的层类名;
arguments: 传递给layer.call的参数;
kwargs: 传递给层初始化的参数。
"""
if layer is Dropout and self.dropout_rate == 0:
return inputs
if layer is MultiHeadAttention and self.residual_attention_scores:
kwargs['return_attention_scores'] = True
arguments = arguments or {}
name = self.prefixed(kwargs.get('name'))
kwargs['name'] = name
if name not in self.layers:
layer = layer(**kwargs)
name = layer.name
self.layers[name] = layer
if inputs is None:
return self.layers[name]
else:
if isinstance(self.layers[name], MultiHeadAttention):
if name in self.attention_caches:
                    # If a cache is detected, automatically concatenate it onto Key and Value
k_cache, v_cache = self.attention_caches[name]
k_name, v_name = name + '-Cached-Key', name + '-Cached-Value'
k = Concatenate1D(name=k_name)([k_cache, inputs[1]])
v = Concatenate1D(name=v_name)([v_cache, inputs[2]])
inputs = inputs[:1] + [k, v] + inputs[3:]
if self.residual_attention_scores:
                    # With residual attention matrices, add the previous layer's attention
                    # matrix to each attention matrix; this corresponds to the RealFormer
                    # design (https://arxiv.org/abs/2012.11747). The implementation here is
                    # still fairly rough and may lack generality.
if self.attention_scores is not None:
if arguments.get('a_bias'):
a_bias = Add(name=name + '-Attention-Bias'
)([inputs[3], self.attention_scores])
inputs = inputs[:3] + [a_bias] + inputs[4:]
else:
a_bias = self.attention_scores
inputs = inputs[:3] + [a_bias] + inputs[3:]
arguments['a_bias'] = True
o, a = self.layers[name](inputs, **arguments)
self.attention_scores = a
return o
return self.layers[name](inputs, **arguments)
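    # Note (illustrative): layers created through apply() are cached in
    # self.layers keyed by their (prefixed) name, so a second apply() with the
    # same name reuses the existing layer and its weights, e.g.
    #     x = self.apply(inputs=x, layer=Dense, units=64, name='FFN')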
def get_inputs(self):
raise NotImplementedError
def apply_embeddings(self, inputs):
raise NotImplementedError
def apply_main_layers(self, inputs, index):
raise NotImplementedError
def apply_final_layers(self, inputs):
raise NotImplementedError
def compute_attention_bias(self, inputs=None):
"""定义每一层的Attention Bias
"""
return self.attention_bias
def compute_position_bias(self, inputs=None):
"""定义每一层的Position Bias(一般相对位置编码用)
"""
return self.position_bias
def set_inputs(self, inputs, additional_input_layers=None):
"""设置input和inputs属性
"""
if inputs is None:
inputs = []
elif not isinstance(inputs, list):
inputs = [inputs]
inputs = inputs[:]
if additional_input_layers is not None:
if not isinstance(additional_input_layers, list):
additional_input_layers = [additional_input_layers]
inputs.extend(additional_input_layers)
self.inputs = inputs
if len(inputs) > 1:
self.input = inputs
else:
self.input = inputs[0]
def set_outputs(self, outputs):
"""设置output和oututs属性
"""
if not isinstance(outputs, list):
outputs = [outputs]
outputs = outputs[:]
self.outputs = outputs
if len(outputs) > 1:
self.output = outputs
else:
self.output = outputs[0]
@property
def initializer(self):
"""默认使用截断正态分布初始化
"""
return keras.initializers.TruncatedNormal(stddev=0.02)
def simplify(self, inputs):
"""将list中的None过滤掉
"""
inputs = [i for i in inputs if i is not None]
if len(inputs) == 1:
inputs = inputs[0]
return inputs
def load_embeddings(self, embeddings):
"""处理Embedding层权重
"""
embeddings = embeddings.astype(K.floatx()) # 防止np.average报错
if self.keep_tokens is not None:
embeddings = embeddings[self.keep_tokens]
if self.compound_tokens is not None:
ext_embeddings = []
for item in self.compound_tokens:
if isinstance(item, list):
item = (item, [1] * len(item))
ext_embeddings.append(
np.average(embeddings[item[0]], 0, item[1])
)
embeddings = np.concatenate([embeddings, ext_embeddings], 0)
return embeddings
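# Sketch of the compound-token extension (illustrative numbers): each entry of
# compound_tokens is a list of source ids, optionally paired with weights, and
# its embedding is the weighted average of the source rows:
#
#     import numpy as np
#     embeddings = np.eye(4)            # 4 tokens, 4-dim embeddings
#     item = ([1, 2], [1, 3])           # ids [1, 2] with weights [1, 3]
#     np.average(embeddings[item[0]], 0, item[1])
#     # -> array([0.  , 0.25, 0.75, 0.  ])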
def load_variable(self, checkpoint, name):
"""加载单个变量的函数
"""
if isinstance(checkpoint, dict):
return checkpoint[name]
else:
return tf.train.load_variable(checkpoint, name)
def create_variable(self, name, value, dtype=None):
"""创建一个变量
"""
dtype = dtype or K.floatx()
return K.variable(
self.initializer(value.shape, dtype), dtype, name=name
), value
def variable_mapping(self):
"""构建keras层与checkpoint的变量名之间的映射表
"""
return {}
def load_weights_from_checkpoint(self, checkpoint, mapping=None):
"""根据mapping从checkpoint加载权重
"""
mapping = mapping or self.variable_mapping()
mapping = {self.prefixed(k): v for k, v in mapping.items()}
mapping = {k: v for k, v in mapping.items() if k in self.layers}
weight_value_pairs = []
for layer, variables in mapping.items():
layer = self.layers[layer]
weights, values = [], []
for w, v in zip(layer.trainable_weights, variables):  # allow skipping missing weights
try:
values.append(self.load_variable(checkpoint, v))
weights.append(w)
except Exception as e:
if self.ignore_invalid_weights:
print('%s, but ignored.' % e)  # Exception has no .message attribute in Python 3
else:
raise e
if isinstance(layer, MultiHeadAttention):
"""如果key_size不等于head_size,则可以通过
正交矩阵将相应的权重投影到合适的shape。
"""
count = 2
if layer.use_bias:
count += 2
heads = self.num_attention_heads
head_size = self.attention_head_size
key_size = self.attention_key_size
W = np.linalg.qr(np.random.randn(key_size, head_size))[0].T
if layer.attention_scale:
W = W * key_size**0.25 / head_size**0.25
for w, v in zip(weights, values):
if is_one_of(w, layer.trainable_weights[:count]):
w_shape, v_shape = K.int_shape(w), v.shape
if w_shape[-1] != v_shape[-1]:
pre_shape = w_shape[:-1]
v = v.reshape(pre_shape + (heads, head_size))
v = np.dot(v, W)
v = v.reshape(pre_shape + (heads * key_size,))
values[weights.index(w)] = v
weight_value_pairs.extend(zip(weights, values))
K.batch_set_value(weight_value_pairs)
def save_weights_as_checkpoint(self, filename, mapping=None, dtype=None):
"""根据mapping将权重保存为checkpoint格式
"""
mapping = mapping or self.variable_mapping()
mapping = {self.prefixed(k): v for k, v in mapping.items()}
mapping = {k: v for k, v in mapping.items() if k in self.layers}
with tf.Graph().as_default():
all_variables, all_values = [], []
for layer, variables in mapping.items():
layer = self.layers[layer]
values = K.batch_get_value(layer.trainable_weights)
for name, value in zip(variables, values):
variable, value = self.create_variable(name, value, dtype)
all_variables.append(variable)
all_values.append(value)
with tf.Session() as sess:
K.batch_set_value(zip(all_variables, all_values))
saver = tf.train.Saver()
saver.save(sess, filename)
class LM_Mask(object):
"""定义下三角Attention Mask(语言模型用)
"""
def compute_attention_bias(self, inputs=None):
"""通过idxs序列的比较来得到对应的mask
"""
if self.attention_bias is None:
def lm_mask(s):
seq_len = K.shape(s)[1]
idxs = K.arange(0, seq_len)
mask = idxs[None, :] <= idxs[:, None]
mask = K.cast(mask, K.floatx())
return -(1 - mask[None, None]) * 1e12
self.attention_bias = self.apply(
inputs=self.inputs[0],
layer=Lambda,
function=lm_mask,
name='Attention-LM-Mask'
)
return self.attention_bias
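# NumPy sketch of the resulting bias for seq_len=3 (illustrative only):
# positions above the diagonal get a large negative bias, so each token can
# only attend to itself and earlier tokens:
#
#     import numpy as np
#     idxs = np.arange(3)
#     mask = (idxs[None, :] <= idxs[:, None]).astype('float32')
#     bias = -(1 - mask) * 1e12
#     # bias[i, j] == -1e12 wherever j > i (the upper triangle)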
class UniLM_Mask(object):
"""定义UniLM的Attention Mask(Seq2Seq模型用)
其中source和target的分区,由segment_ids来表示。
UniLM: https://arxiv.org/abs/1905.03197
"""
def compute_attention_bias(self, inputs=None):
"""通过idxs序列的比较来得到对应的mask
"""
if self.attention_bias is None:
def unilm_mask(s):
idxs = K.cumsum(s, axis=1)
mask = idxs[:, None, :] <= idxs[:, :, None]
mask = K.cast(mask, K.floatx())
return -(1 - mask[:, None]) * 1e12
self.attention_bias = self.apply(
inputs=self.inputs[1],
layer=Lambda,
function=unilm_mask,
name='Attention-UniLM-Mask'
)
return self.attention_bias
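# NumPy sketch of the UniLM mask (illustrative only): with segment_ids
# s = [0, 0, 1, 1], cumsum gives [0, 0, 1, 2], so source tokens attend to the
# whole source while target tokens attend to the source plus earlier targets:
#
#     import numpy as np
#     s = np.array([[0, 0, 1, 1]])
#     idxs = np.cumsum(s, axis=1)
#     mask = (idxs[:, None, :] <= idxs[:, :, None]).astype('float32')
#     # row i of mask[0] marks which positions token i may attend to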
class BERT(Transformer):
"""构建BERT模型
"""
def __init__(
self,
max_position,  # maximum sequence length
segment_vocab_size=2,  # number of segment types
with_pool=False,  # whether to include the Pooler part
with_nsp=False,  # whether to include the NSP part
with_mlm=False,  # whether to include the MLM part
hierarchical_position=None,  # whether to use hierarchically decomposed position embeddings
custom_position_ids=False,  # whether position ids are passed in manually
shared_segment_embeddings=False,  # if True, segments share the token embedding
**kwargs  # remaining arguments
):
super(BERT, self).__init__(**kwargs)
self.max_position = max_position
self.segment_vocab_size = segment_vocab_size
self.with_pool = with_pool
self.with_nsp = with_nsp
self.with_mlm = with_mlm
self.hierarchical_position = hierarchical_position
self.custom_position_ids = custom_position_ids
self.shared_segment_embeddings = shared_segment_embeddings
if self.with_nsp and not self.with_pool:
self.with_pool = True
def get_inputs(self):
"""BERT的输入是token_ids和segment_ids
(但允许自行传入位置id,以实现一些特殊需求)
"""
x_in = self.apply(
layer=Input, shape=(self.sequence_length,), name='Input-Token'
)
inputs = [x_in]
if self.segment_vocab_size > 0:
s_in = self.apply(
layer=Input,
shape=(self.sequence_length,),
name='Input-Segment'
)
inputs.append(s_in)
if self.custom_position_ids:
p_in = self.apply(
layer=Input,
shape=(self.sequence_length,),
name='Input-Position'
)
inputs.append(p_in)
return inputs
def apply_embeddings(self, inputs):
"""BERT的embedding是token、position、segment三者embedding之和
"""
inputs = inputs[:]
x = inputs.pop(0)
if self.segment_vocab_size > 0:
s = inputs.pop(0)
if self.custom_position_ids:
p = inputs.pop(0)
else:
p = None
z = self.layer_norm_conds[0]
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
if self.segment_vocab_size > 0:
if self.shared_segment_embeddings:
name = 'Embedding-Token'
else:
name = 'Embedding-Segment'
s = self.apply(
inputs=s,
layer=Embedding,
input_dim=self.segment_vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
name=name
)
x = self.apply(
inputs=[x, s], layer=Add, name='Embedding-Token-Segment'
)
x = self.apply(
inputs=self.simplify([x, p]),
layer=PositionEmbedding,
input_dim=self.max_position,
output_dim=self.embedding_size,
merge_mode='add',
hierarchical=self.hierarchical_position,
embeddings_initializer=self.initializer,
custom_position_ids=self.custom_position_ids,
name='Embedding-Position'
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='Embedding-Norm'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Embedding-Dropout'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Embedding-Mapping'
)
return x
def apply_main_layers(self, inputs, index):
"""BERT的主体是基于Self-Attention的模块
顺序:Att --> Add --> LN --> FFN --> Add --> LN
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
feed_forward_name = 'Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
# Self Attention
xi, x, arguments = x, [x, x, x], {'a_bias': None}
if attention_mask is not None:
arguments['a_bias'] = True
x.append(attention_mask)
x = self.apply(
inputs=x,
layer=MultiHeadAttention,
arguments=arguments,
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
return x
def apply_final_layers(self, inputs):
"""根据剩余参数决定输出
"""
x = inputs
z = self.layer_norm_conds[0]
outputs = [x]
if self.with_pool:
# Pooler part (extract the CLS vector)
x = outputs[0]
x = self.apply(
inputs=x,
layer=Lambda,
function=lambda x: x[:, 0],
name='Pooler'
)
pool_activation = 'tanh' if self.with_pool is True else self.with_pool
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
activation=pool_activation,
kernel_initializer=self.initializer,
name='Pooler-Dense'
)
if self.with_nsp:
# Next Sentence Prediction part
x = self.apply(
inputs=x,
layer=Dense,
units=2,
activation='softmax',
kernel_initializer=self.initializer,
name='NSP-Proba'
)
outputs.append(x)
if self.with_mlm:
# Masked Language Model part
x = outputs[0]
x = self.apply(
inputs=x,
layer=Dense,
units=self.embedding_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name='MLM-Dense'
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='MLM-Norm'
)
x = self.apply(
inputs=x,
layer=Embedding,
arguments={'mode': 'dense'},
name='Embedding-Token'
)
x = self.apply(inputs=x, layer=BiasAdd, name='MLM-Bias')
mlm_activation = 'softmax' if self.with_mlm is True else self.with_mlm
x = self.apply(
inputs=x,
layer=Activation,
activation=mlm_activation,
name='MLM-Activation'
)
outputs.append(x)
if len(outputs) == 1:
outputs = outputs[0]
elif len(outputs) == 2:
outputs = outputs[1]
else:
outputs = outputs[1:]
return outputs
def load_variable(self, checkpoint, name):
"""加载单个变量的函数
"""
variable = super(BERT, self).load_variable(checkpoint, name)
if name in [
'bert/embeddings/word_embeddings',
'cls/predictions/output_bias',
]:
return self.load_embeddings(variable)
elif name == 'cls/seq_relationship/output_weights':
return variable.T
else:
return variable
def create_variable(self, name, value, dtype=None):
"""在tensorflow中创建一个变量
"""
if name == 'cls/seq_relationship/output_weights':
value = value.T
return super(BERT, self).create_variable(name, value, dtype)
def variable_mapping(self):
"""映射到官方BERT权重格式
"""
mapping = {
'Embedding-Token': ['bert/embeddings/word_embeddings'],
'Embedding-Segment': ['bert/embeddings/token_type_embeddings'],
'Embedding-Position': ['bert/embeddings/position_embeddings'],
'Embedding-Norm': [
'bert/embeddings/LayerNorm/beta',
'bert/embeddings/LayerNorm/gamma',
],
'Embedding-Mapping': [
'bert/encoder/embedding_hidden_mapping_in/kernel',
'bert/encoder/embedding_hidden_mapping_in/bias',
],
'Pooler-Dense': [
'bert/pooler/dense/kernel',
'bert/pooler/dense/bias',
],
'NSP-Proba': [
'cls/seq_relationship/output_weights',
'cls/seq_relationship/output_bias',
],
'MLM-Dense': [
'cls/predictions/transform/dense/kernel',
'cls/predictions/transform/dense/bias',
],
'MLM-Norm': [
'cls/predictions/transform/LayerNorm/beta',
'cls/predictions/transform/LayerNorm/gamma',
],
'MLM-Bias': ['cls/predictions/output_bias'],
}
for i in range(self.num_hidden_layers):
prefix = 'bert/encoder/layer_%d/' % i
mapping.update({
'Transformer-%d-MultiHeadSelfAttention' % i: [
prefix + 'attention/self/query/kernel',
prefix + 'attention/self/query/bias',
prefix + 'attention/self/key/kernel',
prefix + 'attention/self/key/bias',
prefix + 'attention/self/value/kernel',
prefix + 'attention/self/value/bias',
prefix + 'attention/output/dense/kernel',
prefix + 'attention/output/dense/bias',
],
'Transformer-%d-MultiHeadSelfAttention-Norm' % i: [
prefix + 'attention/output/LayerNorm/beta',
prefix + 'attention/output/LayerNorm/gamma',
],
'Transformer-%d-FeedForward' % i: [
prefix + 'intermediate/dense/kernel',
prefix + 'intermediate/dense/bias',
prefix + 'output/dense/kernel',
prefix + 'output/dense/bias',
],
'Transformer-%d-FeedForward-Norm' % i: [
prefix + 'output/LayerNorm/beta',
prefix + 'output/LayerNorm/gamma',
],
})
return mapping
class ALBERT(BERT):
"""构建ALBERT模型
"""
def apply_main_layers(self, inputs, index):
"""ALBERT的主体是基于Self-Attention的模块
顺序:Att --> Add --> LN --> FFN --> Add --> LN
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Transformer-MultiHeadSelfAttention'
feed_forward_name = 'Transformer-FeedForward'
attention_mask = self.compute_attention_bias(index)
# Self Attention
xi, x, arguments = x, [x, x, x], {'a_bias': None}
if attention_mask is not None:
arguments['a_bias'] = True
x.append(attention_mask)
x = self.apply(
inputs=x,
layer=MultiHeadAttention,
arguments=arguments,
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
return x
def variable_mapping(self):
"""映射到官方ALBERT权重格式
"""
mapping = super(ALBERT, self).variable_mapping()
prefix = 'bert/encoder/transformer/group_0/inner_group_0/'
mapping.update({
'Transformer-MultiHeadSelfAttention': [
prefix + 'attention_1/self/query/kernel',
prefix + 'attention_1/self/query/bias',
prefix + 'attention_1/self/key/kernel',
prefix + 'attention_1/self/key/bias',
prefix + 'attention_1/self/value/kernel',
prefix + 'attention_1/self/value/bias',
prefix + 'attention_1/output/dense/kernel',
prefix + 'attention_1/output/dense/bias',
],
'Transformer-MultiHeadSelfAttention-Norm': [
prefix + 'LayerNorm/beta',
prefix + 'LayerNorm/gamma',
],
'Transformer-FeedForward': [
prefix + 'ffn_1/intermediate/dense/kernel',
prefix + 'ffn_1/intermediate/dense/bias',
prefix + 'ffn_1/intermediate/output/dense/kernel',
prefix + 'ffn_1/intermediate/output/dense/bias',
],
'Transformer-FeedForward-Norm': [
prefix + 'LayerNorm_1/beta',
prefix + 'LayerNorm_1/gamma',
],
})
return mapping
class ALBERT_Unshared(BERT):
"""解开ALBERT共享约束,当成BERT用
"""
def variable_mapping(self):
"""映射到官方ALBERT权重格式
"""
mapping = super(ALBERT_Unshared, self).variable_mapping()
prefix = 'bert/encoder/transformer/group_0/inner_group_0/'
for i in range(self.num_hidden_layers):
mapping.update({
'Transformer-%d-MultiHeadSelfAttention' % i: [
prefix + 'attention_1/self/query/kernel',
prefix + 'attention_1/self/query/bias',
prefix + 'attention_1/self/key/kernel',
prefix + 'attention_1/self/key/bias',
prefix + 'attention_1/self/value/kernel',
prefix + 'attention_1/self/value/bias',
prefix + 'attention_1/output/dense/kernel',
prefix + 'attention_1/output/dense/bias',
],
'Transformer-%d-MultiHeadSelfAttention-Norm' % i: [
prefix + 'LayerNorm/beta',
prefix + 'LayerNorm/gamma',
],
'Transformer-%d-FeedForward' % i: [
prefix + 'ffn_1/intermediate/dense/kernel',
prefix + 'ffn_1/intermediate/dense/bias',
prefix + 'ffn_1/intermediate/output/dense/kernel',
prefix + 'ffn_1/intermediate/output/dense/bias',
],
'Transformer-%d-FeedForward-Norm' % i: [
prefix + 'LayerNorm_1/beta',
prefix + 'LayerNorm_1/gamma',
],
})
return mapping
class NEZHA(BERT):
"""华为推出的NAZHA模型
链接:https://arxiv.org/abs/1909.00204
"""
def apply_embeddings(self, inputs):
"""NEZHA的embedding是token、segment两者embedding之和
"""
inputs = inputs[:]
x = inputs.pop(0)
if self.segment_vocab_size > 0:
s = inputs.pop(0)
z = self.layer_norm_conds[0]
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
if self.segment_vocab_size > 0:
if self.shared_segment_embeddings:
name = 'Embedding-Token'
else:
name = 'Embedding-Segment'
s = self.apply(
inputs=s,
layer=Embedding,
input_dim=self.segment_vocab_size,  # was hardcoded 2; use the configured size
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
name=name
)
x = self.apply(
inputs=[x, s], layer=Add, name='Embedding-Token-Segment'
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='Embedding-Norm'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Embedding-Dropout'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Embedding-Mapping'
)
return x
def apply_main_layers(self, inputs, index):
"""NEZHA的主体是基于Self-Attention的模块
顺序:Att --> Add --> LN --> FFN --> Add --> LN
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
feed_forward_name = 'Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
position_bias = self.compute_position_bias(x)
# Self Attention
xi, x = x, [x, x, x, position_bias]
arguments = {'a_bias': None, 'p_bias': 'typical_relative'}
if attention_mask is not None:
arguments['a_bias'] = True
x.insert(3, attention_mask)
x = self.apply(
inputs=x,
layer=MultiHeadAttention,
arguments=arguments,
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
return x
def compute_position_bias(self, inputs=None):
"""经典相对位置编码
"""
if self.position_bias is None:
x = inputs
self.position_bias = self.apply(
inputs=[x, x],
layer=RelativePositionEmbedding,
input_dim=2 * 64 + 1,
output_dim=self.attention_key_size,
embeddings_initializer='Sinusoidal',
name='Embedding-Relative-Position',
trainable=False
)
return self.position_bias
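# Note (assumed from the construction above): input_dim = 2 * 64 + 1 covers
# relative offsets clipped to [-64, 64], and the embeddings are sinusoidal and
# frozen (trainable=False), matching NEZHA's functional relative encoding.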
class RoFormer(NEZHA):
"""旋转式位置编码的BERT模型
链接:https://kexue.fm/archives/8265
"""
def apply_main_layers(self, inputs, index):
"""RoFormer的主体是基于Self-Attention的模块
顺序:Att --> Add --> LN --> FFN --> Add --> LN
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
feed_forward_name = 'Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
position_bias = self.compute_position_bias(x)
# Self Attention
xi, x = x, [x, x, x, position_bias]
arguments = {'a_bias': None, 'p_bias': 'rotary'}
if attention_mask is not None:
arguments['a_bias'] = True
x.insert(3, attention_mask)
x = self.apply(
inputs=x,
layer=MultiHeadAttention,
arguments=arguments,
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
return x
def compute_position_bias(self, inputs=None):
"""Sinusoidal位置编码(直接返回)
"""
if self.position_bias is None:
x = inputs
self.position_bias = self.apply(
inputs=x,
layer=SinusoidalPositionEmbedding,
output_dim=self.attention_key_size,
merge_mode='zero',
name='Embedding-Rotary-Position'
)
return self.position_bias
class ELECTRA(BERT):
"""Google推出的ELECTRA模型
链接:https://arxiv.org/abs/2003.10555
"""
@insert_arguments(with_discriminator=False)
@delete_arguments('with_pool', 'with_mlm')
def __init__(
self,
max_position,  # maximum sequence length
**kwargs  # remaining arguments
):
super(ELECTRA, self).__init__(max_position, **kwargs)
def apply_final_layers(self, inputs):
x = inputs
if self.with_discriminator:
if self.with_discriminator is True:
final_activation = 'sigmoid'
else:
final_activation = self.with_discriminator
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name='Discriminator-Dense'
)
x = self.apply(
inputs=x,
layer=Dense,
units=1,
activation=final_activation,
kernel_initializer=self.initializer,
name='Discriminator-Prediction'
)
return x
def load_variable(self, checkpoint, name):
"""加载单个变量的函数
"""
variable = super(ELECTRA, self).load_variable(checkpoint, name)
if name == 'electra/embeddings/word_embeddings':
return self.load_embeddings(variable)
else:
return variable
def variable_mapping(self):
mapping = super(ELECTRA, self).variable_mapping()
mapping['Embedding-Mapping'] = [
'electra/embeddings_project/kernel',
'electra/embeddings_project/bias',
]
mapping = {
k: [i.replace('bert/', 'electra/') for i in v]
for k, v in mapping.items()
}
mapping['Discriminator-Dense'] = [
'discriminator_predictions/dense/kernel',
'discriminator_predictions/dense/bias',
]
mapping['Discriminator-Prediction'] = [
'discriminator_predictions/dense_1/kernel',
'discriminator_predictions/dense_1/bias',
]
return mapping
class GPT(LM_Mask, BERT):
"""构建GPT模型
链接:https://github.com/openai/finetune-transformer-lm
"""
@insert_arguments(final_activation='softmax')
@delete_arguments('with_pool', 'with_mlm')
def __init__(self, **kwargs):
super(GPT, self).__init__(**kwargs)
def apply_embeddings(self, inputs):
"""GPT的embedding是token、position、segment三者embedding之和
跟BERT的主要区别是三者相加之后没有加LayerNormalization层。
"""
inputs = inputs[:]
x = inputs.pop(0)
if self.segment_vocab_size > 0:
s = inputs.pop(0)
if self.custom_position_ids:
p = inputs.pop(0)
else:
p = None
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
if self.segment_vocab_size > 0:
if self.shared_segment_embeddings:
name = 'Embedding-Token'
else:
name = 'Embedding-Segment'
s = self.apply(
inputs=s,
layer=Embedding,
input_dim=self.segment_vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
name=name
)
x = self.apply(
inputs=[x, s], layer=Add, name='Embedding-Token-Segment'
)
x = self.apply(
inputs=self.simplify([x, p]),
layer=PositionEmbedding,
input_dim=self.max_position,
output_dim=self.embedding_size,
merge_mode='add',
hierarchical=self.hierarchical_position,
embeddings_initializer=self.initializer,
custom_position_ids=self.custom_position_ids,
name='Embedding-Position'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Embedding-Dropout'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Embedding-Mapping'
)
return x
def apply_final_layers(self, inputs):
"""剩余部分
"""
x = inputs
# Language Model part
x = self.apply(
inputs=x,
layer=Embedding,
arguments={'mode': 'dense'},
name='Embedding-Token'
)
x = self.apply(
inputs=x,
layer=Activation,
activation=self.final_activation,
name='LM-Activation'
)
return x
def load_variable(self, checkpoint, name):
"""加载单个变量的函数
"""
variable = super(GPT, self).load_variable(checkpoint, name)
if name == 'gpt/embeddings/word_embeddings':
return self.load_embeddings(variable)
else:
return variable
def variable_mapping(self):
"""映射到TF版GPT权重格式
"""
mapping = super(GPT, self).variable_mapping()
mapping = {
k: [
i.replace('bert/', 'gpt/').replace('encoder', 'transformer')
for i in v
]
for k, v in mapping.items()
}
return mapping
class GPT2(GPT):
"""构建GPT2模型
链接: https://github.com/openai/gpt-2
"""
def get_inputs(self):
"""GPT2的输入是token_ids
"""
x_in = self.apply(
layer=Input, shape=(self.sequence_length,), name='Input-Token'
)
return x_in
def apply_embeddings(self, inputs):
"""GPT2的embedding是token、position两者embedding之和
"""
x = inputs
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
x = self.apply(
inputs=x,
layer=PositionEmbedding,
input_dim=self.max_position,
output_dim=self.embedding_size,
merge_mode='add',
hierarchical=self.hierarchical_position,
embeddings_initializer=self.initializer,
name='Embedding-Position'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Embedding-Mapping'
)
return x
def apply_main_layers(self, inputs, index):
"""GPT2的主体是基于Self-Attention的模块
顺序:LN --> Att --> Add --> LN --> FFN --> Add
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
feed_forward_name = 'Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
# Self Attention
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
epsilon=1e-5,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
x = self.apply(
inputs=[x, x, x, attention_mask],
layer=MultiHeadAttention,
arguments={'a_bias': True},
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
epsilon=1e-5,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
return x
def apply_final_layers(self, inputs):
"""剩余部分
"""
x = inputs
z = self.layer_norm_conds[0]
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
epsilon=1e-5,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='Output-Norm'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Output-Dropout'
)
x = super(GPT2, self).apply_final_layers(x)
return x
def variable_mapping(self):
"""映射到TF版GPT2权重格式
"""
mapping = super(GPT2, self).variable_mapping()
mapping = {
k: [i.replace('output/LayerNorm', 'input/LayerNorm') for i in v]
for k, v in mapping.items()
}
mapping['Output-Norm'] = [
'gpt/output/LayerNorm/beta',
'gpt/output/LayerNorm/gamma',
]
return mapping
class GPT2_ML(GPT):
"""构建GPT2_ML模型
链接: https://github.com/imcaspar/gpt2-ml
注意:GPT2_ML虽然号称GPT2,但是它的结构其实更接近GPT,它自称GPT2的
原因大概是因为它开源的版本参数量达到了GPT2的15亿参数。
"""
def get_inputs(self):
"""GPT2_ML的输入是token_ids
"""
x_in = self.apply(
layer=Input, shape=(self.sequence_length,), name='Input-Token'
)
return x_in
def apply_embeddings(self, inputs):
"""GPT2_ML的embedding是token、position两者embedding之和
"""
x = inputs
z = self.layer_norm_conds[0]
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
x = self.apply(
inputs=x,
layer=PositionEmbedding,
input_dim=self.max_position,
output_dim=self.embedding_size,
merge_mode='add',
hierarchical=self.hierarchical_position,
embeddings_initializer=self.initializer,
name='Embedding-Position'
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
epsilon=1e-5,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='Embedding-Norm'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Embedding-Mapping'
)
return x
def apply_main_layers(self, inputs, index):
"""GPT2_ML的主体是基于Self-Attention的模块
顺序:Att --> LN --> FFN --> Add --> LN
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index
feed_forward_name = 'Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
# Self Attention
xi, x, arguments = x, [x, x, x, attention_mask], {'a_bias': True}
x = self.apply(
inputs=x,
layer=MultiHeadAttention,
arguments=arguments,
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
epsilon=1e-5,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm-0' % feed_forward_name
)
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
epsilon=1e-5,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm-1' % feed_forward_name
)
return x
def load_variable(self, checkpoint, name):
"""加载单个变量的函数
"""
variable = super(GPT2_ML, self).load_variable(checkpoint, name)
if name == 'newslm/embeddings/word_embed':
return self.load_embeddings(variable)
else:
return variable
def variable_mapping(self):
"""映射到官方GPT2_ML权重格式
"""
mapping = {
'Embedding-Token': ['newslm/embeddings/word_embed'],
'Embedding-Position': ['newslm/embeddings/pos_embed'],
'Embedding-Norm': [
'newslm/embeddings/LayerNorm_embed_norm/beta',
'newslm/embeddings/LayerNorm_embed_norm/gamma',
],
}
for i in range(self.num_hidden_layers):
prefix = 'newslm/layer%02d/' % i
mapping.update({
'Transformer-%d-MultiHeadSelfAttention' % i: [
prefix + 'query_layer/kernel',
prefix + 'query_layer/bias',
prefix + 'key_layer/kernel',
prefix + 'key_layer/bias',
prefix + 'value_layer/kernel',
prefix + 'value_layer/bias',
prefix + 'context_projection_layer/kernel',
prefix + 'context_projection_layer/bias',
],
'Transformer-%d-FeedForward-Norm-0' % i: [
prefix + 'LayerNorm_mlp_ln0/beta',
prefix + 'LayerNorm_mlp_ln0/gamma',
],
'Transformer-%d-FeedForward' % i: [
prefix + 'intermediate/kernel',
prefix + 'intermediate/bias',
prefix + 'output/kernel',
prefix + 'output/bias',
],
'Transformer-%d-FeedForward-Norm-1' % i: [
prefix + 'LayerNorm_mlp_ln1/beta',
prefix + 'LayerNorm_mlp_ln1/gamma',
],
})
return mapping
class T5_Base(Transformer):
"""Google的T5模型(基类)
注意T5有两个版本,一开始放出来的版本称为t5.1.0,而后来放出了一个升级
版本称为t5.1.1,两者结构略有不同,包括后来放出来的多国语言版T5也采用
了t5.1.1的结构。
t5.1.0: https://github.com/google-research/text-to-text-transfer-transformer
t5.1.1: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/released_checkpoints.md#t511
multilingual-t5: https://github.com/google-research/multilingual-t5
"""
@insert_arguments(version='t5.1.0')
def __init__(self, **kwargs):
super(T5_Base, self).__init__(**kwargs)
def load_variable(self, checkpoint, name):
"""加载单个变量的函数
"""
variable = super(T5_Base, self).load_variable(checkpoint, name)
if name == 'shared/embedding':
return self.load_embeddings(variable)
elif name == 'decoder/logits/kernel':
return self.load_embeddings(variable.T).T
elif 'relative_attention_bias' in name:
return variable.T
else:
return variable
def create_variable(self, name, value, dtype=None):
"""在tensorflow中创建一个变量
"""
if 'relative_attention_bias' in name:
value = value.T
return super(T5_Base, self).create_variable(name, value, dtype)
def variable_mapping(self):
"""映射到官方T5权重格式
"""
mapping = {
'Embedding-Token': ['shared/embedding'],
'Encoder-Embedding-Relative-Position': [
'encoder/block_000/layer_000/SelfAttention/relative_attention_bias'
],
'Encoder-Output-Norm': ['encoder/final_layer_norm/scale'],
'Decoder-Embedding-Relative-Position': [
'decoder/block_000/layer_000/SelfAttention/relative_attention_bias',
],
'Decoder-Output-Norm': ['decoder/final_layer_norm/scale'],
}
for i in range(self.num_hidden_layers):
# Encoder body
prefix = 'encoder/block_%03d/' % i
mapping.update({
'Encoder-Transformer-%d-MultiHeadSelfAttention' % i: [
prefix + 'layer_000/SelfAttention/q',
prefix + 'layer_000/SelfAttention/k',
prefix + 'layer_000/SelfAttention/v',
prefix + 'layer_000/SelfAttention/o',
],
'Encoder-Transformer-%d-MultiHeadSelfAttention-Norm' % i: [
prefix + 'layer_000/layer_norm/scale',
],
'Encoder-Transformer-%d-FeedForward' % i: [
prefix + 'layer_001/DenseReluDense/wi/kernel',
prefix + 'layer_001/DenseReluDense/wo/kernel',
],
'Encoder-Transformer-%d-FeedForward-Norm' % i: [
prefix + 'layer_001/layer_norm/scale',
],
})
# Decoder body
prefix = 'decoder/block_%03d/' % i
mapping.update({
'Decoder-Transformer-%d-MultiHeadSelfAttention' % i: [
prefix + 'layer_000/SelfAttention/q',
prefix + 'layer_000/SelfAttention/k',
prefix + 'layer_000/SelfAttention/v',
prefix + 'layer_000/SelfAttention/o',
],
'Decoder-Transformer-%d-MultiHeadSelfAttention-Norm' % i: [
prefix + 'layer_000/layer_norm/scale',
],
'Decoder-Transformer-%d-MultiHeadCrossAttention' % i: [
prefix + 'layer_001/EncDecAttention/q',
prefix + 'layer_001/EncDecAttention/k',
prefix + 'layer_001/EncDecAttention/v',
prefix + 'layer_001/EncDecAttention/o',
],
'Decoder-Transformer-%d-MultiHeadCrossAttention-Norm' % i: [
prefix + 'layer_001/layer_norm/scale',
],
'Decoder-Transformer-%d-FeedForward' % i: [
prefix + 'layer_002/DenseReluDense/wi/kernel',
prefix + 'layer_002/DenseReluDense/wo/kernel',
],
'Decoder-Transformer-%d-FeedForward-Norm' % i: [
prefix + 'layer_002/layer_norm/scale',
],
})
if self.version == 't5.1.1':
mapping['Encoder-Output-Norm'] = ['encoder/rms_norm/scale']
mapping['Decoder-Output-Norm'] = ['decoder/rms_norm/scale']
mapping['Decoder-Output-LM'] = ['decoder/logits/kernel']
mapping = {
k: [i.replace('layer_norm', 'rms_norm') for i in v]
for k, v in mapping.items()
}
for i in range(self.num_hidden_layers):
for layer in [
'Encoder-Transformer-%d-FeedForward' % i,
'Decoder-Transformer-%d-FeedForward' % i
]:
mapping[layer] = [
mapping[layer][0][:-7] + '_0' + mapping[layer][0][-7:],
mapping[layer][0][:-7] + '_1' + mapping[layer][0][-7:],
mapping[layer][1]
]
return mapping
class T5_Encoder(T5_Base):
"""Google的T5模型(Encoder)
"""
def get_inputs(self):
"""T5的Encoder的输入只有token_ids
"""
x_in = self.apply(
layer=Input,
shape=(self.sequence_length,),
name='Encoder-Input-Token'
)
return x_in
def apply_embeddings(self, inputs):
"""T5的embedding只有token embedding,
并把relative position embedding准备好,待attention使用。
"""
x = inputs
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Encoder-Embedding-Dropout'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Encoder-Embedding-Mapping'
)
return x
def apply_main_layers(self, inputs, index):
"""T5的Encoder的主体是基于Self-Attention的模块
顺序:LN --> Att --> Add --> LN --> FFN --> Add
"""
x = inputs
z = self.layer_norm_conds[0]
attention_name = 'Encoder-Transformer-%d-MultiHeadSelfAttention' % index
feed_forward_name = 'Encoder-Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
position_bias = self.compute_position_bias(x)
# Self Attention
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name
)
x = self.apply(
inputs=[x, x, x, position_bias],
layer=MultiHeadAttention,
arguments={'p_bias': 't5_relative'},
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
use_bias=False,
attention_scale=False,
kernel_initializer=self.initializer,
name=attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
use_bias=False,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
return x
def apply_final_layers(self, inputs):
"""剩余部分
"""
x = inputs
z = self.layer_norm_conds[0]
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='Encoder-Output-Norm'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Encoder-Output-Dropout'
)
return x
def compute_position_bias(self, inputs=None):
"""T5相对位置编码
"""
if self.position_bias is None:
x = inputs
p = self.apply(
inputs=[x, x],
layer=RelativePositionEmbeddingT5,
input_dim=32,
output_dim=self.num_attention_heads,
bidirectional=True,
embeddings_initializer=self.initializer,
name='Encoder-Embedding-Relative-Position'
)
self.position_bias = p
return self.position_bias
class T5_Decoder(LM_Mask, T5_Base):
"""Google的T5模型(Decoder)
"""
def __init__(self, with_lm=True, **kwargs):
super(T5_Decoder, self).__init__(**kwargs)
self.with_lm = with_lm
def get_inputs(self):
"""T5的Decoder的输入为context序列和token_ids
"""
c_in = self.apply(
layer=Input,
shape=(self.sequence_length, self.hidden_size),
name='Input-Context'
)
x_in = self.apply(
layer=Input,
shape=(self.sequence_length,),
name='Decoder-Input-Token'
)
return [c_in, x_in]
def apply_embeddings(self, inputs):
"""T5的embedding只有token embedding,
并把relative position embedding准备好,待attention使用。
"""
c, x = inputs
c = self.apply(
inputs=c, layer=Masking, mask_value=0.0, name='Masked-Context'
)
x = self.apply(
inputs=x,
layer=Embedding,
input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
mask_zero=True,
name='Embedding-Token'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Decoder-Embedding-Dropout'
)
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.hidden_size,
kernel_initializer=self.initializer,
name='Decoder-Embedding-Mapping'
)
return [c, x]
def apply_main_layers(self, inputs, index):
"""T5的Dencoder主体是基于Self-Attention、Cross-Attention的模块
顺序:LN --> Att1 --> Add --> LN --> Att2 --> Add --> LN --> FFN --> Add
"""
c, x = inputs
z = self.layer_norm_conds[0]
self_attention_name = 'Decoder-Transformer-%d-MultiHeadSelfAttention' % index
cross_attention_name = 'Decoder-Transformer-%d-MultiHeadCrossAttention' % index
feed_forward_name = 'Decoder-Transformer-%d-FeedForward' % index
attention_mask = self.compute_attention_bias(index)
position_bias = self.compute_position_bias([x, c])
# Self Attention
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % self_attention_name
)
x = self.apply(
inputs=[x, x, x, attention_mask, position_bias[0]],
layer=MultiHeadAttention,
arguments={
'a_bias': True,
'p_bias': 't5_relative'
},
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
use_bias=False,
attention_scale=False,
kernel_initializer=self.initializer,
name=self_attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % self_attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % self_attention_name
)
# Cross Attention
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % cross_attention_name
)
x = self.apply(
inputs=[x, c, c, position_bias[1]],
layer=MultiHeadAttention,
arguments={
'a_bias': None,
'p_bias': 't5_relative'
},
heads=self.num_attention_heads,
head_size=self.attention_head_size,
out_dim=self.hidden_size,
key_size=self.attention_key_size,
use_bias=False,
attention_scale=False,
kernel_initializer=self.initializer,
name=cross_attention_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % cross_attention_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % cross_attention_name
)
# Feed Forward
xi = x
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name
)
x = self.apply(
inputs=x,
layer=FeedForward,
units=self.intermediate_size,
activation=self.hidden_act,
use_bias=False,
kernel_initializer=self.initializer,
name=feed_forward_name
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name
)
x = self.apply(
inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
)
return [c, x]
def apply_final_layers(self, inputs):
"""剩余部分
"""
c, x = inputs
z = self.layer_norm_conds[0]
x = self.apply(
inputs=self.simplify([x, z]),
layer=LayerNormalization,
center=False,
epsilon=1e-6,
conditional=(z is not None),
hidden_units=self.layer_norm_conds[1],
hidden_activation=self.layer_norm_conds[2],
hidden_initializer=self.initializer,
name='Decoder-Output-Norm'
)
x = self.apply(
inputs=x,
layer=Dropout,
rate=self.dropout_rate,
name='Decoder-Output-Dropout'
)
x = self.apply(
inputs=x,
layer=Lambda,
function=lambda x: x / self.hidden_size**0.5,
mask=lambda i, m: m,
name='Decoder-Output-Scale'
)
if self.with_lm:
# token-probability prediction part
if self.embedding_size != self.hidden_size:
x = self.apply(
inputs=x,
layer=Dense,
units=self.embedding_size,
kernel_initializer=self.initializer,
name='Decoder-Output-Mapping'
)
lm_activation = 'softmax' if self.with_lm is True else self.with_lm
if self.version == 't5.1.0':
x = self.apply(
inputs=x,
layer=Embedding,
arguments={'mode': 'dense'},
name='Embedding-Token'
)
x = self.apply(
inputs=x,
layer=Activation,
activation=lm_activation,
name='Decoder-Output-LM-Activation'
)
else:
x = self.apply(
inputs=x,
layer=Dense,
units=self.vocab_size,
activation=lm_activation,
use_bias=False,
kernel_initializer=self.initializer,
name='Decoder-Output-LM'
)
return x
def compute_attention_bias(self, inputs=None):
"""修改LM Mask的序列长度(从 self.inputs[0] 改为 self.inputs[1] )
"""
old_inputs = self.inputs[:]
self.inputs = [old_inputs[1]]
mask = super(T5_Decoder, self).compute_attention_bias(inputs)
self.inputs = old_inputs
return mask
def compute_position_bias(self, inputs=None):
"""T5相对位置编码
"""
if self.position_bias is None:
x, c = inputs
p1 = self.apply(
inputs=[x, x],
layer=RelativePositionEmbeddingT5,
input_dim=32,
output_dim=self.num_attention_heads,
bidirectional=False,
embeddings_initializer=self.initializer,
name='Decoder-Embedding-Relative-Position'
)
p2 = self.apply(
inputs=[x, c],
layer=RelativePositionEmbeddingT5,
input_dim=32,
output_dim=self.num_attention_heads,
bidirectional=False,
embeddings_initializer=self.initializer,
name='Decoder-Embedding-Relative-Position'
)
self.position_bias = (p1, p2)
return self.position_bias
class T5(T5_Base):
"""Google的T5模型(Encoder-Decoder)
"""
def __init__(self, **kwargs):
super(T5, self).__init__(**kwargs)
kwargs['layers'] = self.layers
e_name, d_name = 'Encoder', 'Decoder'
if 'name' in kwargs:
e_name = '%s_%s' % (kwargs['name'], e_name)
d_name = '%s_%s' % (kwargs['name'], d_name)
del kwargs['name']  # avoid passing name twice
self._encoder = T5_Encoder(name=e_name, **kwargs)
self._decoder = T5_Decoder(name=d_name, **kwargs)
def build(self, **kwargs):
"""同时构建Encoder和Decoder
"""
self._encoder.build(**kwargs)
self._decoder.build(**kwargs)
self.encoder = self._encoder.model
self.decoder = self._decoder.model
self.inputs = self.encoder.inputs + self.decoder.inputs[1:]
self.outputs = self.decoder(
self.encoder.outputs + self.decoder.inputs[1:]
)
self.model = Model(self.inputs, self.outputs)
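# A minimal usage sketch (the id arrays are illustrative placeholders):
# t5.encoder and t5.decoder are standalone Keras models that share weights
# through the common layers dict, while t5.model wires them end to end:
#
#     t5 = T5(...)   # or build_transformer_model(..., model='t5',
#     t5.build()     #    return_keras_model=False)
#     h = t5.encoder.predict([enc_token_ids])
#     probs = t5.decoder.predict([h, dec_token_ids])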
def extend_with_language_model(BaseModel):
"""添加下三角的Attention Mask(语言模型用)
"""
class LanguageModel(LM_Mask, BaseModel):
"""带下三角Attention Mask的派生模型
"""
def __init__(self, *args, **kwargs):
super(LanguageModel, self).__init__(*args, **kwargs)
self.with_mlm = self.with_mlm or True
return LanguageModel
def extend_with_unified_language_model(BaseModel):
"""添加UniLM的Attention Mask(Seq2Seq模型用)
"""
class UnifiedLanguageModel(UniLM_Mask, BaseModel):
"""带UniLM的Attention Mask的派生模型
UniLM: https://arxiv.org/abs/1905.03197
"""
def __init__(self, *args, **kwargs):
super(UnifiedLanguageModel, self).__init__(*args, **kwargs)
self.with_mlm = self.with_mlm or True
return UnifiedLanguageModel
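# Usage sketch: this wrapper is what build_transformer_model applies for
# application='unilm', e.g. to get a Seq2Seq-capable BERT:
#
#     UniLM_BERT = extend_with_unified_language_model(BERT)
#     # instances then mask attention according to segment_ids, and
#     # with_mlm is forced on so token probabilities are output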
def build_transformer_model(
config_path=None,
checkpoint_path=None,
model='bert',
application='encoder',
return_keras_model=True,
**kwargs
):
"""根据配置文件构建模型,可选加载checkpoint权重
"""
configs = {}
if config_path is not None:
configs.update(json.load(open(config_path)))
configs.update(kwargs)
if 'max_position' not in configs:
configs['max_position'] = configs.get('max_position_embeddings', 512)
if 'dropout_rate' not in configs:
configs['dropout_rate'] = configs.get('hidden_dropout_prob')
if 'segment_vocab_size' not in configs:
configs['segment_vocab_size'] = configs.get('type_vocab_size', 2)
models = {
'bert': BERT,
'albert': ALBERT,
'albert_unshared': ALBERT_Unshared,
'roberta': BERT,
'nezha': NEZHA,
'roformer': RoFormer,
'electra': ELECTRA,
'gpt': GPT,
'gpt2': GPT2,
'gpt2_ml': GPT2_ML,
't5': T5,
't5_encoder': T5_Encoder,
't5_decoder': T5_Decoder,
't5.1.0': T5,
't5.1.0_encoder': T5_Encoder,
't5.1.0_decoder': T5_Decoder,
't5.1.1': T5,
't5.1.1_encoder': T5_Encoder,
't5.1.1_decoder': T5_Decoder,
}
if is_string(model):
model = model.lower()
MODEL = models[model]
if model.startswith('t5.1.1'):
configs['version'] = 't5.1.1'
else:
MODEL = model
application = application.lower()
if application in ['lm', 'unilm'] and model in ['electra', 't5']:
raise ValueError(
'"%s" model can not be used as "%s" application.\n' %
(model, application)
)
if application == 'lm':
MODEL = extend_with_language_model(MODEL)
elif application == 'unilm':
MODEL = extend_with_unified_language_model(MODEL)
transformer = MODEL(**configs)
transformer.build(**configs)
if checkpoint_path is not None:
transformer.load_weights_from_checkpoint(checkpoint_path)
if return_keras_model:
return transformer.model
else:
return transformer
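# A minimal usage sketch (the file paths are illustrative placeholders):
#
#     model = build_transformer_model(
#         config_path='bert_config.json',
#         checkpoint_path='bert_model.ckpt',
#         model='bert',
#     )
#     # `model` is a keras.Model taking [token_ids, segment_ids] and
#     # returning the final hidden sequence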
# -*- coding: utf-8 -*-
# optimizers
import numpy as np
import tensorflow as tf
from bert4keras.backend import keras, K, is_tf_keras
from bert4keras.snippets import is_string, string_matching
from bert4keras.snippets import is_one_of, insert_arguments
from bert4keras.backend import piecewise_linear
import re
class Adam(keras.optimizers.Optimizer):
"""重新定义Adam优化器,便于派生出新的优化器
(tensorflow的optimizer_v2类)
"""
def __init__(
self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
bias_correction=True,
**kwargs
):
kwargs['name'] = kwargs.get('name') or 'Adam'
super(Adam, self).__init__(**kwargs)
self._set_hyper('learning_rate', learning_rate)
self._set_hyper('beta_1', beta_1)
self._set_hyper('beta_2', beta_2)
self.epsilon = epsilon or K.epsilon()
self.bias_correction = bias_correction
def _create_slots(self, var_list):
for var in var_list:
self.add_slot(var, 'm')
self.add_slot(var, 'v')
def _resource_apply(self, grad, var, indices=None):
# prepare variables
var_dtype = var.dtype.base_dtype
lr_t = self._decayed_lr(var_dtype)
m = self.get_slot(var, 'm')
v = self.get_slot(var, 'v')
beta_1_t = self._get_hyper('beta_1', var_dtype)
beta_2_t = self._get_hyper('beta_2', var_dtype)
epsilon_t = K.cast(self.epsilon, var_dtype)
local_step = K.cast(self.iterations + 1, var_dtype)
beta_1_t_power = K.pow(beta_1_t, local_step)
beta_2_t_power = K.pow(beta_2_t, local_step)
# update rules
if indices is None:
m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2)
else:
mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)]
with tf.control_dependencies(mv_ops):
m_t = self._resource_scatter_add(
m, indices, (1 - beta_1_t) * grad
)
v_t = self._resource_scatter_add(
v, indices, (1 - beta_2_t) * grad**2
)
# return the update op
with tf.control_dependencies([m_t, v_t]):
if self.bias_correction:
m_t = m_t / (1.0 - beta_1_t_power)
v_t = v_t / (1.0 - beta_2_t_power)
var_t = var - lr_t * m_t / (K.sqrt(v_t) + epsilon_t)  # use the dtype-cast epsilon
return K.update(var, var_t)
def _resource_apply_dense(self, grad, var):
return self._resource_apply(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
return self._resource_apply(grad, var, indices)
def get_config(self):
config = {
'learning_rate': self._serialize_hyperparameter('learning_rate'),
'beta_1': self._serialize_hyperparameter('beta_1'),
'beta_2': self._serialize_hyperparameter('beta_2'),
'epsilon': self.epsilon,
'bias_correction': self.bias_correction,
}
base_config = super(Adam, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
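# Usage sketch: being an optimizer_v2 subclass, this Adam can be passed to
# compile() like any Keras optimizer (model and loss are placeholders):
#
#     model.compile(
#         loss='sparse_categorical_crossentropy',
#         optimizer=Adam(learning_rate=2e-5),
#     )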
class AdaFactorBase(keras.optimizers.Optimizer):
"""AdaFactor优化器(基类)
论文链接:https://arxiv.org/abs/1804.04235
参考实现:https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/optimize.py
"""
def __init__(
self,
learning_rate=1e-3,  # may be None
beta1=0.0,
beta2=None,
epsilon1=1e-30,
epsilon2=1e-3,
multiply_by_parameter_scale=True,
clipping_threshold=1.0,
min_dim_size_to_factor=128,
**kwargs
):
super(AdaFactorBase, self).__init__(**kwargs)
self._learning_rate = learning_rate
self.beta1 = beta1
self._beta2 = beta2
self.epsilon1 = epsilon1
self.epsilon2 = epsilon2
self.multiply_by_parameter_scale = multiply_by_parameter_scale
self.clipping_threshold = clipping_threshold
self.min_dim_size_to_factor = min_dim_size_to_factor
@property
def learning_rate(self):
if self._learning_rate is None:
iterations = K.cast(self.iterations + 1, K.floatx())
learning_rate = K.minimum(1.0 / K.sqrt(iterations), 0.01)
if self.multiply_by_parameter_scale:
return learning_rate
else:
return learning_rate * 0.05
else:
# Name mangling turns self.__learning_rate into a mangled attribute, so
# hasattr(self, '__learning_rate') was always False and a fresh variable
# was created on every access; cache under an unmangled name instead.
if not hasattr(self, '_lr_variable'):
with K.name_scope(self.__class__.__name__):
self._lr_variable = K.variable(
self._learning_rate, name='learning_rate'
)
return self._lr_variable
@property
def beta2(self):
if self._beta2 is None:
iterations = K.cast(self.iterations + 1, K.floatx())
return 1.0 - K.pow(iterations, -0.8)
else:
return self._beta2
def factored_shape(self, shape):
if len(shape) < 2:
return None
shape = np.array(shape)
indices = shape.argpartition(-2)
if shape[indices[-2]] < self.min_dim_size_to_factor:
return None
shape1, shape2 = np.array(shape), np.array(shape)
shape1[indices[-1]] = 1
shape2[indices[-2]] = 1
return shape1, indices[-1], shape2, indices[-2]
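# Sketch of the factoring decision (illustrative shapes, `opt` is any
# AdaFactorBase subclass instance): a matrix whose second-largest dimension
# is at least min_dim_size_to_factor stores row/column second-moment factors
# instead of a full matrix:
#
#     opt.factored_shape([768, 3072])
#     # -> (array([768, 1]), 1, array([1, 3072]), 0): a (768, 1) factor
#     #    averaged over axis 1 and a (1, 3072) factor averaged over axis 0
#     opt.factored_shape([64, 64])   # second-largest dim below the threshold
#     # -> None (a full second-moment slot is kept)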
def get_config(self):
config = {
'learning_rate': self._learning_rate,
'beta1': self.beta1,
'beta2': self._beta2,
'epsilon1': self.epsilon1,
'epsilon2': self.epsilon2,
'multiply_by_parameter_scale': self.multiply_by_parameter_scale,
'clipping_threshold': self.clipping_threshold,
'min_dim_size_to_factor': self.min_dim_size_to_factor,
}
base_config = super(AdaFactorBase, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
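# Illustration (not part of the library): factored_shape decides whether a
# variable's second-moment accumulator is factored. For a (768, 3072) kernel it
# yields a (768, 1) row accumulator (averaging over axis 1) and a (1, 3072)
# column accumulator (averaging over axis 0); for a shape whose second-largest
# dimension is below min_dim_size_to_factor (e.g. a (768,) bias) it returns
# None and a full-size accumulator is used instead. AdaFactor is the alias
# defined at the end of this module:
#
#     shape1, axis1, shape2, axis2 = AdaFactor().factored_shape((768, 3072))
#     # shape1 == [768, 1], axis1 == 1, shape2 == [1, 3072], axis2 == 0
#     assert AdaFactor().factored_shape((768,)) is None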
class AdaFactorV1(AdaFactorBase):
"""AdaFactor优化器(纯Keras版)
论文链接:https://arxiv.org/abs/1804.04235
参考实现:https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/optimize.py
"""
def __init__(self, *args, **kwargs):
super(AdaFactorV1, self).__init__(*args, **kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
@K.symbolic
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
self.weights = [self.iterations]
lr = self.learning_rate
        for i, (p, g) in enumerate(zip(params, grads)):
            g2 = K.square(g) + self.epsilon1
            shape, dtype = K.int_shape(p), K.dtype(p)
            factored_shape = self.factored_shape(shape)
            if factored_shape is None:
                # Define the slot variable
                v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
                self.weights.append(v)
                # Define the update
                v_t = self.beta2 * v + (1.0 - self.beta2) * g2
                self.updates.append(K.update(v, v_t))
            else:
                # Define the slot variables
                shape1, axis1, shape2, axis2 = factored_shape
                vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
                vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
                self.weights.extend([vr, vc])
                # Define the updates (including the (1 - beta2) factor, as in
                # the non-factored branch and the paper)
                vr_t = self.beta2 * vr + (1.0 - self.beta2) * K.mean(
                    g2, axis=axis1, keepdims=True
                )
                vc_t = self.beta2 * vc + (1.0 - self.beta2) * K.mean(
                    g2, axis=axis2, keepdims=True
                )
                self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
                # Reconstruct the full second-moment estimate from the factors
                v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
            # Core update term
            u = g / K.sqrt(v_t)
            # Update clipping; RMS(u) = sqrt(mean(u^2)), as in the paper
            if self.clipping_threshold is not None:
                u_rms = K.sqrt(K.mean(K.square(u)))
                d = self.clipping_threshold
                u = u / K.maximum(1.0, u_rms / d)
            # Moving average of the update
            if self.beta1 > 0.0:
                # Define the slot variable
                m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
                self.weights.append(m)
                # Define the update
                m_t = self.beta1 * m + (1.0 - self.beta1) * u
                self.updates.append(K.update(m, m_t))
                u = m_t
            # Scale the update by the parameter scale, RMS(p)
            if self.multiply_by_parameter_scale:
                u = u * K.maximum(K.sqrt(K.mean(K.square(p))), self.epsilon2)
            # Apply the update
            self.updates.append(K.update(p, p - lr * u))
        return self.updates
class AdaFactorV2(AdaFactorBase):
"""AdaFactor优化器(tf.keras版)
论文链接:https://arxiv.org/abs/1804.04235
参考实现:https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/optimize.py
"""
def __init__(self, *args, **kwargs):
kwargs['name'] = kwargs.get('name') or 'AdaFactor'
super(AdaFactorV2, self).__init__(*args, **kwargs)
def _create_slots(self, var_list):
for var in var_list:
if self.beta1 > 0.0:
self.add_slot(var, 'm')
shape = K.int_shape(var)
factored_shape = self.factored_shape(shape)
if factored_shape is None:
self.add_slot(var, 'v')
else:
shape1, axis1, shape2, axis2 = factored_shape
value1, value2 = np.zeros(shape1), np.zeros(shape2)
self.add_slot(var, 'vr', value1)
self.add_slot(var, 'vc', value2)
def _resource_apply(self, grad, var, indices=None):
lr = self.learning_rate
g2 = K.square(grad) + self.epsilon1
shape = K.int_shape(var)
factored_shape = self.factored_shape(shape)
        if factored_shape is None:
            v = self.get_slot(var, 'v')
            # Define the update
            v_t = self.beta2 * v + (1.0 - self.beta2) * g2
            v_t = K.update(v, v_t)
        else:
            shape1, axis1, shape2, axis2 = factored_shape
            vr = self.get_slot(var, 'vr')
            vc = self.get_slot(var, 'vc')
            # Define the updates (including the (1 - beta2) factor, as in
            # the non-factored branch and the paper)
            vr_t = self.beta2 * vr + (1.0 - self.beta2) * K.mean(
                g2, axis=axis1, keepdims=True
            )
            vc_t = self.beta2 * vc + (1.0 - self.beta2) * K.mean(
                g2, axis=axis2, keepdims=True
            )
            vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t)
            # Reconstruct the full second-moment estimate from the factors
            v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
        # Core update term
        u = grad / K.sqrt(v_t)
        # Update clipping; RMS(u) = sqrt(mean(u^2)), as in the paper
        if self.clipping_threshold is not None:
            u_rms = K.sqrt(K.mean(K.square(u)))
            d = self.clipping_threshold
            u = u / K.maximum(1.0, u_rms / d)
        # Moving average of the update
        if self.beta1 > 0.0:
            m = self.get_slot(var, 'm')
            # Define the update
            m_t = self.beta1 * m + (1.0 - self.beta1) * u
            u = K.update(m, m_t)
        # Scale the update by the parameter scale, RMS(var)
        if self.multiply_by_parameter_scale:
            u = u * K.maximum(K.sqrt(K.mean(K.square(var))), self.epsilon2)
        # Apply the update
        return K.update(var, var - lr * u)
def _resource_apply_dense(self, grad, var):
return self._resource_apply(grad, var)
def _resource_apply_sparse(self, grad, var, indices):
grad = tf.IndexedSlices(grad, indices, K.shape(var))
grad = tf.convert_to_tensor(grad)
return self._resource_apply_dense(grad, var)
def export_to_custom_objects(base_extend_with):
"""装饰器,用来将优化器放到custom_objects中
"""
def new_extend_with(BaseOptimizer, name=None):
NewOptimizer = base_extend_with(BaseOptimizer)
if is_string(name):
NewOptimizer.__name__ = name
name = NewOptimizer.__name__
keras.utils.get_custom_objects()[name] = NewOptimizer
return NewOptimizer
return new_extend_with
@export_to_custom_objects
def extend_with_weight_decay(BaseOptimizer):
"""返回新的优化器类,加入权重衰减
"""
class NewOptimizer(BaseOptimizer):
"""带有权重衰减的优化器
"""
@insert_arguments(weight_decay_rate=0.01, exclude_from_weight_decay=[])
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
if not hasattr(self, 'learning_rate'):
self.learning_rate = self.lr
@K.symbolic
def get_updates(self, loss, params):
old_update = K.update
def new_update(x, new_x):
if is_one_of(x, params) and self._do_weight_decay(x):
new_x = new_x - self.learning_rate * self.weight_decay_rate * x
return old_update(x, new_x)
K.update = new_update
updates = super(NewOptimizer, self).get_updates(loss, params)
K.update = old_update
return updates
def _do_weight_decay(self, w):
return (not string_matching(w.name, self.exclude_from_weight_decay))
def get_config(self):
config = {
'weight_decay_rate': self.weight_decay_rate,
'exclude_from_weight_decay': self.exclude_from_weight_decay,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
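# Usage sketch (illustrative): build an AdamW-style optimizer. Each updated
# parameter is additionally decayed by learning_rate * weight_decay_rate per
# step, except parameters whose names match a pattern in
# exclude_from_weight_decay (matched with re.search via string_matching):
#
#     AdamW = extend_with_weight_decay(Adam, 'AdamW')
#     optimizer = AdamW(learning_rate=1e-5, weight_decay_rate=0.01,
#                       exclude_from_weight_decay=['Norm', 'bias'])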
@export_to_custom_objects
def extend_with_weight_decay_v2(BaseOptimizer):
"""返回新的优化器类,加入权重衰减
"""
class NewOptimizer(BaseOptimizer):
"""带有权重衰减的优化器
"""
@insert_arguments(weight_decay_rate=0.01, exclude_from_weight_decay=[])
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _resource_apply(self, grad, var, indices=None):
old_update = K.update
def new_update(x, new_x):
if x is var and self._do_weight_decay(x):
lr_t = self._decayed_lr(x.dtype.base_dtype)
new_x = new_x - lr_t * self.weight_decay_rate * x
return old_update(x, new_x)
K.update = new_update
op = super(NewOptimizer, self)._resource_apply(grad, var, indices)
K.update = old_update
return op
def _do_weight_decay(self, w):
return (not string_matching(w.name, self.exclude_from_weight_decay))
def get_config(self):
config = {
'weight_decay_rate': self.weight_decay_rate,
'exclude_from_weight_decay': self.exclude_from_weight_decay,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_layer_adaptation(BaseOptimizer):
"""返回新的优化器类,加入层自适应学习率
"""
class NewOptimizer(BaseOptimizer):
"""带有层自适应学习率的优化器
用每一层参数的模长来校正当前参数的学习率
https://arxiv.org/abs/1904.00962
"""
@insert_arguments(exclude_from_layer_adaptation=[])
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
if not hasattr(self, 'learning_rate'):
self.learning_rate = self.lr
@K.symbolic
def get_updates(self, loss, params):
old_update = K.update
def new_update(x, new_x):
if is_one_of(x, params) and self._do_layer_adaptation(x):
dx = new_x - x
lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10)
x_norm = tf.norm(x)
g_norm = tf.norm(dx / lr_t)
ratio = K.switch(
x_norm > 0.0,
K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0),
1.0
)
new_x = x + dx * ratio
return old_update(x, new_x)
K.update = new_update
updates = super(NewOptimizer, self).get_updates(loss, params)
K.update = old_update
return updates
def _do_layer_adaptation(self, w):
return (
not string_matching(w.name, self.exclude_from_layer_adaptation)
)
def get_config(self):
config = {
'exclude_from_layer_adaptation':
self.exclude_from_layer_adaptation,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_layer_adaptation_v2(BaseOptimizer):
"""返回新的优化器类,加入层自适应学习率
"""
class NewOptimizer(BaseOptimizer):
"""带有层自适应学习率的优化器
用每一层参数的模长来校正当前参数的学习率
https://arxiv.org/abs/1904.00962
"""
@insert_arguments(exclude_from_layer_adaptation=[])
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _resource_apply(self, grad, var, indices=None):
old_update = K.update
def new_update(x, new_x):
if x is var and self._do_layer_adaptation(x):
dx = new_x - x
lr_t = self._decayed_lr(x.dtype.base_dtype)
lr_t = K.clip(lr_t, K.epsilon(), 1e10)
x_norm = tf.norm(x)
g_norm = tf.norm(dx / lr_t)
ratio = K.switch(
x_norm > 0.0,
K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0),
1.0
)
new_x = x + dx * ratio
return old_update(x, new_x)
K.update = new_update
op = super(NewOptimizer, self)._resource_apply(grad, var, indices)
K.update = old_update
return op
def _do_layer_adaptation(self, w):
return (
not string_matching(w.name, self.exclude_from_layer_adaptation)
)
def get_config(self):
config = {
'exclude_from_layer_adaptation':
self.exclude_from_layer_adaptation,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_piecewise_linear_lr(BaseOptimizer):
"""返回新的优化器类,加入分段线性学习率
"""
class NewOptimizer(BaseOptimizer):
"""带有分段线性学习率的优化器
其中schedule是形如{1000: 1, 2000: 0.1}的字典,
表示0~1000步内学习率线性地从零增加到100%,然后
1000~2000步内线性地降到10%,2000步以后保持10%
"""
@insert_arguments(lr_schedule={0: 1})
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
self.lr_schedule = {int(i): j for i, j in self.lr_schedule.items()}
@K.symbolic
def get_updates(self, loss, params):
lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)
old_update = K.update
def new_update(x, new_x):
if is_one_of(x, params):
new_x = x + (new_x - x) * lr_multiplier
return old_update(x, new_x)
K.update = new_update
updates = super(NewOptimizer, self).get_updates(loss, params)
K.update = old_update
return updates
def get_config(self):
config = {
'lr_schedule': self.lr_schedule,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
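# Usage sketch (illustrative): warmup followed by decay. The multiplier rises
# linearly from 0 to 1 over the first 1000 steps, falls linearly to 0.1 by
# step 2000, and stays at 0.1 afterwards:
#
#     AdamLR = extend_with_piecewise_linear_lr(Adam, 'AdamLR')
#     optimizer = AdamLR(learning_rate=1e-4, lr_schedule={1000: 1, 2000: 0.1})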
@export_to_custom_objects
def extend_with_piecewise_linear_lr_v2(BaseOptimizer):
"""返回新的优化器类,加入分段线性学习率
"""
class NewOptimizer(BaseOptimizer):
"""带有分段线性学习率的优化器
其中schedule是形如{1000: 1, 2000: 0.1}的字典,
表示0~1000步内学习率线性地从零增加到100%,然后
1000~2000步内线性地降到10%,2000步以后保持10%
"""
@insert_arguments(lr_schedule={0: 1})
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
self.lr_schedule = {int(i): j for i, j in self.lr_schedule.items()}
def _decayed_lr(self, var_dtype):
lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)
lr_t = super(NewOptimizer, self)._decayed_lr(var_dtype)
return lr_t * K.cast(lr_multiplier, var_dtype)
def get_config(self):
config = {
'lr_schedule': self.lr_schedule,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_gradient_accumulation(BaseOptimizer):
"""返回新的优化器类,加入梯度累积
"""
class NewOptimizer(BaseOptimizer):
"""带有梯度累积的优化器
"""
@insert_arguments(grad_accum_steps=2)
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
self._first_get_gradients = True
def get_gradients(self, loss, params):
if self._first_get_gradients:
self._first_get_gradients = False
return super(NewOptimizer, self).get_gradients(loss, params)
else:
return [ag / self.grad_accum_steps for ag in self.accum_grads]
@K.symbolic
def get_updates(self, loss, params):
            # Criterion for applying the update
cond = K.equal(self.iterations % self.grad_accum_steps, 0)
cond = K.cast(cond, K.floatx())
            # Fetch the gradients
grads = self.get_gradients(loss, params)
self.accum_grads = [
K.zeros(
K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i
) for i, p in enumerate(params)
]
old_update = K.update
def new_update(x, new_x):
new_x = cond * new_x + (1 - cond) * x
return old_update(x, new_x)
K.update = new_update
updates = super(NewOptimizer, self).get_updates(loss, params)
K.update = old_update
            # Accumulate the gradients
with tf.control_dependencies(updates):
accum_updates = [
K.update(ag, g + (1 - cond) * ag)
for g, ag in zip(grads, self.accum_grads)
]
return accum_updates
def get_config(self):
config = {
'grad_accum_steps': self.grad_accum_steps,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
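# Usage sketch (illustrative): with grad_accum_steps=4, gradients are summed
# into the accum_grads buffers and the wrapped optimizer only applies an update
# on every 4th step, simulating a 4x larger batch size:
#
#     AdamAcc = extend_with_gradient_accumulation(Adam, 'AdamAcc')
#     optimizer = AdamAcc(learning_rate=1e-5, grad_accum_steps=4)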
@export_to_custom_objects
def extend_with_gradient_accumulation_v2(BaseOptimizer):
"""返回新的优化器类,加入梯度累积
"""
class NewOptimizer(BaseOptimizer):
"""带有梯度累积的优化器
"""
@insert_arguments(grad_accum_steps=2)
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _create_slots(self, var_list):
super(NewOptimizer, self)._create_slots(var_list)
for var in var_list:
self.add_slot(var, 'ag')
def _resource_apply(self, grad, var, indices=None):
            # Criterion for applying the update
cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Fetch the gradient accumulator
ag = self.get_slot(var, 'ag')
old_update = K.update
def new_update(x, new_x):
new_x = K.switch(cond, new_x, x)
return old_update(x, new_x)
K.update = new_update
ag_t = ag / self.grad_accum_steps
op = super(NewOptimizer, self)._resource_apply(ag_t, var)
K.update = old_update
            # Accumulate the gradient
with tf.control_dependencies([op]):
ag_t = K.switch(cond, K.zeros_like(ag), ag)
with tf.control_dependencies([K.update(ag, ag_t)]):
if indices is None:
ag_t = K.update(ag, ag + grad)
else:
ag_t = self._resource_scatter_add(ag, indices, grad)
return ag_t
def get_config(self):
config = {
'grad_accum_steps': self.grad_accum_steps,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_lookahead(BaseOptimizer):
"""返回新的优化器类,加入look ahead
"""
class NewOptimizer(BaseOptimizer):
"""带有look ahead的优化器
https://arxiv.org/abs/1907.08610
steps_per_slow_update: 即论文中的k;
slow_step_size: 即论文中的alpha。
"""
@insert_arguments(steps_per_slow_update=5, slow_step_size=0.5)
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
@K.symbolic
def get_updates(self, loss, params):
updates = super(NewOptimizer, self).get_updates(loss, params)
k, alpha = self.steps_per_slow_update, self.slow_step_size
cond = K.equal(self.iterations % k, 0)
slow_vars = [
K.zeros(
K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i
) for i, p in enumerate(params)
]
with tf.control_dependencies(updates):
slow_updates = [
K.update(q, K.switch(cond, q + alpha * (p - q), q))
for p, q in zip(params, slow_vars)
]
with tf.control_dependencies(slow_updates):
copy_updates = [
K.update(p, K.switch(cond, q, p))
for p, q in zip(params, slow_vars)
]
return copy_updates
def get_config(self):
config = {
'steps_per_slow_update': self.steps_per_slow_update,
'slow_step_size': self.slow_step_size,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
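# Usage sketch (illustrative): every k = steps_per_slow_update steps, the slow
# weights move by a fraction alpha = slow_step_size toward the fast weights and
# the fast weights are reset to the slow ones, as in the Lookahead paper:
#
#     AdamLA = extend_with_lookahead(Adam, 'AdamLA')
#     optimizer = AdamLA(learning_rate=1e-5,
#                        steps_per_slow_update=5, slow_step_size=0.5)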
@export_to_custom_objects
def extend_with_lookahead_v2(BaseOptimizer):
"""返回新的优化器类,加入look ahead
"""
class NewOptimizer(BaseOptimizer):
"""带有look ahead的优化器
https://arxiv.org/abs/1907.08610
steps_per_slow_update: 即论文中的k;
slow_step_size: 即论文中的alpha。
"""
@insert_arguments(steps_per_slow_update=5, slow_step_size=0.5)
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _create_slots(self, var_list):
super(NewOptimizer, self)._create_slots(var_list)
for var in var_list:
self.add_slot(var, 'slow_var')
def _resource_apply(self, grad, var, indices=None):
op = super(NewOptimizer, self)._resource_apply(grad, var, indices)
k, alpha = self.steps_per_slow_update, self.slow_step_size
cond = K.equal(self.iterations % k, 0)
slow_var = self.get_slot(var, 'slow_var')
slow_var_t = slow_var + alpha * (var - slow_var)
with tf.control_dependencies([op]):
slow_update = K.update(
slow_var, K.switch(cond, slow_var_t, slow_var)
)
with tf.control_dependencies([slow_update]):
copy_update = K.update(var, K.switch(cond, slow_var, var))
return copy_update
def get_config(self):
config = {
'steps_per_slow_update': self.steps_per_slow_update,
'slow_step_size': self.slow_step_size,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_lazy_optimization(BaseOptimizer):
"""返回新的优化器类,加入懒惰更新
"""
class NewOptimizer(BaseOptimizer):
"""带有懒惰更新的优化器
使得部分权重(尤其是embedding)只有在梯度不等于0时
才发生更新。
"""
@insert_arguments(include_in_lazy_optimization=[])
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
self._first_get_gradients = True
def get_gradients(self, loss, params):
if self._first_get_gradients:
self._first_get_gradients = False
return super(NewOptimizer, self).get_gradients(loss, params)
else:
return [self.grads[p] for p in params]
@K.symbolic
def get_updates(self, loss, params):
self.grads = dict(zip(params, self.get_gradients(loss, params)))
old_update = K.update
def new_update(x, new_x):
if is_one_of(x, params) and self._do_lazy_optimization(x):
g = self.grads[x]
r = K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True)
new_x = x + (new_x - x) * K.cast(r, K.floatx())
return old_update(x, new_x)
K.update = new_update
updates = super(NewOptimizer, self).get_updates(loss, params)
K.update = old_update
return updates
def _do_lazy_optimization(self, w):
return string_matching(w.name, self.include_in_lazy_optimization)
def get_config(self):
config = {
'include_in_lazy_optimization':
self.include_in_lazy_optimization,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_lazy_optimization_v2(BaseOptimizer):
"""返回新的优化器类,加入懒惰更新
"""
class NewOptimizer(BaseOptimizer):
"""带有懒惰更新的优化器
使得部分权重(尤其是embedding)只有在梯度不等于0时
才发生更新。
"""
@insert_arguments(include_in_lazy_optimization=[])
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _resource_apply(self, grad, var, indices=None):
old_update = K.update
def new_update(x, new_x):
if x is var and self._do_lazy_optimization(x):
if indices is None:
r = K.any(
K.not_equal(grad, 0.0), axis=-1, keepdims=True
)
new_x = x + (new_x - x) * K.cast(r, K.floatx())
return old_update(x, new_x)
else:
return self._resource_scatter_add(
x, indices, K.gather(new_x - x, indices)
)
return old_update(x, new_x)
K.update = new_update
op = super(NewOptimizer, self)._resource_apply(grad, var, indices)
K.update = old_update
return op
def _do_lazy_optimization(self, w):
return string_matching(w.name, self.include_in_lazy_optimization)
def get_config(self):
config = {
'include_in_lazy_optimization':
self.include_in_lazy_optimization,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_exponential_moving_average(BaseOptimizer):
"""返回新的优化器类,加入EMA(权重滑动平均)
"""
class NewOptimizer(BaseOptimizer):
"""带EMA(权重滑动平均)的优化器
"""
@insert_arguments(ema_momentum=0.999)
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def get_updates(self, loss, params):
updates = super(NewOptimizer, self).get_updates(loss, params)
self.model_weights = params
self.ema_weights = [K.zeros(K.shape(w)) for w in params]
self.old_weights = K.batch_get_value(params)
ema_updates, ema_momentum = [], self.ema_momentum
with tf.control_dependencies(updates):
for w1, w2 in zip(self.ema_weights, params):
new_w = ema_momentum * w1 + (1 - ema_momentum) * w2
ema_updates.append(K.update(w1, new_w))
return ema_updates
def get_config(self):
config = {
'ema_momentum': self.ema_momentum,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def apply_ema_weights(self, bias_correction=True):
"""备份原模型权重,然后将平均权重应用到模型上去。
"""
self.old_weights = K.batch_get_value(self.model_weights)
ema_weights = K.batch_get_value(self.ema_weights)
if bias_correction:
iterations = K.eval(self.iterations)
scale = 1.0 - np.power(self.ema_momentum, iterations)
ema_weights = [weight / scale for weight in ema_weights]
K.batch_set_value(zip(self.model_weights, ema_weights))
def reset_old_weights(self):
"""恢复模型到旧权重。
"""
K.batch_set_value(zip(self.model_weights, self.old_weights))
return NewOptimizer
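# Usage sketch (illustrative): train as usual, then swap in the averaged
# weights for evaluation and swap back before resuming training:
#
#     AdamEMA = extend_with_exponential_moving_average(Adam, 'AdamEMA')
#     optimizer = AdamEMA(learning_rate=1e-5, ema_momentum=0.999)
#     # ... compile the model with `optimizer` and train ...
#     optimizer.apply_ema_weights()  # evaluate with the EMA weights
#     optimizer.reset_old_weights()  # restore the raw weights to keep training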
@export_to_custom_objects
def extend_with_exponential_moving_average_v2(BaseOptimizer):
"""返回新的优化器类,加入EMA(权重滑动平均)
"""
class NewOptimizer(BaseOptimizer):
"""带EMA(权重滑动平均)的优化器
"""
@insert_arguments(ema_momentum=0.999)
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _create_slots(self, var_list):
super(NewOptimizer, self)._create_slots(var_list)
self.model_weights = var_list
self.ema_weights = []
for var in var_list:
self.ema_weights.append(self.add_slot(var, 'ema'))
def _resource_apply_dense(self, grad, var):
op = super(NewOptimizer, self)._resource_apply_dense(grad, var)
ema = self.get_slot(var, 'ema')
ema_momentum = self.ema_momentum
with tf.control_dependencies([op]):
return K.update(
ema, ema * ema_momentum + var * (1.0 - ema_momentum)
)
def _resource_apply_sparse(self, grad, var, indices):
op = super(NewOptimizer,
self)._resource_apply_sparse(grad, var, indices)
ema = self.get_slot(var, 'ema')
ema_momentum = self.ema_momentum
with tf.control_dependencies([op]):
return K.update(
ema, ema * ema_momentum + var * (1.0 - ema_momentum)
)
def get_config(self):
config = {
'ema_momentum': self.ema_momentum,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def apply_ema_weights(self, bias_correction=True):
"""备份原模型权重,然后将平均权重应用到模型上去。
"""
self.old_weights = K.batch_get_value(self.model_weights)
ema_weights = K.batch_get_value(self.ema_weights)
if bias_correction:
iterations = K.eval(self.iterations)
scale = 1.0 - np.power(self.ema_momentum, iterations)
ema_weights = [weight / scale for weight in ema_weights]
K.batch_set_value(zip(self.model_weights, ema_weights))
def reset_old_weights(self):
"""恢复模型到旧权重。
"""
K.batch_set_value(zip(self.model_weights, self.old_weights))
return NewOptimizer
@export_to_custom_objects
def extend_with_parameter_wise_lr(BaseOptimizer):
"""返回新的优化器类,加入分参数学习率
主要场景就是给每层甚至每个参数设置不同的学习率。
"""
class NewOptimizer(BaseOptimizer):
"""带有分参数学习率的优化器
其中schedule是形如{name1: 2, name2: 0.1}的字典,
其实name1、name2是字符串,表示变量名包含name1的
参数学习率乘以2,变量名包含name2的参数学习率要
乘以0.1。
"""
@insert_arguments(paramwise_lr_schedule={})
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
@K.symbolic
def get_updates(self, loss, params):
old_update = K.update
def new_update(x, new_x):
if is_one_of(x, params):
lr_multiplier = 1
for k, v in self.paramwise_lr_schedule.items():
if k in x.name:
lr_multiplier *= v
if lr_multiplier != 1:
new_x = x + (new_x - x) * lr_multiplier
return old_update(x, new_x)
K.update = new_update
updates = super(NewOptimizer, self).get_updates(loss, params)
K.update = old_update
return updates
def get_config(self):
config = {
'paramwise_lr_schedule': self.paramwise_lr_schedule,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
@export_to_custom_objects
def extend_with_parameter_wise_lr_v2(BaseOptimizer):
"""返回新的优化器类,加入分参数学习率
主要场景就是给每层甚至每个参数设置不同的学习率。
"""
class NewOptimizer(BaseOptimizer):
"""带有分参数学习率的优化器
其中schedule是形如{name1: 2, name2: 0.1}的字典,
其实name1、name2是字符串,表示变量名包含name1的
参数学习率乘以2,变量名包含name2的参数学习率要
乘以0.1。
"""
@insert_arguments(paramwise_lr_schedule={})
def __init__(self, *args, **kwargs):
super(NewOptimizer, self).__init__(*args, **kwargs)
def _resource_apply(self, grad, var, indices=None):
old_update = K.update
def new_update(x, new_x):
if x is var:
lr_multiplier = 1
for k, v in self.paramwise_lr_schedule.items():
if k in x.name:
lr_multiplier *= v
if lr_multiplier != 1:
new_x = x + (new_x - x) * lr_multiplier
return old_update(x, new_x)
K.update = new_update
op = super(NewOptimizer, self)._resource_apply(grad, var, indices)
K.update = old_update
return op
def get_config(self):
config = {
'paramwise_lr_schedule': self.paramwise_lr_schedule,
}
base_config = super(NewOptimizer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
return NewOptimizer
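# Usage sketch (illustrative): layer-wise learning-rate multipliers. Any
# variable whose name contains a key of paramwise_lr_schedule has its update
# scaled by the corresponding value (the layer names below are hypothetical;
# use the variable names of your own model):
#
#     AdamPLR = extend_with_parameter_wise_lr(Adam, 'AdamPLR')
#     optimizer = AdamPLR(learning_rate=1e-4,
#                         paramwise_lr_schedule={'Transformer-0': 0.1,
#                                                'Embedding': 0.1})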
if is_tf_keras:
extend_with_weight_decay = extend_with_weight_decay_v2
extend_with_layer_adaptation = extend_with_layer_adaptation_v2
extend_with_piecewise_linear_lr = extend_with_piecewise_linear_lr_v2
extend_with_gradient_accumulation = extend_with_gradient_accumulation_v2
extend_with_lookahead = extend_with_lookahead_v2
extend_with_lazy_optimization = extend_with_lazy_optimization_v2
extend_with_exponential_moving_average = extend_with_exponential_moving_average_v2
extend_with_parameter_wise_lr = extend_with_parameter_wise_lr_v2
AdaFactor = AdaFactorV2
else:
Adam = keras.optimizers.Adam
AdaFactor = AdaFactorV1
AdaFactor.__name__ = 'AdaFactor'
custom_objects = {
'Adam': Adam,
'AdaFactor': AdaFactor,
}
keras.utils.get_custom_objects().update(custom_objects)
#! -*- coding: utf-8 -*-
# Utility snippets
import os, sys, six, re, json
import logging
import numpy as np
from collections import defaultdict
from bert4keras.backend import K, keras, tf
_open_ = open
is_py2 = six.PY2
if not is_py2:
    basestring = str
    unichr = chr  # unichr does not exist in Python 3 (used by strQ2B below)
def to_array(*args):
"""批量转numpy的array
"""
results = [np.array(a) for a in args]
if len(args) == 1:
return results[0]
else:
return results
def is_string(s):
"""判断是否是字符串
"""
return isinstance(s, basestring)
def strQ2B(ustring):
"""全角符号转对应的半角符号
"""
rstring = ''
for uchar in ustring:
inside_code = ord(uchar)
        # A full-width space is converted directly
if inside_code == 12288:
inside_code = 32
        # Other full-width characters (except space) are converted by offset
elif (inside_code >= 65281 and inside_code <= 65374):
inside_code -= 65248
rstring += unichr(inside_code)
return rstring
def string_matching(s, keywords):
"""判断s是否至少包含keywords中的至少一个字符串
"""
for k in keywords:
if re.search(k, s):
return True
return False
def convert_to_unicode(text, encoding='utf-8', errors='ignore'):
"""字符串转换为unicode格式(假设输入为utf-8格式)
"""
if is_py2:
if isinstance(text, str):
text = text.decode(encoding, errors=errors)
else:
if isinstance(text, bytes):
text = text.decode(encoding, errors=errors)
return text
def convert_to_str(text, encoding='utf-8', errors='ignore'):
"""字符串转换为str格式(假设输入为utf-8格式)
"""
if is_py2:
if isinstance(text, unicode):
text = text.encode(encoding, errors=errors)
else:
if isinstance(text, bytes):
text = text.decode(encoding, errors=errors)
return text
class open:
"""模仿python自带的open函数
作用:1.主要是为了同时兼容py2和py3;2.增加了索引功能,方便读取大文件。
"""
def __init__(
self, name, mode='r', encoding=None, errors='strict', indexable=False
):
self.name = name
if is_py2:
self.file = _open_(name, mode)
else:
self.file = _open_(name, mode, encoding=encoding, errors=errors)
self.encoding = encoding
self.errors = errors
self.iterator = None
if indexable:
if is_string(indexable) and os.path.exists(indexable):
self.offsets = json.load(_open_(indexable))
else:
self.create_indexes()
if is_string(indexable):
json.dump(self.offsets, _open_(indexable, 'w'))
def create_indexes(self):
print('creating indexes ...')
self.offsets, offset = [], 0
pbar = keras.utils.Progbar(os.path.getsize(self.name))
while self.readline():
self.offsets.append(offset)
offset = self.tell()
pbar.update(offset)
self.seek(0)
print('indexes created.')
def __getitem__(self, key):
self.seek(self.offsets[key])
l = self.readline()
if self.encoding:
l = convert_to_unicode(l, self.encoding, self.errors)
return l
def __len__(self):
return len(self.offsets)
def __iter__(self):
if hasattr(self, 'offsets'):
for i in range(len(self)):
yield self[i]
else:
for l in self.file:
if self.encoding:
l = convert_to_unicode(l, self.encoding, self.errors)
yield l
def next(self):
if self.iterator is None:
self.iterator = self.__iter__()
return next(self.iterator)
def __next__(self):
return self.next()
def read(self):
text = self.file.read()
if self.encoding:
text = convert_to_unicode(text, self.encoding, self.errors)
return text
def readline(self):
text = self.file.readline()
if self.encoding:
text = convert_to_unicode(text, self.encoding, self.errors)
return text
def readlines(self):
if self.encoding:
return [
convert_to_unicode(text, self.encoding, self.errors)
for text in self.file.readlines()
]
else:
return self.file.readlines()
def write(self, text):
if self.encoding:
text = convert_to_str(text, self.encoding, self.errors)
self.file.write(text)
def flush(self):
self.file.flush()
def close(self):
self.file.close()
def tell(self):
return self.file.tell()
def seek(self, offset=0):
return self.file.seek(offset)
def __enter__(self):
return self
def __exit__(self, type, value, tb):
self.close()
def parallel_apply(
func,
iterable,
workers,
max_queue_size,
callback=None,
dummy=False,
random_seeds=True
):
"""多进程或多线程地将func应用到iterable的每个元素中。
注意这个apply是异步且无序的,也就是说依次输入a,b,c,但是
输出可能是func(c), func(a), func(b)。
参数:
callback: 处理单个输出的回调函数;
dummy: False是多进程/线性,True则是多线程/线性;
random_seeds: 每个进程的随机种子。
"""
if dummy:
from multiprocessing.dummy import Pool, Queue
else:
from multiprocessing import Pool, Queue
in_queue, out_queue, seed_queue = Queue(max_queue_size), Queue(), Queue()
if random_seeds is True:
random_seeds = [None] * workers
elif random_seeds is None or random_seeds is False:
random_seeds = []
for seed in random_seeds:
seed_queue.put(seed)
def worker_step(in_queue, out_queue):
"""单步函数包装成循环执行
"""
if not seed_queue.empty():
np.random.seed(seed_queue.get())
while True:
i, d = in_queue.get()
r = func(d)
out_queue.put((i, r))
    # Start the processes/threads
pool = Pool(workers, worker_step, (in_queue, out_queue))
if callback is None:
results = []
    # Post-processing function
def process_out_queue():
out_count = 0
for _ in range(out_queue.qsize()):
i, d = out_queue.get()
out_count += 1
if callback is None:
results.append((i, d))
else:
callback(d)
return out_count
    # Feed in the data and collect the results
in_count, out_count = 0, 0
for i, d in enumerate(iterable):
in_count += 1
while True:
try:
in_queue.put((i, d), block=False)
break
except six.moves.queue.Full:
out_count += process_out_queue()
if in_count % max_queue_size == 0:
out_count += process_out_queue()
while out_count != in_count:
out_count += process_out_queue()
pool.terminate()
if callback is None:
results = sorted(results, key=lambda r: r[0])
return [r[1] for r in results]
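# Usage sketch (illustrative): square numbers in 4 worker threads. Results come
# back in input order because they are re-sorted when callback is None. With
# dummy=False the function must be picklable, so a lambda only works on
# fork-based platforms; dummy=True (threads) avoids that restriction:
#
#     parallel_apply(lambda x: x ** 2, range(8), workers=4,
#                    max_queue_size=16, dummy=True)
#     # -> [0, 1, 4, 9, 16, 25, 36, 49]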
def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'):
"""Numpy函数,将序列padding到同一长度
"""
if length is None:
length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0)
elif not hasattr(length, '__getitem__'):
length = [length]
slices = [np.s_[:length[i]] for i in range(seq_dims)]
slices = tuple(slices) if len(slices) > 1 else slices[0]
pad_width = [(0, 0) for _ in np.shape(inputs[0])]
outputs = []
for x in inputs:
x = x[slices]
for i in range(seq_dims):
if mode == 'post':
pad_width[i] = (0, length[i] - np.shape(x)[i])
elif mode == 'pre':
pad_width[i] = (length[i] - np.shape(x)[i], 0)
else:
raise ValueError('"mode" argument must be "post" or "pre".')
x = np.pad(x, pad_width, 'constant', constant_values=value)
outputs.append(x)
return np.array(outputs)
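# Examples (illustrative):
#
#     >>> sequence_padding([[1, 2, 3], [4, 5]])
#     array([[1, 2, 3],
#            [4, 5, 0]])
#     >>> sequence_padding([[1, 2, 3], [4, 5]], length=4, mode='pre')
#     array([[0, 1, 2, 3],
#            [0, 0, 4, 5]])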
def truncate_sequences(maxlen, indices, *sequences):
"""截断总长度至不超过maxlen
"""
sequences = [s for s in sequences if s]
if not isinstance(indices, (list, tuple)):
indices = [indices] * len(sequences)
while True:
lengths = [len(s) for s in sequences]
if sum(lengths) > maxlen:
i = np.argmax(lengths)
sequences[i].pop(indices[i])
else:
return sequences
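# Example (illustrative): the longest sequence is trimmed first, popping at the
# given index (-1 = from the end), until the total length fits. Note that the
# input lists are modified in place:
#
#     >>> truncate_sequences(6, -1, [1, 2, 3, 4, 5], [6, 7, 8])
#     [[1, 2, 3], [6, 7, 8]]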
def text_segmentate(text, maxlen, seps='\n', strips=None):
"""将文本按照标点符号划分为若干个短句
"""
text = text.strip().strip(strips)
if seps and len(text) > maxlen:
pieces = text.split(seps[0])
text, texts = '', []
for i, p in enumerate(pieces):
if text and p and len(text) + len(p) > maxlen - 1:
texts.extend(text_segmentate(text, maxlen, seps[1:], strips))
text = ''
if i + 1 == len(pieces):
text = text + p
else:
text = text + p + seps[0]
if text:
texts.extend(text_segmentate(text, maxlen, seps[1:], strips))
return texts
else:
return [text]
def is_one_of(x, ys):
"""判断x是否在ys之中
等价于x in ys,但有些情况下x in ys会报错
"""
for y in ys:
if x is y:
return True
return False
class DataGenerator(object):
"""数据生成器模版
"""
def __init__(self, data, batch_size=32, buffer_size=None):
self.data = data
self.batch_size = batch_size
if hasattr(self.data, '__len__'):
self.steps = len(self.data) // self.batch_size
if len(self.data) % self.batch_size != 0:
self.steps += 1
else:
self.steps = None
self.buffer_size = buffer_size or batch_size * 1000
def __len__(self):
return self.steps
def sample(self, random=False):
"""采样函数,每个样本同时返回一个is_end标记
"""
if random:
if self.steps is None:
def generator():
caches, isfull = [], False
for d in self.data:
caches.append(d)
if isfull:
i = np.random.randint(len(caches))
yield caches.pop(i)
elif len(caches) == self.buffer_size:
isfull = True
while caches:
i = np.random.randint(len(caches))
yield caches.pop(i)
else:
def generator():
for i in np.random.permutation(len(self.data)):
yield self.data[i]
data = generator()
else:
data = iter(self.data)
d_current = next(data)
for d_next in data:
yield False, d_current
d_current = d_next
yield True, d_current
def __iter__(self, random=False):
raise NotImplementedError
def forfit(self, random=True):
while True:
for d in self.__iter__(random):
yield d
def to_dataset(self, types, shapes, names=None, padded_batch=False):
"""转为tf.data.Dataset格式
如果传入names的话,自动把数据包装成dict形式。
"""
if names is None:
generator = self.forfit
else:
if is_string(names):
warps = lambda k, v: {k: v}
elif is_string(names[0]):
warps = lambda k, v: dict(zip(k, v))
else:
warps = lambda k, v: tuple(
dict(zip(i, j)) for i, j in zip(k, v)
)
def generator():
for d in self.forfit():
yield warps(names, d)
types = warps(names, types)
shapes = warps(names, shapes)
if padded_batch:
dataset = tf.data.Dataset.from_generator(
generator, output_types=types
)
dataset = dataset.padded_batch(self.batch_size, shapes)
else:
dataset = tf.data.Dataset.from_generator(
generator, output_types=types, output_shapes=shapes
)
dataset = dataset.batch(self.batch_size)
return dataset
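# A minimal subclass sketch (illustrative; `tokenizer` and the (text, label)
# data format are assumptions): __iter__ yields padded batches built with
# self.sample, which also provides the is_end flag for flushing the last batch.
#
#     class MyDataGenerator(DataGenerator):
#         def __iter__(self, random=False):
#             batch_token_ids, batch_labels = [], []
#             for is_end, (text, label) in self.sample(random):
#                 token_ids, _ = tokenizer.encode(text, maxlen=128)
#                 batch_token_ids.append(token_ids)
#                 batch_labels.append([label])
#                 if len(batch_token_ids) == self.batch_size or is_end:
#                     yield sequence_padding(batch_token_ids), np.array(batch_labels)
#                     batch_token_ids, batch_labels = [], []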
class ViterbiDecoder(object):
"""Viterbi解码算法基类
"""
def __init__(self, trans, starts=None, ends=None):
self.trans = trans
self.num_labels = len(trans)
self.non_starts = []
self.non_ends = []
if starts is not None:
for i in range(self.num_labels):
if i not in starts:
self.non_starts.append(i)
if ends is not None:
for i in range(self.num_labels):
if i not in ends:
self.non_ends.append(i)
def decode(self, nodes):
"""nodes.shape=[seq_len, num_labels]
"""
        # Preprocessing
nodes[0, self.non_starts] -= np.inf
nodes[-1, self.non_ends] -= np.inf
        # Dynamic programming
labels = np.arange(self.num_labels).reshape((1, -1))
scores = nodes[0].reshape((-1, 1))
paths = labels
for l in range(1, len(nodes)):
M = scores + self.trans + nodes[l].reshape((1, -1))
idxs = M.argmax(0)
scores = M.max(0).reshape((-1, 1))
paths = np.concatenate([paths[:, idxs], labels], 0)
        # The optimal path
return paths[:, scores[:, 0].argmax()]
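# Usage sketch (illustrative): trans is a [num_labels, num_labels] matrix of
# transition scores (e.g. learned CRF transitions) and nodes holds per-step
# emission scores; decode returns the best-scoring label path.
#
#     >>> trans = np.log([[0.7, 0.3], [0.4, 0.6]])
#     >>> nodes = np.log([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])
#     >>> ViterbiDecoder(trans).decode(nodes)
#     array([0, 1, 1])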
def softmax(x, axis=-1):
"""numpy版softmax
"""
x = x - x.max(axis=axis, keepdims=True)
x = np.exp(x)
return x / x.sum(axis=axis, keepdims=True)
class AutoRegressiveDecoder(object):
"""通用自回归生成模型解码基类
包含beam search和random sample两种策略
"""
def __init__(self, start_id, end_id, maxlen, minlen=1):
self.start_id = start_id
self.end_id = end_id
self.maxlen = maxlen
self.minlen = minlen
self.models = {}
if start_id is None:
self.first_output_ids = np.empty((1, 0), dtype=int)
else:
self.first_output_ids = np.array([[self.start_id]])
@staticmethod
def wraps(default_rtype='probas', use_states=False):
"""用来进一步完善predict函数
目前包含:1. 设置rtype参数,并做相应处理;
2. 确定states的使用,并做相应处理;
3. 设置温度参数,并做相应处理。
"""
def actual_decorator(predict):
def new_predict(
self,
inputs,
output_ids,
states,
temperature=1,
rtype=default_rtype
):
assert rtype in ['probas', 'logits']
prediction = predict(self, inputs, output_ids, states)
if not use_states:
prediction = (prediction, None)
if default_rtype == 'logits':
prediction = (
softmax(prediction[0] / temperature), prediction[1]
)
elif temperature != 1:
probas = np.power(prediction[0], 1.0 / temperature)
probas = probas / probas.sum(axis=-1, keepdims=True)
prediction = (probas, prediction[1])
if rtype == 'probas':
return prediction
else:
return np.log(prediction[0] + 1e-12), prediction[1]
return new_predict
return actual_decorator
def last_token(self, model):
"""创建一个只返回最后一个token输出的新Model
"""
if model not in self.models:
outputs = [
keras.layers.Lambda(lambda x: x[:, -1])(output)
for output in model.outputs
]
self.models[model] = keras.models.Model(model.inputs, outputs)
return self.models[model]
def predict(self, inputs, output_ids, states=None):
"""用户需自定义递归预测函数
说明:定义的时候,需要用wraps方法进行装饰,传入default_rtype和use_states,
其中default_rtype为字符串logits或probas,probas时返回归一化的概率,
rtype=logits时则返回softmax前的结果或者概率对数。
返回:二元组 (得分或概率, states)
"""
raise NotImplementedError
def beam_search(self, inputs, topk, states=None, temperature=1, min_ends=1):
"""beam search解码
说明:这里的topk即beam size;
返回:最优解码序列。
"""
        inputs = [np.array([i]) for i in inputs]
        output_ids, output_scores = self.first_output_ids, np.zeros(1)
        for step in range(self.maxlen):
            scores, states = self.predict(
                inputs, output_ids, states, temperature, 'logits'
            )  # compute current scores
            if step == 0:  # after the first step, tile the inputs topk times
                inputs = [np.repeat(i, topk, axis=0) for i in inputs]
            scores = output_scores.reshape((-1, 1)) + scores  # accumulate scores
            indices = scores.argpartition(-topk, axis=None)[-topk:]  # keep only the topk
            indices_1 = indices // scores.shape[1]  # row indices
            indices_2 = (indices % scores.shape[1]).reshape((-1, 1))  # column indices
            output_ids = np.concatenate([output_ids[indices_1], indices_2],
                                        1)  # update the outputs
            output_scores = np.take_along_axis(
                scores, indices, axis=None
            )  # update the scores
            end_counts = (output_ids == self.end_id).sum(1)  # count end tokens
            if output_ids.shape[1] >= self.minlen:  # minimum-length check
                best_one = output_scores.argmax()  # the best-scoring sequence
                if end_counts[best_one] == min_ends:  # if it has terminated
                    return output_ids[best_one]  # return it directly
                else:  # otherwise keep only the unfinished sequences
                    flag = (end_counts < min_ends)  # mark unfinished sequences
                    if not flag.all():  # if some have finished
                        inputs = [i[flag] for i in inputs]  # drop finished inputs
                        output_ids = output_ids[flag]  # drop finished sequences
                        output_scores = output_scores[flag]  # drop finished scores
                        end_counts = end_counts[flag]  # drop finished end counts
                        topk = flag.sum()  # shrink topk accordingly
        # max length reached: return the best sequence directly
        return output_ids[output_scores.argmax()]
def random_sample(
self,
inputs,
n,
topk=None,
topp=None,
states=None,
temperature=1,
min_ends=1
):
"""随机采样n个结果
说明:非None的topk表示每一步只从概率最高的topk个中采样;而非None的topp
表示每一步只从概率最高的且概率之和刚好达到topp的若干个token中采样。
返回:n个解码序列组成的list。
"""
        inputs = [np.array([i]) for i in inputs]
        output_ids = self.first_output_ids
        results = []
        for step in range(self.maxlen):
            probas, states = self.predict(
                inputs, output_ids, states, temperature, 'probas'
            )  # compute current probabilities
            probas /= probas.sum(axis=1, keepdims=True)  # ensure normalization
            if step == 0:  # after the first step, tile everything n times
                probas = np.repeat(probas, n, axis=0)
                inputs = [np.repeat(i, n, axis=0) for i in inputs]
                output_ids = np.repeat(output_ids, n, axis=0)
            if topk is not None:
                k_indices = probas.argpartition(-topk,
                                                axis=1)[:, -topk:]  # keep only the topk
                probas = np.take_along_axis(probas, k_indices, axis=1)  # topk probabilities
                probas /= probas.sum(axis=1, keepdims=True)  # renormalize
            if topp is not None:
                p_indices = probas.argsort(axis=1)[:, ::-1]  # sort from high to low
                probas = np.take_along_axis(probas, p_indices, axis=1)  # sorted probabilities
                cumsum_probas = np.cumsum(probas, axis=1)  # cumulative probabilities
                flag = np.roll(cumsum_probas >= topp, 1, axis=1)  # mark the part beyond topp
                flag[:, 0] = False  # combined with np.roll, this shifts the mask by one
                probas[flag] = 0  # zero out everything beyond topp
                probas /= probas.sum(axis=1, keepdims=True)  # renormalize
            sample_func = lambda p: np.random.choice(len(p), p=p)  # sample by probability
            sample_ids = np.apply_along_axis(sample_func, 1, probas)  # run the sampling
            sample_ids = sample_ids.reshape((-1, 1))  # align the shapes
            if topp is not None:
                sample_ids = np.take_along_axis(
                    p_indices, sample_ids, axis=1
                )  # map back to the original ids
            if topk is not None:
                sample_ids = np.take_along_axis(
                    k_indices, sample_ids, axis=1
                )  # map back to the original ids
            output_ids = np.concatenate([output_ids, sample_ids], 1)  # update the outputs
            end_counts = (output_ids == self.end_id).sum(1)  # count end tokens
            if output_ids.shape[1] >= self.minlen:  # minimum-length check
                flag = (end_counts == min_ends)  # mark finished sequences
                if flag.any():  # if some have finished
                    for ids in output_ids[flag]:  # store the finished sequences
                        results.append(ids)
                    flag = ~flag  # mark the unfinished sequences
                    inputs = [i[flag] for i in inputs]  # keep only unfinished inputs
                    output_ids = output_ids[flag]  # keep only unfinished candidates
                    end_counts = end_counts[flag]  # keep only unfinished end counts
                    if len(output_ids) == 0:
                        break
        # put any remaining unfinished sequences into the results
        for ids in output_ids:
            results.append(ids)
        # return the results
return results
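# A minimal subclass sketch (illustrative; `model` is an assumed UniLM-style
# Seq2Seq Keras model and `tokenizer` an assumed Tokenizer): predict must be
# decorated with wraps and return the distribution for the next token only.
#
#     class MyDecoder(AutoRegressiveDecoder):
#         @AutoRegressiveDecoder.wraps(default_rtype='probas')
#         def predict(self, inputs, output_ids, states):
#             token_ids, segment_ids = inputs
#             token_ids = np.concatenate([token_ids, output_ids], 1)
#             segment_ids = np.concatenate(
#                 [segment_ids, np.ones_like(output_ids)], 1
#             )
#             return self.last_token(model).predict([token_ids, segment_ids])
#
#     decoder = MyDecoder(start_id=None, end_id=tokenizer._token_end_id, maxlen=32)
#     # token_ids, segment_ids come from tokenizer.encode(text)
#     best_ids = decoder.beam_search([token_ids, segment_ids], topk=3)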
def insert_arguments(**arguments):
"""装饰器,为类方法增加参数
(主要用于类的__init__方法)
"""
def actual_decorator(func):
def new_func(self, *args, **kwargs):
for k, v in arguments.items():
if k in kwargs:
v = kwargs.pop(k)
setattr(self, k, v)
return func(self, *args, **kwargs)
return new_func
return actual_decorator
def delete_arguments(*arguments):
"""装饰器,为类方法删除参数
(主要用于类的__init__方法)
"""
def actual_decorator(func):
def new_func(self, *args, **kwargs):
for k in arguments:
if k in kwargs:
raise TypeError(
'%s got an unexpected keyword argument \'%s\'' %
(self.__class__.__name__, k)
)
return func(self, *args, **kwargs)
return new_func
return actual_decorator
def longest_common_substring(source, target):
"""最长公共子串(source和target的最长公共切片区间)
返回:子串长度, 所在区间(四元组)
注意:最长公共子串可能不止一个,所返回的区间只代表其中一个。
"""
c, l, span = defaultdict(int), 0, (0, 0, 0, 0)
for i, si in enumerate(source, 1):
for j, tj in enumerate(target, 1):
if si == tj:
c[i, j] = c[i - 1, j - 1] + 1
if c[i, j] > l:
l = c[i, j]
span = (i - l, i, j - l, j)
return l, span
def longest_common_subsequence(source, target):
"""最长公共子序列(source和target的最长非连续子序列)
返回:子序列长度, 映射关系(映射对组成的list)
注意:最长公共子序列可能不止一个,所返回的映射只代表其中一个。
"""
c = defaultdict(int)
for i, si in enumerate(source, 1):
for j, tj in enumerate(target, 1):
if si == tj:
c[i, j] = c[i - 1, j - 1] + 1
elif c[i, j - 1] > c[i - 1, j]:
c[i, j] = c[i, j - 1]
else:
c[i, j] = c[i - 1, j]
l, mapping = c[len(source), len(target)], []
i, j = len(source) - 1, len(target) - 1
while len(mapping) < l:
if source[i] == target[j]:
mapping.append((i, j))
i, j = i - 1, j - 1
elif c[i + 1, j] > c[i, j + 1]:
j = j - 1
else:
i = i - 1
return l, mapping[::-1]
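# Examples (illustrative):
#
#     >>> longest_common_substring(u'科学空间', u'科学公园')
#     (2, (0, 2, 0, 2))
#     >>> longest_common_subsequence(u'科学空间', u'科学公园')
#     (2, [(0, 0), (1, 1)])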
class WebServing(object):
"""简单的Web接口
用法:
arguments = {'text': (None, True), 'n': (int, False)}
web = WebServing(port=8864)
web.route('/gen_synonyms', gen_synonyms, arguments)
web.start()
# 然后访问 http://127.0.0.1:8864/gen_synonyms?text=你好
说明:
基于bottlepy简单封装,仅作为临时测试使用,不保证性能。
目前仅保证支持 Tensorflow 1.x + Keras <= 2.3.1。
欢迎有经验的开发者帮忙改进。
依赖:
pip install bottle
pip install paste
(如果不用 server='paste' 的话,可以不装paste库)
"""
def __init__(self, host='0.0.0.0', port=8000, server='paste'):
import bottle
self.host = host
self.port = port
self.server = server
self.graph = tf.get_default_graph()
self.sess = K.get_session()
self.set_session = K.set_session
self.bottle = bottle
def wraps(self, func, arguments, method='GET'):
"""封装为接口函数
参数:
func:要转换为接口的函数,需要保证输出可以json化,即需要
保证 json.dumps(func(inputs)) 能被执行成功;
arguments:声明func所需参数,其中key为参数名,value[0]为
对应的转换函数(接口获取到的参数值都是字符串
型),value[1]为该参数是否必须;
method:GET或者POST。
"""
def new_func():
outputs = {'code': 0, 'desc': u'succeeded', 'data': {}}
kwargs = {}
for key, value in arguments.items():
if method == 'GET':
result = self.bottle.request.GET.getunicode(key)
else:
result = self.bottle.request.POST.getunicode(key)
if result is None:
if value[1]:
outputs['code'] = 1
outputs['desc'] = 'lack of "%s" argument' % key
return json.dumps(outputs, ensure_ascii=False)
else:
if value[0] is not None:
result = value[0](result)
kwargs[key] = result
try:
with self.graph.as_default():
self.set_session(self.sess)
outputs['data'] = func(**kwargs)
except Exception as e:
outputs['code'] = 2
outputs['desc'] = str(e)
return json.dumps(outputs, ensure_ascii=False)
return new_func
def route(self, path, func, arguments, method='GET'):
"""添加接口
"""
func = self.wraps(func, arguments, method)
self.bottle.route(path, method=method)(func)
def start(self):
"""启动服务
"""
self.bottle.run(host=self.host, port=self.port, server=self.server)
class Hook:
"""注入uniout模块,实现import时才触发
"""
def __init__(self, module):
self.module = module
def __getattr__(self, attr):
"""使得 from bert4keras.backend import uniout
等效于 import uniout (自动识别Python版本,Python3
下则无操作。)
"""
if attr == 'uniout':
if is_py2:
import uniout
else:
return getattr(self.module, attr)
Hook.__name__ = __name__
sys.modules[__name__] = Hook(sys.modules[__name__])
del Hook
#! -*- coding: utf-8 -*-
# Tokenization functions
import unicodedata, re
from bert4keras.snippets import is_string, is_py2
from bert4keras.snippets import open
from bert4keras.snippets import convert_to_unicode
from bert4keras.snippets import truncate_sequences
def load_vocab(dict_path, encoding='utf-8', simplified=False, startswith=None):
"""从bert的词典文件中读取词典
"""
token_dict = {}
with open(dict_path, encoding=encoding) as reader:
for line in reader:
token = line.split()
token = token[0] if token else line.strip()
token_dict[token] = len(token_dict)
    if simplified:  # filter out redundant tokens
new_token_dict, keep_tokens = {}, []
startswith = startswith or []
for t in startswith:
new_token_dict[t] = len(new_token_dict)
keep_tokens.append(token_dict[t])
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
if t not in new_token_dict:
keep = True
if len(t) > 1:
for c in Tokenizer.stem(t):
if (
Tokenizer._is_cjk_character(c) or
Tokenizer._is_punctuation(c)
):
keep = False
break
if keep:
new_token_dict[t] = len(new_token_dict)
keep_tokens.append(token_dict[t])
return new_token_dict, keep_tokens
else:
return token_dict
def save_vocab(dict_path, token_dict, encoding='utf-8'):
"""将词典(比如精简过的)保存为文件
"""
with open(dict_path, 'w', encoding=encoding) as writer:
for k, v in sorted(token_dict.items(), key=lambda s: s[1]):
writer.write(k + '\n')
class TokenizerBase(object):
"""分词器基类
"""
def __init__(
self,
token_start='[CLS]',
token_end='[SEP]',
pre_tokenize=None,
token_translate=None
):
"""参数说明:
pre_tokenize:外部传入的分词函数,用作对文本进行预分词。如果传入
pre_tokenize,则先执行pre_tokenize(text),然后在它
的基础上执行原本的tokenize函数;
token_translate:映射字典,主要用在tokenize之后,将某些特殊的token
替换为对应的token。
"""
self._token_pad = '[PAD]'
self._token_unk = '[UNK]'
self._token_mask = '[MASK]'
self._token_start = token_start
self._token_end = token_end
self._pre_tokenize = pre_tokenize
self._token_translate = token_translate or {}
self._token_translate_inv = {
v: k
for k, v in self._token_translate.items()
}
def tokenize(self, text, maxlen=None):
"""分词函数
"""
tokens = [
self._token_translate.get(token) or token
for token in self._tokenize(text)
]
if self._token_start is not None:
tokens.insert(0, self._token_start)
if self._token_end is not None:
tokens.append(self._token_end)
if maxlen is not None:
index = int(self._token_end is not None) + 1
truncate_sequences(maxlen, -index, tokens)
return tokens
def token_to_id(self, token):
"""token转换为对应的id
"""
raise NotImplementedError
def tokens_to_ids(self, tokens):
"""token序列转换为对应的id序列
"""
return [self.token_to_id(token) for token in tokens]
def encode(
self,
first_text,
second_text=None,
maxlen=None,
pattern='S*E*E',
truncate_from='right'
):
"""输出文本对应token id和segment id
"""
if is_string(first_text):
first_tokens = self.tokenize(first_text)
else:
first_tokens = first_text
if second_text is None:
second_tokens = None
elif is_string(second_text):
second_tokens = self.tokenize(second_text)
else:
second_tokens = second_text
if maxlen is not None:
if truncate_from == 'right':
index = -int(self._token_end is not None) - 1
elif truncate_from == 'left':
index = int(self._token_start is not None)
else:
index = truncate_from
if second_text is not None and pattern == 'S*E*E':
maxlen += 1
truncate_sequences(maxlen, index, first_tokens, second_tokens)
first_token_ids = self.tokens_to_ids(first_tokens)
first_segment_ids = [0] * len(first_token_ids)
if second_text is not None:
if pattern == 'S*E*E':
idx = int(bool(self._token_start))
second_tokens = second_tokens[idx:]
second_token_ids = self.tokens_to_ids(second_tokens)
second_segment_ids = [1] * len(second_token_ids)
first_token_ids.extend(second_token_ids)
first_segment_ids.extend(second_segment_ids)
return first_token_ids, first_segment_ids
def id_to_token(self, i):
"""id序列为对应的token
"""
raise NotImplementedError
def ids_to_tokens(self, ids):
"""id序列转换为对应的token序列
"""
return [self.id_to_token(i) for i in ids]
def decode(self, ids):
"""转为可读文本
"""
raise NotImplementedError
def _tokenize(self, text):
"""基本分词函数
"""
raise NotImplementedError
class Tokenizer(TokenizerBase):
"""Bert原生分词器
纯Python实现,代码修改自keras_bert的tokenizer实现
"""
def __init__(
self, token_dict, do_lower_case=False, word_maxlen=200, **kwargs
):
super(Tokenizer, self).__init__(**kwargs)
if is_string(token_dict):
token_dict = load_vocab(token_dict)
self._do_lower_case = do_lower_case
self._token_dict = token_dict
self._token_dict_inv = {v: k for k, v in token_dict.items()}
self._vocab_size = len(token_dict)
self._word_maxlen = word_maxlen
for token in ['pad', 'unk', 'mask', 'start', 'end']:
            try:
                _token_id = token_dict[getattr(self, '_token_%s' % token)]
                setattr(self, '_token_%s_id' % token, _token_id)
            except KeyError:  # this special token is not in the vocab
                pass
def token_to_id(self, token):
"""token转换为对应的id
"""
return self._token_dict.get(token, self._token_unk_id)
def id_to_token(self, i):
"""id转换为对应的token
"""
return self._token_dict_inv[i]
def decode(self, ids, tokens=None):
"""转为可读文本
"""
tokens = tokens or self.ids_to_tokens(ids)
tokens = [token for token in tokens if not self._is_special(token)]
text, flag = '', False
for i, token in enumerate(tokens):
if token[:2] == '##':
text += token[2:]
elif len(token) == 1 and self._is_cjk_character(token):
text += token
elif len(token) == 1 and self._is_punctuation(token):
text += token
text += ' '
elif i > 0 and self._is_cjk_character(text[-1]):
text += token
else:
text += ' '
text += token
text = re.sub(' +', ' ', text)
text = re.sub('\' (re|m|s|t|ve|d|ll) ', '\'\\1 ', text)
punctuation = self._cjk_punctuation() + '+-/={(<['
punctuation_regex = '|'.join([re.escape(p) for p in punctuation])
punctuation_regex = '(%s) ' % punctuation_regex
text = re.sub(punctuation_regex, '\\1', text)
        text = re.sub(r'(\d\.) (\d)', r'\1\2', text)
return text.strip()
def _tokenize(self, text, pre_tokenize=True):
"""基本分词函数
"""
if self._do_lower_case:
if is_py2:
text = unicode(text)
text = text.lower()
text = unicodedata.normalize('NFD', text)
text = ''.join([
ch for ch in text if unicodedata.category(ch) != 'Mn'
])
if pre_tokenize and self._pre_tokenize is not None:
tokens = []
for token in self._pre_tokenize(text):
if token in self._token_dict:
tokens.append(token)
else:
tokens.extend(self._tokenize(token, False))
return tokens
spaced = ''
for ch in text:
if self._is_punctuation(ch) or self._is_cjk_character(ch):
spaced += ' ' + ch + ' '
elif self._is_space(ch):
spaced += ' '
elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch):
continue
else:
spaced += ch
tokens = []
for word in spaced.strip().split():
tokens.extend(self._word_piece_tokenize(word))
return tokens
def _word_piece_tokenize(self, word):
"""word内分成subword
"""
if len(word) > self._word_maxlen:
return [word]
tokens, start, end = [], 0, 0
while start < len(word):
end = len(word)
while end > start:
sub = word[start:end]
if start > 0:
sub = '##' + sub
if sub in self._token_dict:
break
end -= 1
if start == end:
return [word]
else:
tokens.append(sub)
start = end
return tokens
@staticmethod
def stem(token):
"""获取token的“词干”(如果是##开头,则自动去掉##)
"""
if token[:2] == '##':
return token[2:]
else:
return token
@staticmethod
def _is_space(ch):
"""空格类字符判断
"""
return ch == ' ' or ch == '\n' or ch == '\r' or ch == '\t' or \
unicodedata.category(ch) == 'Zs'
@staticmethod
def _is_punctuation(ch):
"""标点符号类字符判断(全/半角均在此内)
提醒:unicodedata.category这个函数在py2和py3下的
表现可能不一样,比如u'§'字符,在py2下的结果为'So',
在py3下的结果是'Po'。
"""
code = ord(ch)
return 33 <= code <= 47 or \
58 <= code <= 64 or \
91 <= code <= 96 or \
123 <= code <= 126 or \
unicodedata.category(ch).startswith('P')
@staticmethod
def _cjk_punctuation():
return u'\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d\uff0f\uff1a\uff1b\uff1c\uff1d\uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e\uff5f\uff60\uff62\uff63\uff64\u3000\u3001\u3003\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011\u3014\u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u3030\u303e\u303f\u2013\u2014\u2018\u2019\u201b\u201c\u201d\u201e\u201f\u2026\u2027\ufe4f\ufe51\ufe54\u00b7\uff01\uff1f\uff61\u3002'
@staticmethod
def _is_cjk_character(ch):
"""CJK类字符判断(包括中文字符也在此列)
参考:https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
"""
code = ord(ch)
return 0x4E00 <= code <= 0x9FFF or \
0x3400 <= code <= 0x4DBF or \
0x20000 <= code <= 0x2A6DF or \
0x2A700 <= code <= 0x2B73F or \
0x2B740 <= code <= 0x2B81F or \
0x2B820 <= code <= 0x2CEAF or \
0xF900 <= code <= 0xFAFF or \
0x2F800 <= code <= 0x2FA1F
@staticmethod
def _is_control(ch):
"""控制类字符判断
"""
return unicodedata.category(ch) in ('Cc', 'Cf')
@staticmethod
def _is_special(ch):
"""判断是不是有特殊含义的符号
"""
return bool(ch) and (ch[0] == '[') and (ch[-1] == ']')
def rematch(self, text, tokens):
"""给出原始的text和tokenize后的tokens的映射关系
"""
if is_py2:
text = unicode(text)
if self._do_lower_case:
text = text.lower()
normalized_text, char_mapping = '', []
for i, ch in enumerate(text):
if self._do_lower_case:
ch = unicodedata.normalize('NFD', ch)
ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
ch = ''.join([
c for c in ch
if not (ord(c) == 0 or ord(c) == 0xfffd or self._is_control(c))
])
normalized_text += ch
char_mapping.extend([i] * len(ch))
text, token_mapping, offset = normalized_text, [], 0
for token in tokens:
if self._is_special(token):
token_mapping.append([])
else:
token = self.stem(token)
start = text[offset:].index(token) + offset
end = start + len(token)
token_mapping.append(char_mapping[start:end])
offset = end
return token_mapping
class SpTokenizer(TokenizerBase):
"""基于SentencePiece模型的封装,使用上跟Tokenizer基本一致。
"""
def __init__(self, sp_model_path, **kwargs):
super(SpTokenizer, self).__init__(**kwargs)
import sentencepiece as spm
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(sp_model_path)
self._token_pad = self.sp_model.id_to_piece(self.sp_model.pad_id())
self._token_unk = self.sp_model.id_to_piece(self.sp_model.unk_id())
self._vocab_size = self.sp_model.get_piece_size()
for token in ['pad', 'unk', 'mask', 'start', 'end']:
            try:
                _token = getattr(self, '_token_%s' % token)
                _token_id = self.sp_model.piece_to_id(_token)
                setattr(self, '_token_%s_id' % token, _token_id)
            except Exception:  # this special token is not defined for the model
                pass
def token_to_id(self, token):
"""token转换为对应的id
"""
return self.sp_model.piece_to_id(token)
def id_to_token(self, i):
"""id转换为对应的token
"""
if i < self._vocab_size:
return self.sp_model.id_to_piece(i)
else:
return ''
def decode(self, ids):
"""转为可读文本
"""
tokens = [
self._token_translate_inv.get(token) or token
for token in self.ids_to_tokens(ids)
]
text = self.sp_model.decode_pieces(tokens)
return convert_to_unicode(text)
def _tokenize(self, text):
"""基本分词函数
"""
if self._pre_tokenize is not None:
text = ' '.join(self._pre_tokenize(text))
tokens = self.sp_model.encode_as_pieces(text)
return tokens
def _is_special(self, i):
"""判断是不是有特殊含义的符号
"""
return self.sp_model.is_control(i) or \
self.sp_model.is_unknown(i) or \
self.sp_model.is_unused(i)
def _is_decodable(self, i):
"""判断是否应该被解码输出
"""
return (i < self._vocab_size) and not self._is_special(i)
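# Usage sketch (illustrative; 'vocab.txt' is an assumed path to a BERT
# vocabulary file):
#
#     >>> tokenizer = Tokenizer('vocab.txt', do_lower_case=True)
#     >>> tokenizer.tokenize(u'今天天气不错')
#     ['[CLS]', u'今', u'天', u'天', u'气', u'不', u'错', '[SEP]']
#     >>> token_ids, segment_ids = tokenizer.encode(u'今天天气不错')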
# 例子合集
提示:Github上的examples只保证兼容Github上的最新版bert4keras,如果报错,请首先尝试升级bert4keras。
## 简介
- [basic_extract_features.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_extract_features.py): basic test; inspect the encoding sequence BERT produces for a sentence.
- [basic_gibbs_sampling_via_mlm.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_gibbs_sampling_via_mlm.py): basic test; random text generation with BERT + Gibbs sampling, see [here](https://kexue.fm/archives/8119).
- [basic_language_model_cpm_lm.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_cpm_lm.py): basic test; try out the generation quality of [CPM_LM](https://github.com/TsinghuaAI/CPM-Generate).
- [basic_language_model_gpt2_ml.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_gpt2_ml.py): basic test; try out the generation quality of [GPT2_ML](https://github.com/imcaspar/gpt2-ml).
- [basic_language_model_nezha_gen_gpt.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_nezha_gen_gpt.py): basic test; try out the generation quality of [GPT Base (a.k.a. NEZHA-GEN)](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-Gen-TensorFlow).
- [basic_make_uncased_model_cased.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_make_uncased_model_cased.py): basic test; give an uncased model the ability to distinguish case through a simple vocabulary tweak.
- [basic_masked_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_masked_language_model.py): basic test; check BERT's MLM head.
- [basic_simple_web_serving_simbert.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_simple_web_serving_simbert.py): basic test; the built-in WebServing (turning a model into a web API).
- [task_conditional_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/task_conditional_language_model.py): task example; a conditional language model built from BERT + [Conditional Layer Normalization](https://kexue.fm/archives/7124).
- [task_iflytek_adversarial_training.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_adversarial_training.py): task example; improve classification with [adversarial training](https://kexue.fm/archives/7234).
- [task_iflytek_bert_of_theseus.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_bert_of_theseus.py): task example; model compression with [BERT-of-Theseus](https://kexue.fm/archives/7575).
- [task_iflytek_gradient_penalty.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_gradient_penalty.py): task example; improve classification with a [gradient penalty](https://kexue.fm/archives/7234), which can be viewed as another form of adversarial training.
- [task_image_caption.py](https://github.com/bojone/bert4keras/tree/master/examples/task_image_caption.py): task example; image caption generation with BERT + [Conditional Layer Normalization](https://kexue.fm/archives/7124) + an ImageNet-pretrained model.
- [task_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/task_language_model.py): task example; load BERT's pretrained weights as an unconditional language model, effectively equivalent to GPT.
- [task_language_model_chinese_chess.py](https://github.com/bojone/bert4keras/tree/master/examples/task_language_model_chinese_chess.py): task example; play Chinese chess GPT-style, see the [blog post](https://kexue.fm/archives/7877).
- [task_question_answer_generation_by_seq2seq.py](https://github.com/bojone/bert4keras/tree/master/examples/task_question_answer_generation_by_seq2seq.py): task example; [automatic construction of question-answer pairs](https://kexue.fm/archives/7630) with a [UniLM](https://kexue.fm/archives/6933)-style Seq2Seq model; autoregressive text generation.
- [task_reading_comprehension_by_mlm.py](https://github.com/bojone/bert4keras/tree/master/examples/task_reading_comprehension_by_mlm.py): task example; [reading-comprehension QA](https://kexue.fm/archives/7148) with an MLM model; simple non-autoregressive text generation.
- [task_reading_comprehension_by_seq2seq.py](https://github.com/bojone/bert4keras/tree/master/examples/task_reading_comprehension_by_seq2seq.py): task example; [reading-comprehension QA](https://kexue.fm/archives/7115) with a [UniLM](https://kexue.fm/archives/6933)-style Seq2Seq model; autoregressive text generation.
- [task_relation_extraction.py](https://github.com/bojone/bert4keras/tree/master/examples/task_relation_extraction.py): task example; [relation extraction](https://kexue.fm/archives/7161) with BERT plus a custom "half-pointer, half-tagging" structure.
- [task_sentence_similarity_lcqmc.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentence_similarity_lcqmc.py): task example; sentence-pair classification.
- [task_sentiment_albert.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_albert.py): task example; sentiment classification with an ALBERT model.
- [task_sentiment_integrated_gradients.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_integrated_gradients.py): task example; visualize a sentiment classifier with [integrated gradients](https://kexue.fm/archives/7533).
- [task_sentiment_virtual_adversarial_training.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_virtual_adversarial_training.py): task example; semi-supervised learning with [virtual adversarial training](https://kexue.fm/archives/7466) to improve sentiment classification with few labels.
- [task_seq2seq_ape210k_math_word_problem.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_ape210k_math_word_problem.py): task example; solve elementary-school math word problems (formula generation) with a [UniLM](https://kexue.fm/archives/6933)-style Seq2Seq model, details [here](https://kexue.fm/archives/7809).
- [task_seq2seq_autotitle.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle.py): task example; news headline generation with a [UniLM](https://kexue.fm/archives/6933)-style Seq2Seq model.
- [task_seq2seq_autotitle_csl.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_csl.py): task example; paper title generation with a [UniLM](https://kexue.fm/archives/6933)-style Seq2Seq model, evaluation code included.
- [task_seq2seq_autotitle_csl_mt5.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_csl_mt5.py): task example; paper title generation with a [multilingual T5](https://kexue.fm/archives/7867) Seq2Seq model, evaluation code included.
- [task_seq2seq_autotitle_multigpu.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_multigpu.py): task example; news headline generation with a [UniLM](https://kexue.fm/archives/6933)-style Seq2Seq model, single-machine multi-GPU version.
- [task_sequence_labeling_cws_crf.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sequence_labeling_cws_crf.py): task example; Chinese word segmentation with BERT + [CRF](https://kexue.fm/archives/7196).
- [task_sequence_labeling_ner_crf.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sequence_labeling_ner_crf.py): task example; Chinese NER with BERT + [CRF](https://kexue.fm/archives/7196).
#! -*- coding: utf-8 -*-
# Sanity check: feature extraction
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load weights
# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_ids, segment_ids = to_array([token_ids], [segment_ids])
print('\n ===== predicting =====\n')
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-0.63251007 0.2030236 0.07936534 ... 0.49122632 -0.20493352
0.2575253 ]
[-0.7588351 0.09651865 1.0718756 ... -0.6109694 0.04312154
0.03881441]
[ 0.5477043 -0.792117 0.44435206 ... 0.42449304 0.41105673
0.08222899]
[-0.2924238 0.6052722 0.49968526 ... 0.8604137 -0.6533166
0.5369075 ]
[-0.7473459 0.49431565 0.7185162 ... 0.3848612 -0.74090636
0.39056838]
[-0.8741375 -0.21650358 1.338839 ... 0.5816864 -0.4373226
0.56181806]]]
"""
print('\n ===== reloading and predicting =====\n')
model.save('test.model')
del model
model = keras.models.load_model('test.model')
print(model.predict([token_ids, segment_ids]))
#! -*- coding: utf-8 -*-
# Sanity check: Gibbs sampling with the MLM head
from tqdm import tqdm
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
)  # build the model and load weights
sentences = []
init_sent = u'科学技术是第一生产力。'  # a seed sentence, or None for a random start
minlen, maxlen = 8, 32
steps = 10000
converged_steps = 1000
vocab_size = tokenizer._vocab_size
if init_sent is None:
length = np.random.randint(minlen, maxlen + 1)
tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]']
token_ids = tokenizer.tokens_to_ids(tokens)
segment_ids = [0] * len(token_ids)
else:
token_ids, segment_ids = tokenizer.encode(init_sent)
length = len(token_ids) - 2
for _ in tqdm(range(steps), desc='Sampling'):
# Gibbs sampling step: randomly mask one token, then resample it from the MLM distribution.
i = np.random.choice(length) + 1
token_ids[i] = tokenizer._token_mask_id
probas = model.predict(to_array([token_ids], [segment_ids]))[0, i]
token = np.random.choice(vocab_size, p=probas)
token_ids[i] = token
sentences.append(tokenizer.decode(token_ids))
print(u'Some of the sampled sentences:')
for _ in range(10):
print(np.random.choice(sentences[converged_steps:]))
#! -*- coding: utf-8 -*-
# Basic test: Tsinghua's open-source Chinese GPT2 model (2.6B parameters)
# Project: https://github.com/TsinghuaAI/CPM-Generate
# Blog: https://kexue.fm/archives/7912
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import SpTokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout
import jieba
jieba.initialize()
# model paths
config_path = '/root/kg/bert/CPM_LM_2.6B_TF/config.json'
checkpoint_path = '/root/kg/bert/CPM_LM_2.6B_TF/model.ckpt'
spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model'
def pre_tokenize(text):
"""分词前处理函数
"""
return [
w.replace(' ', u'\u2582').replace('\n', u'\u2583')
for w in jieba.cut(text, cut_all=False)
]
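# Roughly (illustration): pre_tokenize(u'今天 天气') -> [u'今天', u'\u2582', u'天气'],
# i.e. jieba cuts the text into words while spaces/newlines are escaped to the
# placeholder characters CPM's vocab expects; post_replace below undoes the
# escaping after decoding.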
tokenizer = SpTokenizer(
spm_path,
token_start=None,
token_end=None,
pre_tokenize=pre_tokenize,
token_translate={u'\u2583': '<cls>'}
)  # build the tokenizer
model = build_transformer_model(
config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2'
)  # build the model and load weights
class TextExpansion(AutoRegressiveDecoder):
"""基于随机采样的文本续写
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids = np.concatenate([inputs[0], output_ids], 1)
return self.last_token(model).predict(token_ids)
def generate(self, text, n=1, topp=0.95, temperature=1):
"""输出结果会有一定的随机性,如果只关心Few Shot效果,
可以考虑将解码方式换为beam search。
"""
token_ids, _ = tokenizer.encode(text)
results = self.random_sample([token_ids],
n,
topp=topp,
temperature=temperature)  # random sampling
results = [token_ids + [int(i) for i in ids] for ids in results]
texts = [tokenizer.decode(ids) for ids in results]
return [self.post_replace(text) for text in texts]
def post_replace(self, text):
for s, t in [(' ', ''), (u'\u2582', ' '), (u'\u2583', '\n')]:
text = text.replace(s, t)
return text
text_expansion = TextExpansion(
start_id=None,
end_id=3,  # 3 is <cls>, which doubles as the newline
maxlen=16,
)
# Commonsense reasoning
# Output for this example: 北京
query = u"""
美国的首都是华盛顿
法国的首都是巴黎
日本的首都是东京
中国的首都是
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Word translation
# Output for this example: bird
query = u"""
狗 dog
猫 cat
猪 pig
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Subject extraction
# Output for this example: 杨振宁
query = u"""
从1931年起,华罗庚在清华大学边学习边工作 华罗庚
在一间简陋的房间里,陈景润攻克了“哥德巴赫猜想” 陈景润
在这里,丘成桐得到IBM奖学金 丘成桐
杨振宁在粒子物理学、统计力学和凝聚态物理等领域作出里程碑性贡献
"""
print(text_expansion.generate(query[1:-1], 1)[0])
# Triple extraction
# Output for this example: 张红,体重,140斤
query = u"""
姚明的身高是211cm,是很多人心目中的偶像。 ->姚明,身高,211cm
毛泽东是绍兴人,早年在长沙读书。->毛泽东,出生地,绍兴
虽然周杰伦在欧洲办的婚礼,但是他是土生土长的中国人->周杰伦,国籍,中国
小明出生于武汉,但是却不喜欢在武汉生成,长大后去了北京。->小明,出生地,武汉
吴亦凡是很多人的偶像,但是他却是加拿大人,另很多人失望->吴亦凡,国籍,加拿大
武耀的生日在5月8号,这一天,大家都为他庆祝了生日->武耀,生日,5月8号
《青花瓷》是周杰伦最得意的一首歌。->周杰伦,作品,《青花瓷》
北京是中国的首都。->中国,首都,北京
蒋碧的家乡在盘龙城,毕业后去了深圳工作。->蒋碧,籍贯,盘龙城
上周我们和王立一起去了他的家乡云南玩昨天才回到了武汉。->王立,籍贯,云南
昨天11月17号,我和朋友一起去了海底捞,期间服务员为我的朋友刘章庆祝了生日。->刘章,生日,11月17号
张红的体重达到了140斤,她很苦恼。->
"""
print(text_expansion.generate(query[1:-1], 1)[0])
#! -*- coding: utf-8 -*-
# Basic test: the Chinese GPT2_ML model
# Intro: https://kexue.fm/archives/7292
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout
config_path = '/root/kg/bert/gpt2_ml/config.json'
checkpoint_path = '/root/kg/bert/gpt2_ml/model.ckpt-100000'
dict_path = '/root/kg/bert/gpt2_ml/vocab.txt'
tokenizer = Tokenizer(
dict_path, token_start=None, token_end=None, do_lower_case=True
)  # build the tokenizer
model = build_transformer_model(
config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml'
)  # build the model and load weights
class ArticleCompletion(AutoRegressiveDecoder):
"""基于随机采样的文章续写
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids = np.concatenate([inputs[0], output_ids], 1)
return self.last_token(model).predict(token_ids)
def generate(self, text, n=1, topp=0.95):
token_ids, _ = tokenizer.encode(text)
results = self.random_sample([token_ids], n, topp=topp)  # random sampling
return [text + tokenizer.decode(ids) for ids in results]
article_completion = ArticleCompletion(
start_id=None,
end_id=511,  # 511 is the Chinese full stop "。"
maxlen=256,
minlen=128
)
print(article_completion.generate(u'今天天气不错'))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错,可以去跑步。昨晚看了一个关于跑步的纪录片,里面的女主讲述的是一个女孩子的成长,很励志,也很美丽。我也想跑,但是我不知道跑步要穿运动鞋,所以就买了一双运动鞋。这个纪录片是关于运动鞋的,有一 集讲了一个女孩子,从小学开始就没有穿过运动鞋,到了高中才开始尝试跑步。']
>>> article_completion.generate(u'双十一')
[u'双十一马上就要到了!你还在为双11的物流配送而担心吗?你还在为没时间去仓库取货而发愁吗?你还在为不知道怎么买到便宜货而发愁吗?你还在为买不到心仪的产品而懊恼吗?那么,双十一就来了!今天小编带你来看看这些 快递,都是怎么送货的!1. 物流配送快递公司的配送,主要是由快递公司负责,快递公司负责派件,物流服务。']
>>> article_completion.generate(u'科学空间')
[u'科学空间站科学空间站(英文:science space station),是中华人民共和国的一个空间站。该空间站是中国科学院大连物理研究所研制,主要研发和使用中国科学院大连物理研究所的核能动力空间站。科学空间站位于北京市海淀区,距离地面393米,总建筑面积约为1万平方米,总投资约为5亿元人民币。科学空间站于2018年12月26日开始动工,2021年6月建成并投入使用。']
"""
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT model (base version), open-sourced by Huawei
# Weights: https://pan.baidu.com/s/1-FB0yl1uxYDCGIRvU1XNzQ (extraction code: xynn)
# Reference project: https://github.com/bojone/chinese-gen
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout
config_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/config.json'
checkpoint_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/gpt.ckpt'
dict_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
segment_vocab_size=0,  # drop the segment_ids input
application='lm',
)  # build the model and load weights
class ArticleCompletion(AutoRegressiveDecoder):
"""基于随机采样的文章续写
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids = np.concatenate([inputs[0], output_ids], 1)
return self.last_token(model).predict(token_ids)
def generate(self, text, n=1, topp=0.95):
token_ids = tokenizer.encode(text)[0][:-1]
results = self.random_sample([token_ids], n, topp=topp)  # random sampling
return [text + tokenizer.decode(ids) for ids in results]
article_completion = ArticleCompletion(
start_id=None,
end_id=511,  # 511 is the Chinese full stop "。"
maxlen=256,
minlen=128
)
print(article_completion.generate(u'今天天气不错'))
"""
Sample results:
>>> article_completion.generate(u'今天天气不错')
[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最 高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好']
>>> article_completion.generate(u'双十一')
[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设 事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建']
>>> article_completion.generate(u'科学空间')
[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通 过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。']
"""
#! -*- coding: utf-8 -*-
# Give an uncased model the ability to distinguish case by simply modifying the vocabulary
# Idea: add uppercased variants of the English tokens to the vocab and adjust the model's Embedding layer
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.snippets import to_array
import numpy as np
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
token_dict = load_vocab(dict_path)
new_token_dict = token_dict.copy()
compound_tokens = []
for t, i in sorted(token_dict.items(), key=lambda s: s[1]):
# Two cases are handled here: 1) first letter capitalized; 2) whole word uppercased.
# Under Python 2 this adds 5594 new tokens; under Python 3, 5596.
tokens = []
if t.isalpha():
tokens.extend([t[:1].upper() + t[1:], t.upper()])
elif t[:2] == '##' and t[2:].isalpha():
tokens.append(t.upper())
for token in tokens:
if token not in new_token_dict:
compound_tokens.append([i])
new_token_dict[token] = len(new_token_dict)
tokenizer = Tokenizer(new_token_dict, do_lower_case=False)
model = build_transformer_model(
config_path,
checkpoint_path,
compound_tokens=compound_tokens,  # add the new tokens, each initialized from the average of its old tokens
)
text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
Output: ['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""
token_ids, segment_ids = tokenizer.encode(text)
token_ids, segment_ids = to_array([token_ids], [segment_ids])
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-1.4999904e-01 1.9651388e-01 -1.7924258e-01 ... 7.8269649e-01
2.2241375e-01 1.1325148e-01]
[-4.5268752e-02 5.5090344e-01 7.4699545e-01 ... -4.7773960e-01
-1.7562288e-01 4.1265407e-01]
[ 7.0158571e-02 1.7816302e-01 3.6949167e-01 ... 9.6258509e-01
-8.4678203e-01 6.3776302e-01]
...
[ 9.3637377e-01 3.0232478e-02 8.1411439e-01 ... 7.9186147e-01
7.5704646e-01 -8.3475001e-04]
[ 2.3699696e-01 2.9953337e-01 8.1962071e-02 ... -1.3776925e-01
3.8681498e-01 3.2553676e-01]
[ 1.9728680e-01 7.7782705e-02 5.2951699e-01 ... 8.9622810e-02
-2.3932748e-02 6.9600858e-02]]]
"""
#! -*- coding: utf-8 -*-
# Sanity check: MLM
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(
config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
)  # build the model and load weights
token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
# mask out "技术"
token_ids[3] = token_ids[4] = tokenizer._token_mask_id
token_ids, segment_ids = to_array([token_ids], [segment_ids])
# predict the masked positions with the MLM head
probas = model.predict([token_ids, segment_ids])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # prints exactly "技术"
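# Illustrative addition (not in the original script): inspect the top-5
# candidate tokens the MLM head proposes for the first masked position.
top5 = probas[3].argsort()[-5:][::-1]
print([tokenizer.id_to_token(int(i)) for i in top5])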
#! -*- coding: utf-8 -*-
# Serve SimBERT paraphrase generation as a web API via the built-in helper.
# A thin wrapper over bottlepy, for quick testing only; performance is not guaranteed.
# Currently only guaranteed to support Tensorflow 1.x + Keras <= 2.3.1.
# For usage details see https://github.com/bojone/bert4keras/blob/8ffb46a16a79f87aa8cdf045df7994036b4be47d/bert4keras/snippets.py#L580
import numpy as np
from collections import Counter
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding, AutoRegressiveDecoder
from bert4keras.snippets import WebServing
maxlen = 32
# BERT config
config_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/vocab.txt'
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# build the model and load weights
bert = build_transformer_model(
config_path,
checkpoint_path,
with_pool='linear',
application='unilm',
return_keras_model=False,
)
encoder = keras.models.Model(bert.model.inputs, bert.model.outputs[0])
seq2seq = keras.models.Model(bert.model.inputs, bert.model.outputs[1])
class SynonymsGenerator(AutoRegressiveDecoder):
"""seq2seq解码器
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids, segment_ids = inputs
token_ids = np.concatenate([token_ids, output_ids], 1)
segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
return self.last_token(seq2seq).predict([token_ids, segment_ids])
def generate(self, text, n=1, topp=0.95):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
output_ids = self.random_sample([token_ids, segment_ids], n,
topp=topp)  # random sampling
return [tokenizer.decode(ids) for ids in output_ids]
synonyms_generator = SynonymsGenerator(
start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen
)
def gen_synonyms(text, n=100, k=20):
""""含义: 产生sent的n个相似句,然后返回最相似的k个。
做法:用seq2seq生成,并用encoder算相似度并排序。
"""
r = synonyms_generator.generate(text, n)
r = [i for i in set(r) if i != text]
r = [text] + r
X, S = [], []
for t in r:
x, s = tokenizer.encode(t)
X.append(x)
S.append(s)
X = sequence_padding(X)
S = sequence_padding(S)
Z = encoder.predict([X, S])
Z /= (Z**2).sum(axis=1, keepdims=True)**0.5
argsort = np.dot(Z[1:], -Z[0]).argsort()
return [r[i + 1] for i in argsort[:k]]
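# Ranking note: the rows of Z are L2-normalized above, so the dot product with
# Z[0] (the encoding of the input text) is cosine similarity; negating it makes
# argsort return the candidates from most to least similar.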
if __name__ == '__main__':
arguments = {'text': (None, True), 'n': (int, False), 'k': (int, False)}
web = WebServing(port=8864)
web.route('/gen_synonyms', gen_synonyms, arguments)
web.start()
# Now you can test it at http://127.0.0.1:8864/gen_synonyms?text=苹果多少钱一斤
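# Programmatic test (illustrative sketch; assumes the `requests` package is installed):
#
#     import requests
#     resp = requests.get('http://127.0.0.1:8864/gen_synonyms',
#                         params={'text': u'苹果多少钱一斤', 'n': 50, 'k': 10})
#     print(resp.text)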
#! -*- coding: utf-8 -*-
# BERT as a conditional language model
# Randomly generate text conditioned on a class; in this demo the class is sentiment polarity (positive/negative)
# See: https://kexue.fm/archives/7124
from __future__ import print_function
import re
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import text_segmentate
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from bert4keras.snippets import uniout  # for printing Chinese
from keras.layers import Input, Embedding, Reshape
from keras.models import Model
# model configuration
maxlen = 128
batch_size = 32
num_classes = 2
epochs = 20
# BERT config
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
# load and prune the vocab, then build the tokenizer
token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)
def load_data(filenames):
"""加载数据,并尽量划分为不超过maxlen的句子
"""
D = []
seps, strips = u'\n。!?!?;;,, ', u';;,, '
for filename in filenames:
with open(filename, encoding='utf-8') as f:
for l in f:
text, label = l.strip().split('\t')
for t in text_segmentate(text, maxlen - 2, seps, strips):
D.append((t, int(label)))
return D
# load the datasets
data = load_data([
'datasets/sentiment/sentiment.train.data',
'datasets/sentiment/sentiment.valid.data',
'datasets/sentiment/sentiment.test.data',
])
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids, batch_labels], None
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
class CrossEntropy(Loss):
"""交叉熵作为loss,并mask掉padding部分
"""
def compute_loss(self, inputs, mask=None):
y_true, y_pred = inputs
if mask[1] is None:
y_mask = 1.0
else:
y_mask = K.cast(mask[1], K.floatx())[:, 1:]
y_true = y_true[:, 1:]  # target token ids
y_pred = y_pred[:, :-1]  # predicted sequence, shifted one position
loss = K.sparse_categorical_crossentropy(y_true, y_pred)
loss = K.sum(loss * y_mask) / K.sum(y_mask)
return loss
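# Note on the shift above (illustration): for token_ids [CLS, t1, t2, SEP],
# y_pred[:, :-1] aligns the prediction made at each position with the target
# token one step to its right, so the model learns to predict every token from
# its prefix, as in a standard autoregressive LM.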
c_in = Input(shape=(1,))
c = Embedding(num_classes, 128)(c_in)
c = Reshape((128,))(c)
# BERT model
model = build_transformer_model(
config_path,
checkpoint_path,
application='lm',
keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, pruning the original vocab
layer_norm_cond=c,
additional_input_layers=c_in,
)
output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
model = Model(model.inputs, output)
model.compile(optimizer=Adam(1e-5))
model.summary()
class RandomSentiment(AutoRegressiveDecoder):
"""根据情感标签(0:负,1:正)随机生成一批句子
"""
@AutoRegressiveDecoder.wraps(default_rtype='probas')
def predict(self, inputs, output_ids, states):
token_ids = output_ids
segment_ids = np.zeros_like(token_ids)
return self.last_token(model).predict([
token_ids, segment_ids, inputs[0]
])
def generate(self, label, n=1, topp=0.95):
results = self.random_sample([[label]], n, topp=topp)  # random sampling
return [tokenizer.decode(ids) for ids in results]
random_sentiment = RandomSentiment(
start_id=tokenizer._token_start_id,
end_id=tokenizer._token_end_id,
maxlen=maxlen
)
def just_show():
print(u'Positive samples:')
print(random_sentiment.generate(1, 5), '\n')  # n=5 samples with the default topp
print(u'Negative samples:')
print(random_sentiment.generate(0, 5), '\n')
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.lowest = 1e10
def on_epoch_end(self, epoch, logs=None):
# save the best model
if logs['loss'] <= self.lowest:
self.lowest = logs['loss']
model.save_weights('./best_model.weights')
# show some samples
just_show()
if __name__ == '__main__':
evaluator = Evaluator()
train_generator = data_generator(data, batch_size)
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=epochs,
callbacks=[evaluator]
)
else:
model.load_weights('./best_model.weights')
"""
Positive samples:
[
u'外观时尚、漂亮、性价比高。',
u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。',
u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!',
u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。',
u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停'
]
Negative samples:
[
u'不知道是不是因为电池不太好,不是我不喜欢。',
u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.',
u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声',
u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?',
u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!'
]
"""
#! -*- coding:utf-8 -*-
# Improve the model's generalization with adversarial training
# Scores 2% higher than the published BERT base result on the same dataset on the CLUE leaderboard
# Dataset: IFLYTEK' long-text classification (https://github.com/CLUEbenchmark/CLUE)
# Blog: https://kexue.fm/archives/7234
# Works with Keras 2.3.1
import json
import numpy as np
from bert4keras.backend import keras, search_layer, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from keras.layers import Lambda, Dense
from tqdm import tqdm
num_classes = 119
maxlen = 128
batch_size = 32
# BERT base
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
def load_data(filename):
"""加载数据
单条格式:(文本, 标签id)
"""
D = []
with open(filename) as f:
for i, l in enumerate(f):
l = json.loads(l)
text, label = l['sentence'], l['label']
D.append((text, int(label)))
return D
# load the datasets
train_data = load_data(
'/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json'
)
valid_data = load_data(
'/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json'
)
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class data_generator(DataGenerator):
"""数据生成器
"""
def __iter__(self, random=False):
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
for is_end, (text, label) in self.sample(random):
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
batch_token_ids.append(token_ids)
batch_segment_ids.append(segment_ids)
batch_labels.append([label])
if len(batch_token_ids) == self.batch_size or is_end:
batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)
batch_labels = sequence_padding(batch_labels)
yield [batch_token_ids, batch_segment_ids], batch_labels
batch_token_ids, batch_segment_ids, batch_labels = [], [], []
# wrap the datasets in generators
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
# load the pretrained model
bert = build_transformer_model(
config_path=config_path,
checkpoint_path=checkpoint_path,
return_keras_model=False,
)
output = Lambda(lambda x: x[:, 0])(bert.model.output)
output = Dense(
units=num_classes,
activation='softmax',
kernel_initializer=bert.initializer
)(output)
model = keras.models.Model(bert.model.input, output)
model.summary()
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=Adam(2e-5),
metrics=['sparse_categorical_accuracy'],
)
def adversarial_training(model, embedding_name, epsilon=1):
"""给模型添加对抗训练
其中model是需要添加对抗训练的keras模型,embedding_name
则是model里边Embedding层的名字。要在模型compile之后使用。
"""
if model.train_function is None:  # no train function built yet
model._make_train_function()  # build it manually
old_train_function = model.train_function  # keep the old train function
# locate the Embedding layer
for output in model.outputs:
embedding_layer = search_layer(output, embedding_name)
if embedding_layer is not None:
break
if embedding_layer is None:
raise Exception('Embedding layer not found')
# gradient of the loss w.r.t. the embeddings
embeddings = embedding_layer.embeddings  # the embedding matrix
gradients = K.gradients(model.total_loss, [embeddings])  # embedding gradients
gradients = K.zeros_like(embeddings) + gradients[0]  # convert to a dense tensor
# wrap the gradient computation as a function
inputs = (
model._feed_inputs + model._feed_targets + model._feed_sample_weights
)  # all input tensors
embedding_gradients = K.function(
inputs=inputs,
outputs=[gradients],
name='embedding_gradients',
)  # wrapped as a callable
def train_function(inputs):  # redefined train function
grads = embedding_gradients(inputs)[0]  # embedding gradients
delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8)  # compute the perturbation
K.set_value(embeddings, K.eval(embeddings) + delta)  # inject the perturbation
outputs = old_train_function(inputs)  # gradient descent step
K.set_value(embeddings, K.eval(embeddings) - delta)  # remove the perturbation
return outputs
model.train_function = train_function  # override the original train function
# With the helper in place, enabling adversarial training takes a single line
adversarial_training(model, 'Embedding-Token', 0.5)
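# The perturbation above is a fast gradient method (FGM) step:
# delta = epsilon * g / (||g||_2 + 1e-8), applied in embedding space before
# each update and removed afterwards; epsilon (0.5 here) sets the radius.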
def evaluate(data):
total, right = 0., 0.
for x_true, y_true in data:
y_pred = model.predict(x_true).argmax(axis=1)
y_true = y_true[:, 0]
total += len(y_true)
right += (y_true == y_pred).sum()
return right / total
class Evaluator(keras.callbacks.Callback):
"""评估与保存
"""
def __init__(self):
self.best_val_acc = 0.
def on_epoch_end(self, epoch, logs=None):
val_acc = evaluate(valid_generator)
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
model.save_weights('best_model.weights')
print(
u'val_acc: %.5f, best_val_acc: %.5f\n' %
(val_acc, self.best_val_acc)
)
def predict_to_file(in_file, out_file):
"""输出预测结果到文件
结果文件可以提交到 https://www.cluebenchmarks.com 评测。
"""
fw = open(out_file, 'w')
with open(in_file) as fr:
for l in tqdm(fr):
l = json.loads(l)
text = l['sentence']
token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
label = model.predict([[token_ids], [segment_ids]])[0].argmax()
l = json.dumps({'id': str(l['id']), 'label': str(label)})
fw.write(l + '\n')
fw.close()
if __name__ == '__main__':
evaluator = Evaluator()
model.fit(
train_generator.forfit(),
steps_per_epoch=len(train_generator),
epochs=50,
callbacks=[evaluator]
)
else:
model.load_weights('best_model.weights')
# predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json')