"docs/vscode:/vscode.git/clone" did not exist on "47b334642261926618078c1e4e043cd0ec21fae7"
Commit 0b0ad26a authored by yuguo960516's avatar yuguo960516
Browse files

glm

parent f27ee404
@@ -161,6 +161,45 @@ tokenizer.decode: 0.0698804759979248秒
[CLS] 冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] <|endoftext|> <|startofpiece|> 避寒,当然是去海南呀!<n><n>海南的冬天,阳光明媚,温度适宜,而且空气清新,没有雾霾,没有沙尘暴,没有雾霾,没有雾霾!<n><n>海南的冬天,阳光明媚,温度适宜,而且空气清新,没有雾霾,没有沙尘暴,没有雾霾!<n><n>海南的冬天,阳光明媚,温度适宜,而且空气清新,没有雾霾,没有沙尘暴,没有雾霾!
```
### Q&A example
Uses 1 node with 4 DCU-Z100-16G cards and a tp=2, pp=2 parallel configuration.
Run the following:
```bash
cd projects/GLM
python3 -m oneflow.distributed.launch --nproc_per_node 4 glm-QA.py
```
Once the program is running, the user can ask questions interactively on the command line; entering "退出" (exit) ends the program, as shown below:
```
输入
> 如何改善睡眠质量
正在生成内容...
> [CLS] 如何改善睡眠质量 回答: [gMASK] <|endoftext|> <|startofpiece|> 睡眠不好,可以试着用以下方法改善: 1、睡前不要喝咖啡、浓茶、吸烟等,也不要喝含咖啡因的饮料,如可乐、咖啡、茶等。 2、睡前不要进行剧烈运动,如剧烈的跑步、跳舞、打球等。 3、睡前不要看刺激性的电视节目,如恐怖电影、凶杀片等。 4、睡前不要思考问题,如回忆今天发生的事情、明天的工作计划等。 5、睡前不要进食,如吃得过饱、过晚、过饱等。 6、睡前不要进行剧烈的体力活动,如跑步、打球、游泳
输入:
> 从北京到郑州有多少公里
正在生成内容...
> [CLS] 从北京到郑州有多少公里 回答: [gMASK] <|endoftext|> <|startofpiece|> 北京到郑州,直线距离约1000公里,开车需要大约12个小时。 <|endofpiece|>
输入:
> 推荐一部高分恐怖电影
正在生成内容...
> [CLS] 推荐一部高分恐怖电影 回答: [gMASK] <|endoftext|> <|startofpiece|> 《恐怖游轮》<n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n>
输入:
> 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者
正在生成内容...
> [CLS] 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] <|endoftext|> <|startofpiece|> 避寒,当然是去海南呀,海南的冬天,阳光明媚,温度适宜,而且海南的冬天,没有雾霾,没有沙尘暴,没有寒冷,只有温暖,海南的冬天,是避寒的好地方。 <|endofpiece|>
输入:
> 介绍一下中科曙光公司
正在生成内容...
> [CLS] 介绍一下中科曙光公司 回答: [gMASK] <|endoftext|> <|startofpiece|> 中科曙光公司成立于2000年,是中国科学院计算技术研究所控股的高科技公司,是国家首批创新型企业,是国家规划布局内重点软件企业,是国家863计划成果产业化基地,是国家高技术产业化示范工程,是国家火炬计划重点高新技术企业,是国家创新型企业试点单位,是国家集成电路设计产业化基地,是国家信息安全成果产业化基地,是国家863计划成果产业化基地,是国家集成电路设计产业化基地,是国家信息安全成果产业化基地,是国家火炬计划重点高新技术企业,是国家创新型企业试点单位,是国家集成电路设计产业化基地,是国家信息安全成果产业化基地,是国家863计划成果产业化基地
输入:
> 退出
> 再见
```
## Performance data
Accelerator cards used: 4 × DCU-Z100-16G:
......
@@ -64,7 +64,8 @@ class QuickGELU(nn.Module):
         super().__init__()
     def forward(self, x: flow.Tensor) -> flow.Tensor:
-        return x * flow.sigmoid(1.702 * x)
+        """QuickGELU is estimated with: x * flow.sigmoid(1.702 * x)"""
+        return flow._C.quick_gelu(x)
 def build_activation(activation: Optional[Activation]):
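As a quick sanity check that the fused kernel matches the previous formula, one could run a snippet like the following (a sketch, not part of the commit; it assumes a OneFlow build that exposes `flow._C.quick_gelu`, as the new line above does):
```python
import oneflow as flow

# Compare the fused QuickGELU kernel against the explicit sigmoid approximation.
x = flow.randn(4, 8)
assert flow.allclose(flow._C.quick_gelu(x), x * flow.sigmoid(1.702 * x), atol=1e-5)
```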
......
@@ -70,6 +70,7 @@ class LayerNorm(nn.Module):
             self.bias = None
     def forward(self, x):
+        x = x.to_global(placement=self.weight.placement)
         assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
         begin_norm_axis = x.ndim - len(self.normalized_shape)
         begin_params_axis = x.ndim - len(self.normalized_shape)
@@ -126,4 +127,5 @@ class RMSLayerNorm(nn.Module):
         self.l2norm_epsilon = eps
     def forward(self, hidden_states):
+        hidden_states = hidden_states.to_global(placement=self.weight.placement)
         return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon)
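The two `to_global` calls added above move the incoming activation onto the placement of the layer's own weights before the op runs. For context, a minimal illustrative sketch (not from the commit; it assumes a LiBai pipeline-parallel setup with at least two stages):
```python
import oneflow as flow
from libai.utils import distributed as dist

# An activation produced on the first pipeline stage ...
x = flow.ones(
    2, 4,
    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
    placement=dist.get_layer_placement(0),
)
# ... is transferred to the stage that holds this layer's parameters before computing.
x = x.to_global(placement=dist.get_layer_placement(-1))  # -1: last pipeline stage
```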
@@ -9,17 +9,17 @@ cfg = dict(
     hidden_size=4096,
     num_attention_heads=64,
     max_sequence_length=1024,
-    embedding_dropout_prob=0.1,
-    attention_dropout_prob=0.1,
-    output_dropout_prob=0.1,
+    embedding_dropout_prob=0.0,
+    attention_dropout_prob=0.0,
+    output_dropout_prob=0.0,
     layernorm_epsilon=1e-5,
     initializer_range=0.02,
     use_scaled_init_for_output_weights=True,
     bias_gelu_fusion=True,
-    bias_dropout_fusion=False,
+    bias_dropout_fusion=True,
     scale_mask_softmax_fusion=False,
     apply_query_key_layer_scaling=False,
-    amp_enabled=False,
+    amp_enabled=True,
     block_position_encoding=True,
     attention_scale=1.0,
     padding_idx=None,
......
# model parallel + pipeline parallel demo
import oneflow as flow
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
from omegaconf import DictConfig
import time

# Only the parallel scheme needs to be configured
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        pipeline_num_layers=2 * 24,
    )
)
dist.setup_dist_util(parallel_config)

tokenizer = GLMChineseTokenzier.from_pretrained("glm-10b-chinese")
sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "glm-10b-chinese",
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
if dist.is_main_process():
    print("请稍等,正在加载模型中...")
model = loader.load()

question = ""
while True:
    if dist.is_main_process():
        print("输入:")
        question = input("> ")
    else:
        question = None
    question = dist.broadcast_py_object(question, src=0)
    dist.synchronize()
    # print(question)
    if question.lower() == "退出":
        break
    input_ids = tokenizer.encode(
        [question + " 回答: [gMASK]"],
        return_tensors="of",
    )
    inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=128)
    if dist.is_main_process():
        print("正在生成内容...")
    # start_t = time.time()
    outputs = model.generate(
        inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
        position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
        generation_attention_mask=inputs["generation_attention_mask"].to_global(
            sbp=sbp, placement=placement
        ),
        max_length=128,
    )
    # end_t = time.time()
    # if dist.is_main_process():
    #     print("model.generate: %s秒" % (end_t - start_t))
    res = tokenizer.decode(outputs[0])
    if dist.is_main_process():
        print("> " + res)

if dist.is_main_process():
    print("> 再见")
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
tokenizer = GLMChineseTokenzier.from_pretrained("/data/home/xiezipeng/glm-10b-chinese")
input_ids = tokenizer.encode(
    ["西游记的作者是[MASK]。"],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size(), dtype=flow.bool)}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

dist.set_device_type("cpu")
loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "/data/home/xiezipeng/glm-10b-chinese",
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
model = loader.load()
model = model.half().cuda()
model.eval()
dist.set_device_type("cuda")

while True:
    outputs = model.generate(
        inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
        position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
        generation_attention_mask=inputs["generation_attention_mask"].to_global(
            sbp=sbp, placement=placement
        ),
        max_length=512,
    )
    res = tokenizer.decode(outputs[0])
    if dist.is_main_process():
        print(res)
@@ -41,6 +41,7 @@ class MultiheadAttention(nn.Module):
         super().__init__()
         self.hidden_size = hidden_size
         self.attention_scale = attention_scale
+        self.num_attention_heads = num_attention_heads
         if output_layer_init_method is None:
             output_layer_init_method = init_method
......
# GLM
In 2017, Google introduced the Transformer architecture, and pre-trained models such as BERT, GPT, and T5 have kept setting new SOTA records on task after task ever since. Last year, Tsinghua University proposed the GLM model (https://github.com/THUDM/GLM). Unlike the pre-trained architectures above, it uses an autoregressive blank-infilling objective and achieves strong results on the three main classes of NLP tasks: natural language understanding, unconditional generation, and conditional generation.
LiBai mainly implements the inference part of GLM; for training-related material, see:
- [GLM国产大模型训练加速:性能最高提升3倍,显存节省1/3,低成本上手](https://mp.weixin.qq.com/s/dkTGXuJV38KuLb4_LmM20Q)
- https://github.com/Oneflow-Inc/one-glm
## GLM-Inference
When a model grows so large that its parameters no longer fit on a single GPU, convenient distributed training and inference become a necessity, and the industry has produced tools for it accordingly.
LiBai, a model library built on OneFlow, brings the barrier to distributed execution down to a minimum: users do not have to think about how the model is partitioned across devices, and can switch between distributed strategies by changing just a few configuration values, while still getting excellent acceleration.
GLM built with LiBai supports model parallel + pipeline parallel inference out of the box, which nicely solves the problem of a large model not fitting on a single card.
So how does a user build the distributed inference part of GLM with LiBai, the large-scale model training and inference library? The small example below explains it.
### Distributed inference has a natural advantage
Keep in mind that a model's parameters are really just tensors, i.e. matrices; a large model's parameters are large matrices, and a parallel strategy splits those large matrices into smaller ones and assigns them to different cards or devices. The basic LinearLayer is implemented in LiBai as follows:
```python
class Linear1D(nn.Module):
    def __init__(self, in_features, out_features, parallel="data", layer_idx=0, ...):
        super().__init__()
        if parallel == "col":
            weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
        elif parallel == "row":
            weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
        elif parallel == "data":
            weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
        else:
            raise KeyError(f"{parallel} is not supported! Only support ('data', 'row' and 'col')")

        self.weight = flow.nn.Parameter(
            flow.empty(
                (out_features, in_features),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),  # for pipeline parallelism placement
                sbp=weight_sbp,
            )
        )
        init_method(self.weight)
        ...

    def forward(self, x):
        ...
```
Here the user chooses how to split the Linear layer's weight matrix and the data matrix: OneFlow's SBP controls whether a matrix is split column-wise, row-wise, or some other way (model parallelism vs. data parallelism), while the Placement controls which card this LinearLayer lives on (pipeline parallelism).
So, thanks to the design of LiBai's layers and the SBP and Placement attributes that every OneFlow tensor natively carries, a model assembled by the user can implement data parallelism, model parallelism, and pipeline parallelism with very little effort, as the sketch below illustrates.
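A minimal illustration (a sketch only, assuming two GPU ranks started with `oneflow.distributed.launch`; not part of LiBai itself): the same logical weight stays replicated or gets sharded purely by its `sbp`, while `placement` decides which devices hold it.
```python
import oneflow as flow

placement = flow.placement("cuda", ranks=[0, 1])  # which devices hold the tensor

w_full = flow.randn(4, 8, placement=placement, sbp=flow.sbp.broadcast)  # replicated ("data")
w_col = flow.randn(4, 8, placement=placement, sbp=flow.sbp.split(0))    # split along dim 0 ("col")
w_row = flow.randn(4, 8, placement=placement, sbp=flow.sbp.split(1))    # split along dim 1 ("row")

print(w_col.shape)             # logical shape is still (4, 8)
print(w_col.to_local().shape)  # each rank only holds a (2, 8) shard
```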
### GLM inference demo
This section shows a convenient 4-card `model parallel + pipeline parallel` inference demo of GLM in LiBai. The models can be obtained from HuggingFace: https://huggingface.co/models?filter=glm
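One way to fetch a checkpoint locally is via the `huggingface_hub` package (a hedged sketch; the `THUDM/glm-10b` repo id and the `local_dir` argument of a recent `huggingface_hub` release are assumptions, and any other download method works just as well):
```python
from huggingface_hub import snapshot_download

# Download the GLM checkpoint files into a local directory.
snapshot_download(repo_id="THUDM/glm-10b", local_dir="/path/to/glm-10b")
```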
#### File structure of glm-10b
```
$ tree data
path/to/glm-10b
├── added_tokens.json
├── vocab.json
├── merges.txt
├── config.json
└── pytorch_model.bin
```
#### Inference
Run the following:
```bash
# Before running, set `pad_token_id=0, eos_token_id=50258, bos_token_id=50000` in glm_inference.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 demo.py
```
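For reference, that edit amounts to something like the following in `projects/GLM/configs/glm_inference.py` (a sketch: only the three token ids come from the comment above, the surrounding keys are elided):
```python
cfg = dict(
    # ... other model hyperparameters unchanged ...
    pad_token_id=0,       # glm-10b
    eos_token_id=50258,   # glm-10b
    bos_token_id=50000,   # glm-10b
)
```
The demo script itself follows: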
```python
# model parallel + pipeline parallel demo
import oneflow as flow
from projects.GLM.tokenizer.glm_tokenizer import GLMGPT2Tokenizer
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
from omegaconf import DictConfig

# Only the parallel scheme needs to be configured
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        pipeline_num_layers=2 * 24,
    )
)
dist.setup_dist_util(parallel_config)

tokenizer = GLMGPT2Tokenizer.from_pretrained("/path/to/glm-10b")
input_ids = tokenizer.encode(
    [
        "Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai."
    ],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

loader = GLMLoaderHuggerFace(GLMForConditionalGeneration, cfg, "/path/to/glm-10b")
model = loader.load()

outputs = model.generate(
    inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
    position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
    generation_attention_mask=inputs["generation_attention_mask"].to_global(
        sbp=sbp, placement=placement
    ),
    max_length=512,
)
res = tokenizer.decode(outputs[0])
if dist.is_main_process():
    print(res)
>>> [CLS] Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.<|endoftext|> <|startofpiece|> Stanford University and a co-founder of <|endofpiece|>
```
#### File structure of glm-10b-chinese
```
$ tree data
path/to/glm-10b-chinese
├── added_tokens.json
├── cog-pretrain.model
├── config.json
└── pytorch_model.bin
```
#### Inference
Run the following:
```bash
# Before running, set `pad_token_id=50000, eos_token_id=50007, bos_token_id=None` in glm_inference.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 demo.py
```
```python
# model parallel + pipeline parallel demo
import oneflow as flow
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
from omegaconf import DictConfig

# Only the parallel scheme needs to be configured
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        pipeline_num_layers=2 * 24,
    )
)
dist.setup_dist_util(parallel_config)

tokenizer = GLMChineseTokenzier.from_pretrained("/path/to/glm-10b-chinese")
input_ids = tokenizer.encode(
    [
        "凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"
    ],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "/path/to/glm-10b-chinese",
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
model = loader.load()

outputs = model.generate(
    inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
    position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
    generation_attention_mask=inputs["generation_attention_mask"].to_global(
        sbp=sbp, placement=placement
    ),
    max_length=512,
)
res = tokenizer.decode(outputs[0])
if dist.is_main_process():
    print(res)
>>> [CLS] 凯旋门位于意大利米兰市古城堡旁1807年为纪念 [MASK] 而建,门高25米,顶上矗立两武士青铜古兵车铸像 <|endoftext|> <|startofpiece|> 拿破仑军队攻克米兰城 <|endofpiece|>
```
#### Inference with a model trained by One-GLM
Loading OneFlow-native checkpoints is just as convenient in LiBai: to run inference with a model trained by one-glm, simply replace `GLMLoaderHuggerFace` in the demos above with `GLMLoaderLiBai`, as sketched below.
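A minimal sketch of that swap (the checkpoint path is a placeholder, and `GLMLoaderLiBai` is assumed to take the same `(model, cfg, path)` arguments as `GLMLoaderHuggerFace` above):
```python
from projects.GLM.utils.glm_loader import GLMLoaderLiBai

# Point the LiBai-format loader at a checkpoint produced by one-glm / LiBai training.
loader = GLMLoaderLiBai(GLMForConditionalGeneration, cfg, "/path/to/one_glm_checkpoint")
model = loader.load()
```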