"docs/vscode:/vscode.git/clone" did not exist on "47b334642261926618078c1e4e043cd0ec21fae7"
Commit 0b0ad26a authored by yuguo960516's avatar yuguo960516
Browse files

glm

parent f27ee404
@@ -161,6 +161,45 @@ tokenizer.decode: 0.0698804759979248秒
[CLS] 冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] <|endoftext|> <|startofpiece|> 避寒,当然是去海南呀!<n><n>海南的冬天,阳光明媚,温度适宜,而且空气清新,没有雾霾,没有沙尘暴,没有雾霾,没有雾霾!<n><n>海南的冬天,阳光明媚,温度适宜,而且空气清新,没有雾霾,没有沙尘暴,没有雾霾!<n><n>海南的冬天,阳光明媚,温度适宜,而且空气清新,没有雾霾,没有沙尘暴,没有雾霾!
```
### Q&A example
Uses 1 node with 4 DCU-Z100-16G cards and a tp=2, pp=2 parallel configuration.
Run the following:
```bash
cd projects/GLM
python3 -m oneflow.distributed.launch --nproc_per_node 4 glm-QA.py
```
Once the program is running, the user can ask questions interactively on the command line; entering "退出" (exit) ends the program, as shown below:
```
输入
> 如何改善睡眠质量
正在生成内容...
> [CLS] 如何改善睡眠质量 回答: [gMASK] <|endoftext|> <|startofpiece|> 睡眠不好,可以试着用以下方法改善: 1、睡前不要喝咖啡、浓茶、吸烟等,也不要喝含咖啡因的饮料,如可乐、咖啡、茶等。 2、睡前不要进行剧烈运动,如剧烈的跑步、跳舞、打球等。 3、睡前不要看刺激性的电视节目,如恐怖电影、凶杀片等。 4、睡前不要思考问题,如回忆今天发生的事情、明天的工作计划等。 5、睡前不要进食,如吃得过饱、过晚、过饱等。 6、睡前不要进行剧烈的体力活动,如跑步、打球、游泳
输入:
> 从北京到郑州有多少公里
正在生成内容...
> [CLS] 从北京到郑州有多少公里 回答: [gMASK] <|endoftext|> <|startofpiece|> 北京到郑州,直线距离约1000公里,开车需要大约12个小时。 <|endofpiece|>
输入:
> 推荐一部高分恐怖电影
正在生成内容...
> [CLS] 推荐一部高分恐怖电影 回答: [gMASK] <|endoftext|> <|startofpiece|> 《恐怖游轮》<n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n><n>
输入:
> 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者
正在生成内容...
> [CLS] 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] <|endoftext|> <|startofpiece|> 避寒,当然是去海南呀,海南的冬天,阳光明媚,温度适宜,而且海南的冬天,没有雾霾,没有沙尘暴,没有寒冷,只有温暖,海南的冬天,是避寒的好地方。 <|endofpiece|>
输入:
> 介绍一下中科曙光公司
正在生成内容...
> [CLS] 介绍一下中科曙光公司 回答: [gMASK] <|endoftext|> <|startofpiece|> 中科曙光公司成立于2000年,是中国科学院计算技术研究所控股的高科技公司,是国家首批创新型企业,是国家规划布局内重点软件企业,是国家863计划成果产业化基地,是国家高技术产业化示范工程,是国家火炬计划重点高新技术企业,是国家创新型企业试点单位,是国家集成电路设计产业化基地,是国家信息安全成果产业化基地,是国家863计划成果产业化基地,是国家集成电路设计产业化基地,是国家信息安全成果产业化基地,是国家火炬计划重点高新技术企业,是国家创新型企业试点单位,是国家集成电路设计产业化基地,是国家信息安全成果产业化基地,是国家863计划成果产业化基地
输入:
> 退出
> 再见
```
## Performance data
Accelerator cards used: 4 × DCU-Z100-16G:
......
@@ -64,7 +64,8 @@ class QuickGELU(nn.Module):
         super().__init__()
     def forward(self, x: flow.Tensor) -> flow.Tensor:
-        return x * flow.sigmoid(1.702 * x)
+        """QuickGELU is estimated with: x * flow.sigmoid(1.702 * x)"""
+        return flow._C.quick_gelu(x)
 def build_activation(activation: Optional[Activation]):
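As a quick sanity check that the fused kernel matches the previous formula, one could run a snippet like the following (a sketch, not part of the commit; it assumes a OneFlow build that exposes `flow._C.quick_gelu`, as the new line above does):
```python
import oneflow as flow

# Compare the fused QuickGELU kernel against the explicit sigmoid approximation.
x = flow.randn(4, 8)
assert flow.allclose(flow._C.quick_gelu(x), x * flow.sigmoid(1.702 * x), atol=1e-5)
```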
......
@@ -70,6 +70,7 @@ class LayerNorm(nn.Module):
             self.bias = None
     def forward(self, x):
+        x = x.to_global(placement=self.weight.placement)
         assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
         begin_norm_axis = x.ndim - len(self.normalized_shape)
         begin_params_axis = x.ndim - len(self.normalized_shape)
@@ -126,4 +127,5 @@ class RMSLayerNorm(nn.Module):
         self.l2norm_epsilon = eps
     def forward(self, hidden_states):
+        hidden_states = hidden_states.to_global(placement=self.weight.placement)
         return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon)
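The two `to_global` calls added above move the incoming activation onto the placement of the layer's own weights before the op runs. For context, a minimal illustrative sketch (not from the commit; it assumes a LiBai pipeline-parallel setup with at least two stages):
```python
import oneflow as flow
from libai.utils import distributed as dist

# An activation produced on the first pipeline stage ...
x = flow.ones(
    2, 4,
    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
    placement=dist.get_layer_placement(0),
)
# ... is transferred to the stage that holds this layer's parameters before computing.
x = x.to_global(placement=dist.get_layer_placement(-1))  # -1: last pipeline stage
```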
@@ -9,17 +9,17 @@ cfg = dict(
     hidden_size=4096,
     num_attention_heads=64,
     max_sequence_length=1024,
-    embedding_dropout_prob=0.1,
-    attention_dropout_prob=0.1,
-    output_dropout_prob=0.1,
+    embedding_dropout_prob=0.0,
+    attention_dropout_prob=0.0,
+    output_dropout_prob=0.0,
     layernorm_epsilon=1e-5,
     initializer_range=0.02,
     use_scaled_init_for_output_weights=True,
     bias_gelu_fusion=True,
-    bias_dropout_fusion=False,
+    bias_dropout_fusion=True,
     scale_mask_softmax_fusion=False,
     apply_query_key_layer_scaling=False,
-    amp_enabled=False,
+    amp_enabled=True,
     block_position_encoding=True,
     attention_scale=1.0,
     padding_idx=None,
......
# model parallel + pipeline parallel demo
import oneflow as flow
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
from omegaconf import DictConfig
import time

# Only the parallel scheme needs to be configured
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        pipeline_num_layers=2 * 24,
    )
)
dist.setup_dist_util(parallel_config)

tokenizer = GLMChineseTokenzier.from_pretrained("glm-10b-chinese")
sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "glm-10b-chinese",
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
if dist.is_main_process():
    print("请稍等,正在加载模型中...")
model = loader.load()

question = ""
while True:
    if dist.is_main_process():
        print("输入:")
        question = input("> ")
    else:
        question = None
    question = dist.broadcast_py_object(question, src=0)
    dist.synchronize()
    # print(question)
    if question.lower() == "退出":
        break
    input_ids = tokenizer.encode(
        [question + " 回答: [gMASK]"],
        return_tensors="of",
    )
    inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=128)
    if dist.is_main_process():
        print("正在生成内容...")
    # start_t = time.time()
    outputs = model.generate(
        inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
        position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
        generation_attention_mask=inputs["generation_attention_mask"].to_global(
            sbp=sbp, placement=placement
        ),
        max_length=128,
    )
    # end_t = time.time()
    # if dist.is_main_process():
    #     print("model.generate: %s秒" % (end_t - start_t))
    res = tokenizer.decode(outputs[0])
    if dist.is_main_process():
        print("> " + res)

if dist.is_main_process():
    print("> 再见")
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
tokenizer = GLMChineseTokenzier.from_pretrained("/data/home/xiezipeng/glm-10b-chinese")
input_ids = tokenizer.encode(
    ["西游记的作者是[MASK]。"],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size(), dtype=flow.bool)}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

dist.set_device_type("cpu")
loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "/data/home/xiezipeng/glm-10b-chinese",
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
model = loader.load()
model = model.half().cuda()
model.eval()
dist.set_device_type("cuda")

while True:
    outputs = model.generate(
        inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
        position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
        generation_attention_mask=inputs["generation_attention_mask"].to_global(
            sbp=sbp, placement=placement
        ),
        max_length=512,
    )
    res = tokenizer.decode(outputs[0])
    if dist.is_main_process():
        print(res)
@@ -41,6 +41,7 @@ class MultiheadAttention(nn.Module):
         super().__init__()
         self.hidden_size = hidden_size
         self.attention_scale = attention_scale
+        self.num_attention_heads = num_attention_heads
         if output_layer_init_method is None:
             output_layer_init_method = init_method
......
# GLM
In 2017, Google introduced the Transformer architecture, and pre-trained models such as BERT, GPT, and T5 have kept setting new SOTA records on task after task ever since. Last year, Tsinghua University proposed the GLM model (https://github.com/THUDM/GLM). Unlike the pre-trained architectures above, it uses an autoregressive blank-infilling objective and achieves strong results on the three main classes of NLP tasks: natural language understanding, unconditional generation, and conditional generation.
LiBai mainly implements the inference part of GLM; for training-related material, see:
- [GLM国产大模型训练加速:性能最高提升3倍,显存节省1/3,低成本上手](https://mp.weixin.qq.com/s/dkTGXuJV38KuLb4_LmM20Q)
- https://github.com/Oneflow-Inc/one-glm
## GLM-Inference
When a model grows so large that its parameters no longer fit on a single GPU, convenient distributed training and inference become a necessity, and the industry has produced tools for it accordingly.
LiBai, a model library built on OneFlow, brings the barrier to distributed execution down to a minimum: users do not have to think about how the model is partitioned across devices, and can switch between distributed strategies by changing just a few configuration values, while still getting excellent acceleration.
GLM built with LiBai supports model parallel + pipeline parallel inference out of the box, which nicely solves the problem of a large model not fitting on a single card.
So how does a user build the distributed inference part of GLM with LiBai, the large-scale model training and inference library? The small example below explains it.
### Distributed inference has a natural advantage
Keep in mind that a model's parameters are really just tensors, i.e. matrices; a large model's parameters are large matrices, and a parallel strategy splits those large matrices into smaller ones and assigns them to different cards or devices. The basic LinearLayer is implemented in LiBai as follows:
```python
class Linear1D(nn.Module):
    def __init__(self, in_features, out_features, parallel="data", layer_idx=0, ...):
        super().__init__()
        if parallel == "col":
            weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
        elif parallel == "row":
            weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
        elif parallel == "data":
            weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
        else:
            raise KeyError(f"{parallel} is not supported! Only support ('data', 'row' and 'col')")

        self.weight = flow.nn.Parameter(
            flow.empty(
                (out_features, in_features),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),  # for pipeline parallelism placement
                sbp=weight_sbp,
            )
        )
        init_method(self.weight)
        ...

    def forward(self, x):
        ...
```
Here the user chooses how to split the Linear layer's weight matrix and the data matrix: OneFlow's SBP controls whether a matrix is split column-wise, row-wise, or some other way (model parallelism vs. data parallelism), while the Placement controls which card this LinearLayer lives on (pipeline parallelism).
So, thanks to the design of LiBai's layers and the SBP and Placement attributes that every OneFlow tensor natively carries, a model assembled by the user can implement data parallelism, model parallelism, and pipeline parallelism with very little effort, as the sketch below illustrates.
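A minimal illustration (a sketch only, assuming two GPU ranks started with `oneflow.distributed.launch`; not part of LiBai itself): the same logical weight stays replicated or gets sharded purely by its `sbp`, while `placement` decides which devices hold it.
```python
import oneflow as flow

placement = flow.placement("cuda", ranks=[0, 1])  # which devices hold the tensor

w_full = flow.randn(4, 8, placement=placement, sbp=flow.sbp.broadcast)  # replicated ("data")
w_col = flow.randn(4, 8, placement=placement, sbp=flow.sbp.split(0))    # split along dim 0 ("col")
w_row = flow.randn(4, 8, placement=placement, sbp=flow.sbp.split(1))    # split along dim 1 ("row")

print(w_col.shape)             # logical shape is still (4, 8)
print(w_col.to_local().shape)  # each rank only holds a (2, 8) shard
```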
### GLM inference demo
This section shows a convenient 4-card `model parallel + pipeline parallel` inference demo of GLM in LiBai. The models can be obtained from HuggingFace: https://huggingface.co/models?filter=glm
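One way to fetch a checkpoint locally is via the `huggingface_hub` package (a hedged sketch; the `THUDM/glm-10b` repo id and the `local_dir` argument of a recent `huggingface_hub` release are assumptions, and any other download method works just as well):
```python
from huggingface_hub import snapshot_download

# Download the GLM checkpoint files into a local directory.
snapshot_download(repo_id="THUDM/glm-10b", local_dir="/path/to/glm-10b")
```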
#### File structure of glm-10b
```
$ tree data
path/to/glm-10b
├── added_tokens.json
├── vocab.json
├── merges.txt
├── config.json
└── pytorch_model.bin
```
#### Inference
Run the following:
```bash
# Before running, set `pad_token_id=0, eos_token_id=50258, bos_token_id=50000` in glm_inference.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 demo.py
```
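For reference, that edit amounts to something like the following in `projects/GLM/configs/glm_inference.py` (a sketch: only the three token ids come from the comment above, the surrounding keys are elided):
```python
cfg = dict(
    # ... other model hyperparameters unchanged ...
    pad_token_id=0,       # glm-10b
    eos_token_id=50258,   # glm-10b
    bos_token_id=50000,   # glm-10b
)
```
The demo script itself follows: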
```python
# model parallel + pipeline parallel demo
import oneflow as flow
from projects.GLM.tokenizer.glm_tokenizer import GLMGPT2Tokenizer
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
from omegaconf import DictConfig

# Only the parallel scheme needs to be configured
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        pipeline_num_layers=2 * 24,
    )
)
dist.setup_dist_util(parallel_config)

tokenizer = GLMGPT2Tokenizer.from_pretrained("/path/to/glm-10b")
input_ids = tokenizer.encode(
    [
        "Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai."
    ],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

loader = GLMLoaderHuggerFace(GLMForConditionalGeneration, cfg, "/path/to/glm-10b")
model = loader.load()

outputs = model.generate(
    inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
    position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
    generation_attention_mask=inputs["generation_attention_mask"].to_global(
        sbp=sbp, placement=placement
    ),
    max_length=512,
)
res = tokenizer.decode(outputs[0])
if dist.is_main_process():
    print(res)
>>> [CLS] Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.<|endoftext|> <|startofpiece|> Stanford University and a co-founder of <|endofpiece|>
```
#### File structure of glm-10b-chinese
```
$ tree data
path/to/glm-10b-chinese
├── added_tokens.json
├── cog-pretrain.model
├── config.json
└── pytorch_model.bin
```
#### Inference
Run the following:
```bash
# Before running, set `pad_token_id=50000, eos_token_id=50007, bos_token_id=None` in glm_inference.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 demo.py
```
```python
# model parallel + pipeline parallel demo
import oneflow as flow
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace
from omegaconf import DictConfig

# Only the parallel scheme needs to be configured
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
        pipeline_num_layers=2 * 24,
    )
)
dist.setup_dist_util(parallel_config)

tokenizer = GLMChineseTokenzier.from_pretrained("/path/to/glm-10b-chinese")
input_ids = tokenizer.encode(
    [
        "凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"
    ],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "/path/to/glm-10b-chinese",
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
model = loader.load()

outputs = model.generate(
    inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
    position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
    generation_attention_mask=inputs["generation_attention_mask"].to_global(
        sbp=sbp, placement=placement
    ),
    max_length=512,
)
res = tokenizer.decode(outputs[0])
if dist.is_main_process():
    print(res)
>>> [CLS] 凯旋门位于意大利米兰市古城堡旁1807年为纪念 [MASK] 而建,门高25米,顶上矗立两武士青铜古兵车铸像 <|endoftext|> <|startofpiece|> 拿破仑军队攻克米兰城 <|endofpiece|>
```
#### Inference with a model trained by One-GLM
Loading OneFlow-native checkpoints is just as convenient in LiBai: to run inference with a model trained by one-glm, simply replace `GLMLoaderHuggerFace` in the demos above with `GLMLoaderLiBai`, as sketched below.
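A minimal sketch of that swap (the checkpoint path is a placeholder, and `GLMLoaderLiBai` is assumed to take the same `(model, cfg, path)` arguments as `GLMLoaderHuggerFace` above):
```python
from projects.GLM.utils.glm_loader import GLMLoaderLiBai

# Point the LiBai-format loader at a checkpoint produced by one-glm / LiBai training.
loader = GLMLoaderLiBai(GLMForConditionalGeneration, cfg, "/path/to/one_glm_checkpoint")
model = loader.load()
```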