Commit 6ff004ca authored by chenzk
Browse files

v1.2.5

parent f6156c58
Pipeline #1702 failed with stage
...@@ -22,8 +22,8 @@ mv megatron-deepspeed-vit_pytorch megatron-deepspeed-vit # 去框架名后缀 ...@@ -22,8 +22,8 @@ mv megatron-deepspeed-vit_pytorch megatron-deepspeed-vit # 去框架名后缀
``` ```
### Docker(方法一) ### Docker(方法一)
``` ```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py38-latest docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# <your IMAGE ID>用以上拉取的docker的镜像ID替换 # <your IMAGE ID>用以上拉取的docker的镜像ID a4dd5be0ca23替换
docker run --shm-size 10g --network=host --name=megatron --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/megatron-deepspeed-vit:/home/megatron-deepspeed-vit -it <your IMAGE ID> bash docker run --shm-size 10g --network=host --name=megatron --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/megatron-deepspeed-vit:/home/megatron-deepspeed-vit -it <your IMAGE ID> bash
pip install -r requirements.txt pip install -r requirements.txt
``` ```
...@@ -38,13 +38,13 @@ docker run --rm --shm-size 10g --network=host --name=megatron --privileged --dev ...@@ -38,13 +38,13 @@ docker run --rm --shm-size 10g --network=host --name=megatron --privileged --dev
1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装: 1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装:
https://developer.hpccube.com/tool/ https://developer.hpccube.com/tool/
``` ```
DTK驱动:dtk23.04 DTK驱动:dtk24.04.1
python:python3.8 python:python3.10
torch:1.10.0 torch:2.1.0
torchvision:0.10.0 torchvision:0.16.0
torchaudio:0.10.0 torchaudio:2.1.2
deepspeed:0.9.2 deepspeed:0.12.3
apex:0.1 apex:1.1.0
``` ```
`Tips:以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应` `Tips:以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应`
...@@ -92,21 +92,13 @@ data ...@@ -92,21 +92,13 @@ data
``` ```
cd megatron-deepspeed-vit cd megatron-deepspeed-vit
sh examples/dspvit_1node.sh sh examples/dspvit_1node.sh
# 训练过程中报:Message: 'is_pipe_partitioned= False',不影响训练,为deepspeed本身bug,如需要屏蔽可参照deepspeed github官网issue进行源码修改来解决。
``` ```
### 单机单卡 ### 单机单卡
``` ```
sh examples/dspvit_1dcu.sh sh examples/dspvit_1dcu.sh
``` ```
## 推理
方法类似以上训练步骤,只需传参时在[`dspvit_1node.sh`](./examples/dspvit_1node.sh)中额外添加以下两个参数:
```
--eval-only True \
--do_test True \
```
### 单机多卡
```
sh examples/dspvit_1node.sh
```
## result ## result
<div align=center> <div align=center>
<img src="./doc/classify.png"/> <img src="./doc/classify.png"/>
......
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py38-latest FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
# RUN yum update && yum install -y git cmake wget build-essential # RUN yum update && yum install -y git cmake wget build-essential
RUN source /opt/dtk-23.04/env.sh RUN source /opt/dtk-23.04/env.sh
......
docker run -it --shm-size=64G -v $PWD/megatron-deepspeed-vit:/home/megatron-deepspeed-vit -v /public/DL_DATA/AI:/home/AI -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=//dev/dri/ --group-add video --name vit2 a4dd5be0ca23 bash # python -m torch.utils.collect_env
...@@ -31,7 +31,6 @@ deepspeed --num_gpus 1 pretrain_vit.py \ ...@@ -31,7 +31,6 @@ deepspeed --num_gpus 1 pretrain_vit.py \
--weight-decay 1e-2 \ --weight-decay 1e-2 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--lr-warmup-fraction .01 \ --lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \ --log-interval 100 \
--save-interval 10000 \ --save-interval 10000 \
--eval-interval 1000 \ --eval-interval 1000 \
...@@ -42,3 +41,6 @@ deepspeed --num_gpus 1 pretrain_vit.py \ ...@@ -42,3 +41,6 @@ deepspeed --num_gpus 1 pretrain_vit.py \
--world_size ${WORLD_SIZE} \ --world_size ${WORLD_SIZE} \
--deepspeed \ --deepspeed \
--deepspeed_config $DS_CONFIG \ --deepspeed_config $DS_CONFIG \
...@@ -31,7 +31,6 @@ deepspeed --num_gpus 4 pretrain_vit.py \ ...@@ -31,7 +31,6 @@ deepspeed --num_gpus 4 pretrain_vit.py \
--weight-decay 1e-2 \ --weight-decay 1e-2 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--lr-warmup-fraction .01 \ --lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \ --log-interval 100 \
--save-interval 10000 \ --save-interval 10000 \
--eval-interval 1000 \ --eval-interval 1000 \
...@@ -40,7 +39,5 @@ deepspeed --num_gpus 4 pretrain_vit.py \ ...@@ -40,7 +39,5 @@ deepspeed --num_gpus 4 pretrain_vit.py \
--padded_vocab_size 224\ --padded_vocab_size 224\
--deepspeed \ --deepspeed \
--deepspeed_config $DS_CONFIG \ --deepspeed_config $DS_CONFIG \
# --eval-only True \ # --eval-only True \
# --do_test True \ # --do_test True \
wget https://github.com/git-lfs/git-lfs/releases/download/v3.5.1/git-lfs-linux-amd64-v3.5.1.tar.gz
tar -xzvf git-lfs-linux-amd64-v3.5.1.tar.gz
./git-lfs-3.5.1/install.sh
rm -rf git-lfs-3.5.1 git-lfs-linux-amd64-v3.5.1.tar.gz
...@@ -211,7 +211,7 @@ def _compile_dependencies(): ...@@ -211,7 +211,7 @@ def _compile_dependencies():
if torch.distributed.get_rank() == 0: if torch.distributed.get_rank() == 0:
start_time = time.time() start_time = time.time()
print('> compiling and loading fused kernels ...', flush=True) print('> compiling and loading fused kernels ...', flush=True)
fused_kernels.load(args) #fused_kernels.load(args)
torch.distributed.barrier() torch.distributed.barrier()
else: else:
torch.distributed.barrier() torch.distributed.barrier()
...@@ -219,7 +219,7 @@ def _compile_dependencies(): ...@@ -219,7 +219,7 @@ def _compile_dependencies():
with warnings.catch_warnings(): with warnings.catch_warnings():
# ignore loading noise # ignore loading noise
warnings.simplefilter("ignore") warnings.simplefilter("ignore")
fused_kernels.load(args) #fused_kernels.load(args)
# Simple barrier to make sure all ranks have passed the # Simple barrier to make sure all ranks have passed the
# compilation phase successfully before moving on to the # compilation phase successfully before moving on to the
......
...@@ -29,52 +29,17 @@ from torch.nn.parameter import Parameter ...@@ -29,52 +29,17 @@ from torch.nn.parameter import Parameter
import importlib import importlib
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
global fused_mix_prec_layer_norm_cuda global fused_layer_norm_cuda
fused_mix_prec_layer_norm_cuda = None fused_layer_norm_cuda = None
class FusedLayerNormAffineFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, input, weight, bias, normalized_shape, eps):
ctx.normalized_shape = normalized_shape
ctx.eps = eps
input_ = input.contiguous()
weight_ = weight.contiguous()
bias_ = bias.contiguous()
output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
return output
@staticmethod
def backward(ctx, grad_output):
input_, weight_, bias_, mean, invvar = ctx.saved_tensors
grad_input = grad_weight = grad_bias = None
grad_input, grad_weight, grad_bias \
= fused_mix_prec_layer_norm_cuda.backward_affine(
grad_output.contiguous(), mean, invvar,
input_, ctx.normalized_shape,
weight_, bias_, ctx.eps)
return grad_input, grad_weight, grad_bias, None, None
class MixedFusedLayerNorm(torch.nn.Module): class MixedFusedLayerNorm(torch.nn.Module):
def __init__(self, normalized_shape, eps=1e-5): def __init__(self, normalized_shape, eps=1e-5):
super(MixedFusedLayerNorm, self).__init__() super(MixedFusedLayerNorm, self).__init__()
global fused_mix_prec_layer_norm_cuda global fused_layer_norm_cuda
fused_mix_prec_layer_norm_cuda = importlib.import_module( fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
"fused_mix_prec_layer_norm_cuda")
if isinstance(normalized_shape, numbers.Integral): if isinstance(normalized_shape, numbers.Integral):
normalized_shape = (normalized_shape,) normalized_shape = (normalized_shape,)
self.normalized_shape = torch.Size(normalized_shape) self.normalized_shape = torch.Size(normalized_shape)
......
...@@ -16,7 +16,8 @@ ...@@ -16,7 +16,8 @@
"""Gradient clipping.""" """Gradient clipping."""
import torch import torch
from torch._six import inf #from torch._six import inf
from torch import inf
from apex.multi_tensor_apply import multi_tensor_applier from apex.multi_tensor_apply import multi_tensor_applier
import amp_C import amp_C
......
...@@ -208,15 +208,17 @@ def pretrain(train_valid_test_dataset_provider, ...@@ -208,15 +208,17 @@ def pretrain(train_valid_test_dataset_provider,
if args.save and iteration != 0: if args.save and iteration != 0:
save_checkpoint(iteration, model, optimizer, lr_scheduler) save_checkpoint(iteration, model, optimizer, lr_scheduler)
'''
if args.do_test: if args.do_test:
# Run on test data. # Run on test data.
names = args.test_weighted_split_names names = args.test_weighted_split_names
names = names if names is not None else ['test'] * len(test_data_iterator) names = names if names is not None else ['test'] * len(test_data_iterator)
for iterator, name in zip(test_data_iterator, names): test(forward_step_func, model, iteration, verbose=False)
test(forward_step_func, iterator, model, verbose=False) print('the end of training for test data')
# for iterator, name in zip(test_data_iterator, names):
# test(forward_step_func, model, iteration, verbose=False)
'''
codecarbon_tracker_stop() codecarbon_tracker_stop()
...@@ -1115,14 +1117,13 @@ class Testdataset(Dataset): ...@@ -1115,14 +1117,13 @@ class Testdataset(Dataset):
pil_img = Image.open(img_path).convert('RGB') pil_img = Image.open(img_path).convert('RGB')
data = self.transforms(pil_img) data = self.transforms(pil_img)
return data, label, data_path return data, label
def __len__(self): def __len__(self):
return len(self.imgs) return len(self.imgs)
def test(forward_step_func, model, iteration, verbose=False):
def test(forward_step_func, data_iterator, model, verbose=False):
"""Test.""" """Test."""
args = get_args() args = get_args()
test_data_path = os.path.join(args.data_path[0], "test/images") test_data_path = os.path.join(args.data_path[0], "test/images")
...@@ -1131,6 +1132,10 @@ def test(forward_step_func, data_iterator, model, verbose=False): ...@@ -1131,6 +1132,10 @@ def test(forward_step_func, data_iterator, model, verbose=False):
test_dataset = Testdataset(img_paths=img_paths) test_dataset = Testdataset(img_paths=img_paths)
data_iterator = iter(DataLoader(test_dataset, batch_size=1, shuffle=False)) data_iterator = iter(DataLoader(test_dataset, batch_size=1, shuffle=False))
#for data in data_iterator:
# print(data)
# Turn on evaluation mode which disables dropout. # Turn on evaluation mode which disables dropout.
for model_module in model: for model_module in model:
model_module.eval() model_module.eval()
...@@ -1149,10 +1154,12 @@ def test(forward_step_func, data_iterator, model, verbose=False): ...@@ -1149,10 +1154,12 @@ def test(forward_step_func, data_iterator, model, verbose=False):
with torch.no_grad(): with torch.no_grad():
iteration = 0 iteration = 0
while iteration < len(img_paths):#test images num:len(img_paths) while iteration < len(img_paths):#test images num:len(img_paths)
iteration += 1 # while True:
# print("test image: ", img_paths, img_paths[iteration])
if verbose and iteration % args.log_interval == 0: if verbose and iteration % args.log_interval == 0:
print_rank_0('Evaluating iter {}/{}'.format(iteration, args.eval_iters)) print_rank_0('Evaluating iter {}/{}'.format(iteration, args.eval_iters))
'''
if mpu.get_pipeline_model_parallel_world_size() > 1: if mpu.get_pipeline_model_parallel_world_size() > 1:
if args.virtual_pipeline_model_parallel_size is not None: if args.virtual_pipeline_model_parallel_size is not None:
forward_backward_func = forward_backward_pipelining_with_interleaving forward_backward_func = forward_backward_pipelining_with_interleaving
...@@ -1160,26 +1167,23 @@ def test(forward_step_func, data_iterator, model, verbose=False): ...@@ -1160,26 +1167,23 @@ def test(forward_step_func, data_iterator, model, verbose=False):
forward_backward_func = forward_backward_pipelining_without_interleaving forward_backward_func = forward_backward_pipelining_without_interleaving
else: else:
forward_backward_func = forward_backward_no_pipelining forward_backward_func = forward_backward_no_pipelining
'''
if args.deepspeed: if args.deepspeed:
# DeepSpeed uses eval_batch() and already aggregates losses. # DeepSpeed uses eval_batch() and already aggregates losses.
assert isinstance(model, list) and len(model) == 1 assert isinstance(model, list) and len(model) == 1
data_path = next(data_iterator)[2][0]
logits = model[0].eval_batch(data_iterator, compute_loss = False, reduce_output = None) logits = model[0].eval_batch(data_iterator, compute_loss = False, reduce_output = None)
logits = torch.cat(logits, 0) logits = torch.cat(logits, 0)
outputs = torch.argmax(logits, -1)[0] outputs = torch.argmax(logits, -1)[0]
if args.rank == 0: if args.rank == 0:
print(data_path,': ',outputs.cpu().numpy()) print(img_paths[iteration],': ',outputs.cpu().numpy())
else: else:
data = next(data_iterator) data = next(data_iterator)
data_path = data[2][0]
images = data[0].cuda() images = data[0].cuda()
logits = model[0](images).contiguous().float() logits = model[0](images).contiguous().float()
outputs = torch.argmax(logits, -1)[0] outputs = torch.argmax(logits, -1)[0]
if args.rank == 0: if args.rank == 0:
print(data_path,': ',outputs.cpu().numpy()) print(img_paths[iteration],': ',outputs.cpu().numpy())
iteration += 1
print('the end of training for test data')
def evaluate_and_print_results(prefix, forward_step_func, def evaluate_and_print_results(prefix, forward_step_func,
......
This diff is collapsed.
...@@ -80,8 +80,12 @@ def get_batch(data_iterator): ...@@ -80,8 +80,12 @@ def get_batch(data_iterator):
return images, labels return images, labels
def get_batch_pipe(data): def get_batch_pipe(data):
images = data[0].cuda() if len(data) == 1:# for test
labels = data[1].cuda() images = data[0].cuda()
labels = torch.tensor(int(0)).cuda()
else:
images = data[0].cuda()
labels = data[1].cuda()
return (images), (labels) return (images), (labels)
def forward_step(data_iterator, model): def forward_step(data_iterator, model):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment