v1.2.5

6ff004ca · chenzk · f6156c58 · 6ff004ca · 6ff004ca · 6ff004ca
Commit 6ff004ca authored Sep 18, 2024 by chenzk
13 changed files
--- a/README.md
+++ b/README.md
@@ -22,8 +22,8 @@ mv megatron-deepspeed-vit_pytorch megatron-deepspeed-vit # 去框架名后缀
 ```
 ### Docker（方法一）
 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py38-latest
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
-# <your IMAGE ID>用以上拉取的docker的镜像ID替换
+# 将 <your IMAGE ID> 替换为以上拉取的 docker 镜像的 ID（例如 a4dd5be0ca23）
 docker run --shm-size 10g --network=host --name=megatron --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/megatron-deepspeed-vit:/home/megatron-deepspeed-vit -it <your IMAGE ID> bash
 pip install -r requirements.txt
 ```
@@ -38,13 +38,13 @@ docker run --rm --shm-size 10g --network=host --name=megatron --privileged --dev
 1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装：
 https://developer.hpccube.com/tool/
 ```
-DTK驱动：dtk23.04
+DTK驱动：dtk24.04.1
-python：python3.8
+python：python3.10
-torch:1.10.0
+torch:2.1.0
-torchvision:0.10.0
+torchvision:0.16.0
-torchaudio:0.10.0
+torchaudio:2.1.2
-deepspeed:0.9.2
+deepspeed:0.12.3
-apex:0.1
+apex:1.1.0
 ```
 `Tips：以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应`
@@ -92,21 +92,13 @@ data
 ```
 cd megatron-deepspeed-vit
 sh examples/dspvit_1node.sh
+# 训练过程中若出现日志 Message: 'is_pipe_partitioned= False'，不影响训练；这是 deepspeed 自身的 bug，如需屏蔽，可参照 deepspeed GitHub 仓库中的相关 issue 修改源码解决。
 ```
 ### 单机单卡
 ```
 sh examples/dspvit_1dcu.sh
 ```
-## 推理
-方法类似以上训练步骤，只需传参时在[`dspvit_1node.sh`](./examples/dspvit_1node.sh)中额外添加以下两个参数：
-```
--eval-only True \
--do_test True \
-```
-### 单机多卡
-```
-sh examples/dspvit_1node.sh
-```
 ## result
 <div align=center>
    <img src="./doc/classify.png"/>

--- a/data/test/images/ILSVRC2012_val_0001250388.JPEG
+++ b/data/test/images/ILSVRC2012_val_0001250388.JPEG
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
-FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py38-latest
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
 ENV DEBIAN_FRONTEND=noninteractive
 # RUN yum update && yum install -y git cmake wget build-essential
 RUN source /opt/dtk-23.04/env.sh

--- a/docker_start.sh
+++ b/docker_start.sh
+docker run -it --shm-size=64G -v $PWD/megatron-deepspeed-vit:/home/megatron-deepspeed-vit -v /public/DL_DATA/AI:/home/AI -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=//dev/dri/ --group-add video --name vit2 a4dd5be0ca23 bash   # python -m torch.utils.collect_env
--- a/examples/dspvit_1dcu.sh
+++ b/examples/dspvit_1dcu.sh
@@ -31,7 +31,6 @@ deepspeed --num_gpus 1 pretrain_vit.py \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
-       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
@@ -42,3 +41,6 @@ deepspeed --num_gpus 1 pretrain_vit.py \
       --world_size ${WORLD_SIZE} \
       --deepspeed \
       --deepspeed_config $DS_CONFIG \
--- a/examples/dspvit_1node.sh
+++ b/examples/dspvit_1node.sh
@@ -31,7 +31,6 @@ deepspeed --num_gpus 4 pretrain_vit.py \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
-       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
@@ -40,7 +39,5 @@ deepspeed --num_gpus 4 pretrain_vit.py \
       --padded_vocab_size 224\
       --deepspeed \
       --deepspeed_config $DS_CONFIG \
 # --eval-only True \
 # --do_test True \
--- a/git-lfs_install.sh
+++ b/git-lfs_install.sh
+wget https://github.com/git-lfs/git-lfs/releases/download/v3.5.1/git-lfs-linux-amd64-v3.5.1.tar.gz
+tar -xzvf git-lfs-linux-amd64-v3.5.1.tar.gz
+./git-lfs-3.5.1/install.sh
+rm -rf git-lfs-3.5.1 git-lfs-linux-amd64-v3.5.1.tar.gz
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -211,7 +211,7 @@ def _compile_dependencies():
    if torch.distributed.get_rank() == 0:
        start_time = time.time()
        print('> compiling and loading fused kernels ...', flush=True)
-        fused_kernels.load(args)
+        #fused_kernels.load(args)
        torch.distributed.barrier()
    else:
        torch.distributed.barrier()
@@ -219,7 +219,7 @@ def _compile_dependencies():
        with warnings.catch_warnings():
            # ignore loading noise
            warnings.simplefilter("ignore")
-            fused_kernels.load(args)
+            #fused_kernels.load(args)
    # Simple barrier to make sure all ranks have passed the
    # compilation phase successfully before moving on to the

--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -29,52 +29,17 @@ from torch.nn.parameter import Parameter
 import importlib
 import torch
 import torch.nn.functional as F
+from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
-global fused_mix_prec_layer_norm_cuda
+global fused_layer_norm_cuda
-fused_mix_prec_layer_norm_cuda = None
+fused_layer_norm_cuda = None
-class FusedLayerNormAffineFunction(torch.autograd.Function):
-  @staticmethod
-  def forward(ctx, input, weight, bias, normalized_shape, eps):
-    ctx.normalized_shape = normalized_shape
-    ctx.eps = eps
-    input_ = input.contiguous()
-    weight_ = weight.contiguous()
-    bias_ = bias.contiguous()
-    output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
-        input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
-    ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
-    return output
-  @staticmethod
-  def backward(ctx, grad_output):
-    input_, weight_, bias_, mean, invvar = ctx.saved_tensors
-    grad_input = grad_weight = grad_bias = None
-    grad_input, grad_weight, grad_bias \
-      = fused_mix_prec_layer_norm_cuda.backward_affine(
-        grad_output.contiguous(), mean, invvar,
-        input_, ctx.normalized_shape,
-        weight_, bias_, ctx.eps)
-    return grad_input, grad_weight, grad_bias, None, None
 class MixedFusedLayerNorm(torch.nn.Module):
  def __init__(self, normalized_shape, eps=1e-5):
    super(MixedFusedLayerNorm, self).__init__()
-    global fused_mix_prec_layer_norm_cuda
+    global fused_layer_norm_cuda
-    fused_mix_prec_layer_norm_cuda = importlib.import_module(
+    fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
-      "fused_mix_prec_layer_norm_cuda")
    if isinstance(normalized_shape, numbers.Integral):
        normalized_shape = (normalized_shape,)
    self.normalized_shape = torch.Size(normalized_shape)

--- a/megatron/optimizer/clip_grads.py
+++ b/megatron/optimizer/clip_grads.py
@@ -16,7 +16,8 @@
 """Gradient clipping."""
 import torch
-from torch._six import inf
+#from torch._six import inf
+from torch import inf
 from apex.multi_tensor_apply import multi_tensor_applier
 import amp_C

--- a/megatron/training.py
+++ b/megatron/training.py
@@ -208,15 +208,17 @@ def pretrain(train_valid_test_dataset_provider,
    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)
+    '''
    if args.do_test:
        # Run on test data.
        names = args.test_weighted_split_names
        names = names if names is not None else ['test'] * len(test_data_iterator)
-        for iterator, name in zip(test_data_iterator, names):
+        test(forward_step_func, model, iteration, verbose=False)
-            test(forward_step_func, iterator, model, verbose=False)
+        print('the end of training for test data')
+        # for iterator, name in zip(test_data_iterator, names):
+            # test(forward_step_func, model, iteration, verbose=False)
+    '''
    codecarbon_tracker_stop()
@@ -1115,14 +1117,13 @@ class Testdataset(Dataset):
        pil_img = Image.open(img_path).convert('RGB')    
        data = self.transforms(pil_img)
-        return data, label, data_path
+        return data, label
    def __len__(self):
        return len(self.imgs)
+def test(forward_step_func, model, iteration, verbose=False):
-def test(forward_step_func, data_iterator, model, verbose=False):
    """Test."""
    args = get_args()
    test_data_path = os.path.join(args.data_path[0], "test/images")
@@ -1131,6 +1132,10 @@ def test(forward_step_func, data_iterator, model, verbose=False):
    test_dataset = Testdataset(img_paths=img_paths)
    data_iterator = iter(DataLoader(test_dataset, batch_size=1, shuffle=False))
+    #for data in data_iterator:
+    #    print(data)
    # Turn on evaluation mode which disables dropout.
    for model_module in model:
        model_module.eval()
@@ -1149,10 +1154,12 @@ def test(forward_step_func, data_iterator, model, verbose=False):
    with torch.no_grad():
        iteration = 0
        while iteration < len(img_paths):#test images num:len(img_paths)
-            iteration += 1
+        # while True:
+            # print("test image: ", img_paths, img_paths[iteration])
            if verbose and iteration % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(iteration, args.eval_iters))
+            '''
            if mpu.get_pipeline_model_parallel_world_size() > 1:
                if args.virtual_pipeline_model_parallel_size is not None:
                    forward_backward_func = forward_backward_pipelining_with_interleaving
@@ -1160,26 +1167,23 @@ def test(forward_step_func, data_iterator, model, verbose=False):
                    forward_backward_func = forward_backward_pipelining_without_interleaving
            else:
                forward_backward_func = forward_backward_no_pipelining
+            '''
            if args.deepspeed:
                # DeepSpeed uses eval_batch() and already aggregates losses.
                assert isinstance(model, list) and len(model) == 1
-                data_path = next(data_iterator)[2][0]
                logits = model[0].eval_batch(data_iterator, compute_loss = False, reduce_output = None)
                logits = torch.cat(logits, 0)
                outputs = torch.argmax(logits, -1)[0]
                if args.rank == 0:
-                    print(data_path,': ',outputs.cpu().numpy())
+                    print(img_paths[iteration],': ',outputs.cpu().numpy())
            else:
                data = next(data_iterator)
-                data_path = data[2][0]
                images = data[0].cuda()
                logits = model[0](images).contiguous().float()
                outputs = torch.argmax(logits, -1)[0]
                if args.rank == 0:
-                    print(data_path,': ',outputs.cpu().numpy())
+                    print(img_paths[iteration],': ',outputs.cpu().numpy())
+            iteration += 1
-    print('the end of training for test data')
 def evaluate_and_print_results(prefix, forward_step_func,

--- a/megatron/training_bak.py
+++ b/megatron/training_bak.py
--- a/pretrain_vit.py
+++ b/pretrain_vit.py
@@ -80,8 +80,12 @@ def get_batch(data_iterator):
        return images, labels
 def get_batch_pipe(data):
-    images = data[0].cuda()
+    if len(data) == 1:# for test
-    labels = data[1].cuda()
+        images = data[0].cuda()
+        labels = torch.tensor(int(0)).cuda()
+    else:
+        images = data[0].cuda()
+        labels = data[1].cuda()
    return (images), (labels)
 def forward_step(data_iterator, model):