"tests/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "d13b0d63c0208f2c4c078c4261caf8bf587beb3b"
Unverified commit c2cb2aab, authored by Hang Zhang, committed by GitHub
Browse files

update backend for PyTorch Update (#130)

* update backend

* version

fixes https://github.com/zhanghang1989/PyTorch-Encoding/issues/123
parent a0fe6223
MIT License
Copyright (c) 2017 Hang Zhang. All rights reserved.
Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved.
Copyright (c) 2017- Hang Zhang. All rights reserved.
Copyright (c) 2018- Amazon.com, Inc. or its affiliates. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
......@@ -83,15 +83,15 @@ Test Pre-trained Model
<code xml:space="preserve" id="cmd_enc101_ade" style="display: none; text-align: left; white-space: pre-wrap">
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset ADE20K --model EncNet --aux --se-loss --backbone resnet101
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset ADE20K --model EncNet --aux --se-loss --backbone resnet101 --base-size 640 --crop-size 576
</code>
<code xml:space="preserve" id="cmd_enc101_voc" style="display: none; text-align: left; white-space: pre-wrap">
# First finetuning COCO dataset pretrained model on augmented set
# You can also train from scratch on COCO by yourself
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_aug --model-zoo EncNet_Resnet101_COCO --aux --se-loss --lr 0.001 --syncbn --ngpus 4 --checkname res101
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_aug --model-zoo EncNet_Resnet101_COCO --aux --se-loss --lr 0.001 --syncbn --ngpus 4 --checkname res101 --ft
# Finetuning on original set
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_voc --model encnet --aux --se-loss --backbone resnet101 --lr 0.0001 --syncbn --ngpus 4 --checkname res101 --resume runs/Pascal_aug/encnet/res101/checkpoint.params
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_voc --model encnet --aux --se-loss --backbone resnet101 --lr 0.0001 --syncbn --ngpus 4 --checkname res101 --resume runs/Pascal_aug/encnet/res101/checkpoint.params --ft
</code>
Quick Demo
......
......@@ -22,7 +22,7 @@ Test Pre-trained Model
cd PyTorch-Encoding/
python scripts/prepare_minc.py
- Download pre-trained model (pre-trained on train-1 split using single training size of 224, with an error rate of :math:`19.70\%` using single crop on test-1 set)::
- Download pre-trained model (pre-trained on train-1 split using single training size of 224, with an error rate of :math:`18.96\%` using single crop on test-1 set)::
cd experiments/recognition
python model/download_models.py
......
......@@ -10,4 +10,4 @@
"""An optimized PyTorch package with CUDA backend."""
from .version import __version__
from . import nn, functions, dilated, parallel, utils, models, datasets, optimizer
from . import nn, functions, dilated, parallel, utils, models, datasets
......@@ -6,6 +6,7 @@
import os
import sys
import random
import numpy as np
from tqdm import tqdm, trange
from PIL import Image, ImageOps, ImageFilter
......@@ -93,7 +94,7 @@ class CitySegmentation(BaseDataset):
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
crop_size = self.crop_size
# random scale (short edge from 480 to 720)
short_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.5))
short_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.0))
w, h = img.size
if h > w:
ow = short_size
......
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
......@@ -42,7 +43,8 @@ std::vector<at::Tensor> Non_Max_Suppression_CPU(
auto num_boxes = input.size(1);
auto batch_size = input.size(0);
auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
auto mask = torch::zeros({batch_size, num_boxes}, input.type().toScalarType(at::kByte));
//auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
mask.fill_(1);
auto *rawMask = mask.data<unsigned char>();
auto *rawIdx = sorted_inds.data<int64_t>();
......
#include <torch/tensor.h>
#include <ATen/ATen.h>
//#include <omp.h>
......@@ -404,7 +405,7 @@ at::Tensor ROIAlign_Forward_CPU(
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Output at::Tensor is (num_rois, C, pooled_height, pooled_width)
auto output = input.type().tensor({num_rois, channels, pooled_height, pooled_width});
auto output = torch::zeros({num_rois, channels, pooled_height, pooled_width}, input.options());
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
......@@ -451,7 +452,7 @@ at::Tensor ROIAlign_Backward_CPU(
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Output at::Tensor is (num_rois, C, pooled_height, pooled_width)
auto grad_in = bottom_rois.type().tensor({b_size, channels, height, width}).zero_();
auto grad_in = torch::zeros({b_size, channels, height, width}, bottom_rois.options());
AT_ASSERT(bottom_rois.is_contiguous());
......
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include <vector>
......@@ -45,8 +46,8 @@ std::vector<at::Tensor> BatchNorm_Backward_CPU(
// Allocates the per-channel accumulators used by the synchronized
// batch-norm forward pass on CPU.
//
// Returns {sum, square}: two zero-initialized tensors of shape
// {input.size(1)} (one slot per channel), created with the same
// dtype/device options as `input`. As visible here the tensors are
// returned still zeroed; presumably a caller or a fuller version of
// this function fills them — TODO confirm against the full source.
//
// NOTE: the deprecated `input.type().tensor({...}).zero_()` allocation
// pattern (removed in newer PyTorch/ATen releases) is replaced by
// `torch::zeros(..., input.options())`; the scraped diff had left both
// the old and new declarations in place, which would not compile.
std::vector<at::Tensor> Sum_Square_Forward_CPU(
    const at::Tensor input) {
    /* outputs */
    at::Tensor sum = torch::zeros({input.size(1)}, input.options());
    at::Tensor square = torch::zeros({input.size(1)}, input.options());
    return {sum, square};
}
......
#include <vector>
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
......@@ -165,7 +166,7 @@ at::Tensor Aggregate_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
......@@ -214,7 +215,7 @@ at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_) {
auto SL_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
auto SL_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
......
#include <vector>
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include <ATen/Functions.h>
#include <ATen/cuda/CUDAContext.h>
......@@ -239,7 +240,7 @@ at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor STD_) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
......@@ -301,7 +302,7 @@ std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
double eps) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
......@@ -373,7 +374,7 @@ at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor C_,
const at::Tensor STD_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
// auto IS_ = 1.0f / (S_ + eps).sqrt();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
......
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include "ATen/NativeFunctions.h"
#include <ATen/cuda/CUDAContext.h>
......@@ -75,7 +76,8 @@ std::vector<at::Tensor> Non_Max_Suppression_CUDA(
auto num_boxes = input.size(1);
auto batch_size = input.size(0);
auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
//auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
auto mask = torch::zeros({batch_size, num_boxes}, input.type().toScalarType(at::kByte));
mask.fill_(1);
//need the indices of the boxes sorted by score.
......
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
......@@ -367,7 +368,7 @@ at::Tensor ROIAlign_Forward_CUDA(
auto width = input.size(3);
// Output Tensor is (num_rois, C, pooled_height, pooled_width)
auto output = input.type().tensor({proposals, channels, pooled_height, pooled_width});
auto output = torch::zeros({proposals, channels, pooled_height, pooled_width}, input.options());
auto count = output.numel();
......@@ -414,7 +415,7 @@ at::Tensor ROIAlign_Backward_CUDA(
// Output Tensor is (num_rois, C, pooled_height, pooled_width)
// gradient wrt input features
auto grad_in = rois.type().tensor({b_size, channels, height, width}).zero_();
auto grad_in = torch::zeros({b_size, channels, height, width}, rois.options());
auto num_rois = rois.size(0);
auto count = grad_output.numel();
......
#include <vector>
#include <torch/tensor.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
......@@ -244,8 +245,8 @@ std::vector<at::Tensor> BatchNorm_Backward_CUDA(
std::vector<at::Tensor> Sum_Square_Forward_CUDA(
const at::Tensor input_) {
/* outputs */
at::Tensor sum_ = input_.type().tensor({input_.size(1)}).zero_();
at::Tensor square_ = input_.type().tensor({input_.size(1)}).zero_();
at::Tensor sum_ = torch::zeros({input_.size(1)}, input_.options());
at::Tensor square_ = torch::zeros({input_.size(1)}, input_.options());
/* cuda utils*/
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(input_.size(1));
......
......@@ -11,7 +11,7 @@ _model_sha1 = {name: checksum for checksum, name in [
('2a57e44de9c853fa015b172309a1ee7e2d0e4e2a', 'resnet101'),
('0d43d698c66aceaa2bc0309f55efdd7ff4b143af', 'resnet152'),
('2e22611a7f3992ebdee6726af169991bc26d7363', 'deepten_minc'),
('fc8c0b795abf0133700c2d4265d2f9edab7eb6cc', 'fcn_resnet50_ade'),
('662e979de25a389f11c65e9f1df7e06c2c356381', 'fcn_resnet50_ade'),
('eeed8e582f0fdccdba8579e7490570adc6d85c7c', 'fcn_resnet50_pcontext'),
('54f70c772505064e30efd1ddd3a14e1759faa363', 'psp_resnet50_ade'),
('075195c5237b778c718fd73ceddfa1376c18dfd0', 'deeplab_resnet50_ade'),
......
......@@ -92,7 +92,7 @@ class Options():
if args.epochs is None:
epoches = {
'coco': 30,
'citys': 180,
'citys': 240,
'pascal_voc': 50,
'pascal_aug': 50,
'pcontext': 80,
......@@ -100,7 +100,7 @@ class Options():
}
args.epochs = epoches[args.dataset.lower()]
if args.batch_size is None:
args.batch_size = 4 * torch.cuda.device_count()
args.batch_size = 16
if args.test_batch_size is None:
args.test_batch_size = args.batch_size
if args.lr is None:
......
......@@ -18,7 +18,7 @@ import setuptools.command.install
cwd = os.path.dirname(os.path.abspath(__file__))
version = '0.5.0'
version = '0.5.1'
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
cwd=cwd).decode('ascii').strip()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment