"...text-generation-inference.git" did not exist on "b67d46336e34ca9bddc1a077fb8467086ac522cf"
Commit 2d21747a authored by Zhang's avatar Zhang
Browse files

v0.4.2

parent 7e19143c
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
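# Build script for the Encoding CUDA extension; intended to be run directly
# (see the __main__ guard at the bottom), assuming PyTorch is already installed.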
import os
import torch
import platform
import subprocess
from torch.utils.ffi import create_extension
torch_ver = torch.__version__[:3]
lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib')
cwd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'encoding/')
encoding_lib_path = os.path.join(cwd, "lib")
# clean the build files
clean_cmd = ['bash', 'clean.sh']
subprocess.check_call(clean_cmd)
# build CUDA library
os.environ['TORCH_BUILD_DIR'] = lib_path
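# TORCH_BUILD_DIR and TH_LIBRARIES below are read back by cmake/FindTorch.cmake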
if platform.system() == 'Darwin':
    if torch_ver == '0.3':
        os.environ['TH_LIBRARIES'] = os.path.join(lib_path, 'libATen.1.dylib')
    else:
        os.environ['TH_LIBRARIES'] = os.path.join(lib_path, 'libATen.dylib')
    ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.dylib')
else:
    os.environ['CFLAGS'] = '-std=c99'
    if torch_ver == '0.3':
        os.environ['TH_LIBRARIES'] = os.path.join(lib_path, 'libATen.so.1')
    else:
        os.environ['TH_LIBRARIES'] = os.path.join(lib_path, 'libATen.so')
    ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.so')
build_all_cmd = ['bash', 'encoding/make.sh']
subprocess.check_call(build_all_cmd, env=dict(os.environ))
# build FFI
sources = ['encoding/src/encoding_lib.cpp']
headers = [
    'encoding/src/encoding_lib.h',
]
defines = [('WITH_CUDA', None)]
with_cuda = True
include_path = [os.path.join(lib_path, 'include'),
                os.path.join(cwd, 'kernel'),
                os.path.join(cwd, 'kernel/include'),
                os.path.join(cwd, 'src/')]
def make_relative_rpath(path):
    # the rpath linker flag happens to take the same form on macOS and Linux here
    if platform.system() == 'Darwin':
        return '-Wl,-rpath,' + path
    else:
        return '-Wl,-rpath,' + path
ffi = create_extension(
    'encoding._ext.encoding_lib',
    package=True,
    headers=headers,
    sources=sources,
    define_macros=defines,
    relative_to=__file__,
    with_cuda=with_cuda,
    extra_compile_args=["-std=c99"],
    include_dirs=include_path,
    extra_link_args=[
        make_relative_rpath(lib_path),
        make_relative_rpath(encoding_lib_path),
        ENCODING_LIB,
    ],
)
if __name__ == '__main__':
    ffi.build()
{% extends "!layout.html" %}
{%- block extrahead %}
<script type="text/javascript" src="http://zhanghang1989.github.io/files/hidebib.js"></script>
{% endblock %}
:hidden:`resnet152`
~~~~~~~~~~~~~~~~~~~
.. autofunction:: resnet152
DenseNet
--------
:hidden:`DenseNet`
~~~~~~~~~~~~~~~~~~
.. autoclass:: DenseNet
:members:
:hidden:`densenet161`
~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: densenet161
:hidden:`densenet121`
~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: densenet121
:hidden:`densenet169`
~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: densenet169
:hidden:`densenet201`
~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: densenet201
.. role:: hidden
:class: hidden-section
NN Layers
=========
Customized NN modules in Encoding Package. For Synchronized Cross-GPU Batch Normalization, please visit :class:`encoding.nn.BatchNorm2d`.
.. currentmodule:: encoding.nn
:hidden:`Encoding`
~~~~~~~~~~~~~~~~~~
.. autoclass:: Encoding
:members:
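A minimal usage sketch (the ``D``/``K`` argument names and the ``B x K x D``
output shape follow the Deep TEN formulation and should be treated as
assumptions for this exact version):

.. code-block:: python

    import torch
    import encoding

    layer = encoding.nn.Encoding(D=128, K=32)  # 32 codewords over 128-d features
    x = torch.randn(4, 128, 8, 8)              # B x D x H x W feature map
    out = layer(x)                             # residual encodings, B x K x D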
:hidden:`Inspiration`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: Inspiration
:members:
:hidden:`UpsampleConv2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: UpsampleConv2d
:members:
:hidden:`DilatedAvgPool2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: DilatedAvgPool2d
:members:
:hidden:`GramMatrix`
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: GramMatrix
:members:
Context Encoding for Semantic Segmentation (EncNet)
===================================================
Install Package
---------------
- Clone the GitHub repo::
git clone git@github.com:zhanghang1989/PyTorch-Encoding.git
- Install PyTorch Encoding (if not yet). Please follow the installation guide `Installing PyTorch Encoding <../notes/compile.html>`_.
Test Pre-trained Model
----------------------
.. hint::
The model names contain the training information. For instance ``FCN_ResNet50_PContext``:
- ``FCN`` indicates the algorithm is a Fully Convolutional Network for semantic segmentation.
- ``ResNet50`` is the name of the backbone network.
- ``PContext`` means the PASCAL in Context dataset.
To load a pretrained model, for example ``FCN_ResNet50_PContext``::
model = encoding.models.get_model('FCN_ResNet50_PContext', pretrained=True)
The test script is in the ``experiments/segmentation/`` folder. To evaluate a model with multi-scale (MS) testing,
for example ``Encnet_ResNet50_PContext``::
python test.py --dataset PContext --model-zoo Encnet_ResNet50_PContext --eval
# pixAcc: 0.7862, mIoU: 0.4946: 100%|████████████████████████| 319/319 [09:44<00:00, 1.83s/it]
The command for training the model can be found by clicking ``cmd`` in the table.
.. role:: raw-html(raw)
:format: html
+----------------------------------+-----------+-----------+---------------------------------------------------------------------------------------------+
| Model | pixAcc | mIoU | Command |
+==================================+===========+===========+=============================================================================================+
| FCN_ResNet50_PContext | 76.0% | 45.7 | :raw-html:`<a href="javascript:toggleblock('cmd_fcn50_pcont')" class="toggleblock">cmd</a>` |
+----------------------------------+-----------+-----------+---------------------------------------------------------------------------------------------+
| Encnet_ResNet50_PContext | 78.6% | 49.5 | :raw-html:`<a href="javascript:toggleblock('cmd_enc50_pcont')" class="toggleblock">cmd</a>` |
+----------------------------------+-----------+-----------+---------------------------------------------------------------------------------------------+
.. raw:: html
<code xml:space="preserve" id="cmd_fcn50_pcont" style="display: none; text-align: left; white-space: pre-wrap">
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset PContext --model FCN
</code>
<code xml:space="preserve" id="cmd_enc50_pcont" style="display: none; text-align: left; white-space: pre-wrap">
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset PContext --model EncNet --aux --se-loss
</code>
Quick Demo
~~~~~~~~~~
.. code-block:: python
import torch
import encoding
# Get the model
model = encoding.models.get_model('Encnet_ResNet50_PContext', pretrained=True).cuda()
model.eval()
# Prepare the image
url = 'https://github.com/zhanghang1989/image-data/blob/master/' + \
'encoding/segmentation/pcontext/2010_001829_org.jpg?raw=true'
filename = 'example.jpg'
img = encoding.utils.load_image(
encoding.utils.download(url, filename)).cuda().unsqueeze(0)
# Make prediction
output = model.evaluate(img)
predict = torch.max(output, 1)[1].cpu().numpy() + 1
# Get color pallete for visualization
mask = encoding.utils.get_mask_pallete(predict, 'pcontext')
mask.save('output.png')
.. image:: https://raw.githubusercontent.com/zhanghang1989/image-data/master/encoding/segmentation/pcontext/2010_001829_org.jpg
:width: 45%
.. image:: https://raw.githubusercontent.com/zhanghang1989/image-data/master/encoding/segmentation/pcontext/2010_001829.png
:width: 45%
Train Your Own Model
--------------------
- Prepare the datasets by running the scripts in the ``scripts/`` folder, for example to prepare the ``PASCAL Context`` dataset::
python scripts/prepare_pcontext.py
- The training script is in the ``experiments/segmentation/`` folder; an example training command::
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset pcontext --model encnet --aux --se-loss
- For detailed training options, please run ``python train.py -h``.
Citation
--------
.. note::
* Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation" *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*::
@InProceedings{Zhang_2018_CVPR,
author = {Zhang, Hang and Dana, Kristin and Shi, Jianping and Zhang, Zhongyue and Wang, Xiaogang and Tyagi, Ambrish and Agrawal, Amit},
title = {Context Encoding for Semantic Segmentation},
booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2018}
}
encoding.functions
==================
.. currentmodule:: encoding.functions
:hidden:`batchnormtrain`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batchnormtrain
:hidden:`aggregate`
~~~~~~~~~~~~~~~~~~~
encoding.nn
===========
Customized NN modules in Encoding Package. For Synchronized Cross-GPU Batch Normalization, please visit :class:`encoding.nn.BatchNorm2d`.
.. currentmodule:: encoding.nn
.. autoclass:: Encoding
:members:
:hidden:`BatchNorm2d`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: BatchNorm2d
    :members:
:hidden:`BatchNorm1d`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: BatchNorm1d
    :members:
:hidden:`BatchNorm3d`
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: BatchNorm3d
    :members:
:hidden:`Inspiration`
.. autoclass:: UpsampleConv2d
:members:
:hidden:`GramMatrix`
~~~~~~~~~~~~~~~~~~~~
.. role:: hidden
:class: hidden-section
Synchronized BatchNorm
======================
.. note::
The original ``Self-Parallel`` version of ``BatchNorm`` has been deprecated in favor of PyTorch Compatible :class:`encoding.nn.BatchNorm2d`.
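A minimal drop-in sketch (single GPU shown for brevity; the module is meant to
replace :class:`torch.nn.BatchNorm2d`, with synchronization taking effect when
the model is parallelized across GPUs):

.. code-block:: python

    import torch
    import encoding

    norm = encoding.nn.BatchNorm2d(64)   # in place of torch.nn.BatchNorm2d(64)
    x = torch.randn(2, 64, 32, 32)
    y = norm(x)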
.. currentmodule:: encoding.nn
Modules
-------
:hidden:`BatchNorm1d`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: BatchNorm1d
:members:
:hidden:`BatchNorm2d`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: BatchNorm2d
:members:
.. currentmodule:: encoding
Functions
---------
.. currentmodule:: encoding.functions
:hidden:`batchnormtrain`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batchnormtrain
:hidden:`batchnormeval`
~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batchnormeval
encoding.utils
==============
Useful util functions.
.. automodule:: encoding.utils
.. currentmodule:: encoding.utils
:hidden:`LR_Scheduler`
~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: LR_Scheduler
    :members:
:hidden:`save_checkpoint`
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: save_checkpoint
:hidden:`batch_pix_accuracy`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batch_pix_accuracy
:hidden:`batch_intersection_union`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batch_intersection_union
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.8)
INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindTorch.cmake)
IF(NOT CUDA_FOUND)
FIND_PACKAGE(CUDA 6.5 REQUIRED)
ENDIF()
# Detect CUDA architecture and get best NVCC flags
IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC)
INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
ENDIF()
LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
if(CUDA_VERSION VERSION_LESS "8.0")
MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
endif(CUDA_VERSION VERSION_LESS "8.0")
endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
IF(MSVC)
LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
ADD_DEFINITIONS(-DTH_EXPORTS)
ENDIF()
IF(NOT ENCODING_INSTALL_LIB_SUBDIR)
SET(ENCODING_INSTALL_LIB_SUBDIR "${TORCH_BUILD_DIR}" CACHE PATH "ENCODING install library directory")
SET(ENCODING_INSTALL_INCLUDE_SUBDIR "${TORCH_BUILD_DIR}/include" CACHE PATH "ENCODING install include subdirectory")
ENDIF()
SET(CMAKE_MACOSX_RPATH 1)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
FILE(GLOB src-cuda kernel/*.cu)
MESSAGE(STATUS "Torch_INSTALL_INCLUDE:" ${Torch_INSTALL_INCLUDE})
CUDA_INCLUDE_DIRECTORIES(
${CMAKE_CURRENT_SOURCE_DIR}/kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernel/include
${Torch_INSTALL_INCLUDE}
)
CUDA_ADD_LIBRARY(ENCODING SHARED ${src-cuda})
IF(MSVC)
SET_TARGET_PROPERTIES(ENCODING PROPERTIES PREFIX "lib" IMPORT_PREFIX "lib")
ENDIF()
TARGET_LINK_LIBRARIES(ENCODING
${TH_LIBRARIES}
${CUDA_cusparse_LIBRARY}
)
# Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch
IF(ENCODING_SO_VERSION)
MESSAGE(STATUS "ENCODING_SO_VERSION: ${ENCODING_SO_VERSION}")
SET_TARGET_PROPERTIES(ENCODING PROPERTIES
VERSION ${ENCODING_SO_VERSION}
SOVERSION ${ENCODING_SO_VERSION})
ENDIF(ENCODING_SO_VERSION)
#INSTALL(TARGETS ENCODING LIBRARY DESTINATION ${ENCODING_INSTALL_LIB_SUBDIR})
"""An optimized PyTorch package with CUDA backend."""
from .version import __version__
from . import nn, functions, dilated, parallel, utils, models, datasets
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# No longer using the manual way to find the library.
if(FALSE)
FILE(GLOB TORCH_LIB_HINTS
"/anaconda/lib/python3.6/site-packages/torch/lib"
"/anaconda2/lib/python3.6/site-packages/torch/lib"
"$ENV{HOME}/anaconda/lib/python2.7/site-packages/torch/lib"
"$ENV{HOME}/anaconda2/lib/python2.7/site-packages/torch/lib"
)
FIND_PATH(TORCH_BUILD_DIR
NAMES "THNN.h"
PATHS "${TORCH_LIB_HINTS}"
)
FIND_LIBRARY(THC_LIBRARIES NAMES THC THC.1 PATHS ${TORCH_BUILD_DIR} PATH_SUFFIXES lib)
FIND_LIBRARY(TH_LIBRARIES NAMES TH TH.1 PATHS ${TORCH_BUILD_DIR} PATH_SUFFIXES lib)
endif()
# Set the environment variable via Python
SET(TORCH_BUILD_DIR "$ENV{TORCH_BUILD_DIR}")
MESSAGE(STATUS "TORCH_BUILD_DIR: " ${TORCH_BUILD_DIR})
# Find the include files
SET(TORCH_TH_INCLUDE_DIR "${TORCH_BUILD_DIR}/include/TH")
SET(TORCH_THC_INCLUDE_DIR "${TORCH_BUILD_DIR}/include/THC")
SET(Torch_INSTALL_INCLUDE "${TORCH_BUILD_DIR}/include" ${TORCH_TH_INCLUDE_DIR} ${TORCH_THC_INCLUDE_DIR})
# Find the libs. We need to find libraries one by one.
SET(TH_LIBRARIES "$ENV{TH_LIBRARIES}")
# Synopsis:
# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
# -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
# target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
# - "Auto" detects local machine GPU compute arch at runtime.
# - "Common" and "All" cover common and entire subsets of architectures
# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
# NUM: Any number. Only those pairs are currently accepted by NVCC though:
# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
# Additionally, sets ${out_variable}_readable to the resulting numeric list
# Example:
# CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
# LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
#
# More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
#
# This list will be used for CUDA_ARCH_NAME = All option
set(CUDA_KNOWN_GPU_ARCHITECTURES "Fermi" "Kepler" "Maxwell")
# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
if (CUDA_VERSION VERSION_GREATER "6.5")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
endif ()
if (CUDA_VERSION VERSION_GREATER "7.5")
list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX")
else()
list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
endif ()
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE)
#
function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
if(NOT CUDA_GPU_DETECT_OUTPUT)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main()\n"
"{\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device)\n"
" {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
"-ccbin" ${CMAKE_CXX_COMPILER}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
string(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
string(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_GPU_DETECT_OUTPUT)
message(STATUS "Automatic GPU detection failed. Building for common architectures.")
set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE)
else()
set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE)
endif()
endfunction()
################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list
# Usage:
# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [list of CUDA compute archs])
function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(CUDA_ARCH_LIST "${ARGN}")
if("X${CUDA_ARCH_LIST}" STREQUAL "X" )
set(CUDA_ARCH_LIST "Auto")
endif()
set(cuda_arch_bin)
set(cuda_arch_ptx)
if("${CUDA_ARCH_LIST}" STREQUAL "All")
set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES})
elseif("${CUDA_ARCH_LIST}" STREQUAL "Common")
set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES})
elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto")
CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
endif()
# Now process the list and look for names
string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
foreach(arch_name ${CUDA_ARCH_LIST})
set(arch_bin)
set(add_ptx FALSE)
# Check to see if we are compiling PTX
if(arch_name MATCHES "(.*)\\+PTX$")
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin "2.0 2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
set(arch_bin 3.7)
elseif(${arch_name} STREQUAL "Kepler")
set(arch_bin 3.0 3.5)
set(arch_ptx 3.5)
elseif(${arch_name} STREQUAL "Maxwell+Tegra")
set(arch_bin 5.3)
elseif(${arch_name} STREQUAL "Maxwell")
set(arch_bin 5.0 5.2)
set(arch_ptx 5.2)
elseif(${arch_name} STREQUAL "Pascal")
set(arch_bin 6.0 6.1)
set(arch_ptx 6.1)
else()
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
endif()
endif()
if(NOT arch_bin)
message(SEND_ERROR "arch_bin wasn't set for some reason")
endif()
list(APPEND cuda_arch_bin ${arch_bin})
if(add_ptx)
if (NOT arch_ptx)
set(arch_ptx ${arch_bin})
endif()
list(APPEND cuda_arch_ptx ${arch_ptx})
endif()
endforeach()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
if(cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_bin)
endif()
if(cuda_arch_ptx)
list(REMOVE_DUPLICATES cuda_arch_ptx)
endif()
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
from .base import *
from .ade20k import ADE20KSegmentation
from .pascal_voc import VOCSegmentation
from .pascal_aug import VOCAugSegmentation
from .pcontext import ContextSegmentation
datasets = {
'ade20k': ADE20KSegmentation,
'pascal_voc': VOCSegmentation,
'pascal_aug': VOCAugSegmentation,
'pcontext': ContextSegmentation,
}
def get_segmentation_dataset(name, **kwargs):
return datasets[name.lower()](**kwargs)
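# A minimal usage sketch (dataset keys follow the registry above; the
# 'split'/'mode' keyword names mirror BaseDataset and are assumptions here):
#
#   trainset = get_segmentation_dataset('ade20k', split='train', mode='train')
#   print(len(trainset))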
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform
from .base import BaseDataset
class ADE20KSegmentation(BaseDataset):
BASE_DIR = 'ADEChallengeData2016'
NUM_CLASS = 150
def __init__(self, root=os.path.expanduser('~/.encoding/data'), split='train',
mode=None, transform=None, target_transform=None):
super(ADE20KSegmentation, self).__init__(
root, split, mode, transform, target_transform)
# assert exists and prepare dataset automatically
root = os.path.join(root, self.BASE_DIR)
        assert os.path.exists(root), "Please setup the dataset using " + \
            "encoding/scripts/prepare_ade20k.py"
self.images, self.masks = _get_ade20k_pairs(root, split)
if split != 'test':
assert (len(self.images) == len(self.masks))
        if len(self.images) == 0:
            raise RuntimeError("Found 0 images in subfolders of: " + root + "\n")
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
mask = Image.open(self.masks[index])
        # synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
assert self.mode == 'testval'
mask = self._mask_transform(mask)
# general resize, normalize and toTensor
if self.transform is not None:
img = self.transform(img)
if self.target_transform is not None:
mask = self.target_transform(mask)
return img, mask
def _mask_transform(self, mask):
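        # ADE20K labels are 1-based with 0 = unlabeled; subtracting 1 makes the
        # classes 0..149 and maps unlabeled to -1 (assumed to be the ignore index)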
target = np.array(mask).astype('int32') - 1
return torch.from_numpy(target).long()
def __len__(self):
return len(self.images)
@property
def pred_offset(self):
return 1
def _get_ade20k_pairs(folder, split='train'):
img_paths = []
mask_paths = []
if split == 'train':
img_folder = os.path.join(folder, 'images/training')
mask_folder = os.path.join(folder, 'annotations/training')
else:
img_folder = os.path.join(folder, 'images/validation')
mask_folder = os.path.join(folder, 'annotations/validation')
for filename in os.listdir(img_folder):
basename, _ = os.path.splitext(filename)
if filename.endswith(".jpg"):
imgpath = os.path.join(img_folder, filename)
maskname = basename + '.png'
maskpath = os.path.join(mask_folder, maskname)
if os.path.isfile(maskpath):
img_paths.append(imgpath)
mask_paths.append(maskpath)
else:
print('cannot find the mask:', maskpath)
return img_paths, mask_paths
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import random
import numpy as np
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
__all__ = ['BaseDataset', 'test_batchify_fn']
class BaseDataset(data.Dataset):
def __init__(self, root, split, mode=None, transform=None,
target_transform=None, base_size=520, crop_size=480):
self.root = root
self.transform = transform
self.target_transform = target_transform
self.split = split
self.mode = mode if mode is not None else split
self.base_size = base_size
self.crop_size = crop_size
def __getitem__(self, index):
        raise NotImplementedError
@property
def num_class(self):
return self.NUM_CLASS
@property
def pred_offset(self):
        raise NotImplementedError
def _val_sync_transform(self, img, mask):
outsize = self.crop_size
short_size = outsize
w, h = img.size
if w > h:
oh = short_size
ow = int(1.0 * w * oh / h)
else:
ow = short_size
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - outsize) / 2.))
y1 = int(round((h - outsize) / 2.))
img = img.crop((x1, y1, x1+outsize, y1+outsize))
mask = mask.crop((x1, y1, x1+outsize, y1+outsize))
# final transform
return img, self._mask_transform(mask)
def _sync_transform(self, img, mask):
# random mirror
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
crop_size = self.crop_size
        # random scale (short edge from 0.5x to 2.0x of base_size)
short_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.0))
w, h = img.size
if h > w:
ow = short_size
oh = int(1.0 * h * ow / w)
else:
oh = short_size
ow = int(1.0 * w * oh / h)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# random rotate -10~10, mask using NN rotate
deg = random.uniform(-10, 10)
img = img.rotate(deg, resample=Image.BILINEAR)
mask = mask.rotate(deg, resample=Image.NEAREST)
# pad crop
if short_size < crop_size:
padh = crop_size - oh if oh < crop_size else 0
padw = crop_size - ow if ow < crop_size else 0
img = ImageOps.expand(img, border=(0, 0, padw, padh), fill=0)
mask = ImageOps.expand(mask, border=(0, 0, padw, padh), fill=0)
# random crop crop_size
w, h = img.size
x1 = random.randint(0, w - crop_size)
y1 = random.randint(0, h - crop_size)
img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
# gaussian blur as in PSP
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
# final transform
return img, self._mask_transform(mask)
def _mask_transform(self, mask):
return torch.from_numpy(np.array(mask)).long()
def test_batchify_fn(data):
    error_msg = "batch must contain tensors, tuples or lists; found {}"
    if isinstance(data[0], (str, torch.Tensor)):
        return list(data)
    elif isinstance(data[0], (tuple, list)):
        data = zip(*data)
        return [test_batchify_fn(i) for i in data]
    raise TypeError(error_msg.format(type(data[0])))
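# A hedged usage sketch: test_batchify_fn is written in the style of a DataLoader
# collate_fn, returning plain lists so variably sized test images can share a
# batch (the testset instance below is illustrative):
#
#   loader = data.DataLoader(testset, batch_size=4, collate_fn=test_batchify_fn)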
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
from tqdm import tqdm
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform
class Segmentation(data.Dataset):
BASE_DIR = 'cityscapes'
def __init__(self, data_folder, mode='train', transform=None,
target_transform=None):
self.root = os.path.join(data_folder, self.BASE_DIR)
self.transform = transform
self.target_transform = target_transform
self.mode = mode
self.images, self.masks = get_city_pairs(self.root, mode)
assert (len(self.images) == len(self.masks))
print("Found {} images in subfolders of: {}\
".format(len(self.images), self.root + '/' + mode))
        if len(self.images) == 0:
            raise RuntimeError("Found 0 images in subfolders of: " + self.root + "\n")
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
mask = Image.open(self.masks[index])#.convert("P")
mask = np.array(mask)
mask += 1
mask[mask==256] = 0
mask = Image.fromarray(mask)
        # synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
raise RuntimeError('unknown mode for dataloader: {}'.format(self.mode))
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def __len__(self):
return len(self.images)
def _val_sync_transform(self, img, mask):
"""
synchronized transformation
"""
outsize = 720
short = outsize
w, h = img.size
if w > h:
oh = short
ow = int(1.0 * w * oh / h)
else:
ow = short
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - outsize) / 2.))
y1 = int(round((h - outsize) / 2.))
img = img.crop((x1, y1, x1+outsize, y1+outsize))
mask = mask.crop((x1, y1, x1+outsize, y1+outsize))
return img, mask
def _sync_transform(self, img, mask):
# random mirror
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
base_size = 2048
crop_size = 720
        # random scale (long edge from 0.5x to 2.0x of base_size)
long_size = random.randint(int(base_size*0.5), int(base_size*2.0))
w, h = img.size
if h > w:
oh = long_size
ow = int(1.0 * w * oh / h)
short_size = ow
else:
ow = long_size
oh = int(1.0 * h * ow / w)
short_size = oh
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# random rotate -10~10, mask using NN rotate
deg = random.uniform(-10,10)
img = img.rotate(deg, resample=Image.BILINEAR)
mask = mask.rotate(deg, resample=Image.NEAREST)
# pad crop
if short_size < crop_size:
padh = crop_size - oh if oh < crop_size else 0
padw = crop_size - ow if ow < crop_size else 0
img = ImageOps.expand(img, border=(0,0,padw,padh), fill=0)
mask = ImageOps.expand(mask, border=(0,0,padw,padh), fill=0)
        # random crop to crop_size
w, h = img.size
x1 = random.randint(0, w - crop_size)
y1 = random.randint(0, h - crop_size)
img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
# gaussian blur as in PSP ?
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
return img, mask
def get_city_pairs(folder, mode='train'):
img_paths = []
mask_paths = []
img_folder = os.path.join(folder, 'leftImg8bit/' + mode)
mask_folder = os.path.join(folder, 'gtFine/'+ mode)
for root, directories, files in os.walk(img_folder):
        for filename in files:
if filename.endswith(".png"):
imgpath = os.path.join(root, filename)
foldername = os.path.basename(os.path.dirname(imgpath))
maskname = filename.replace('leftImg8bit','gtFine_trainIds')
maskpath = os.path.join(mask_folder, foldername, maskname)
if os.path.isfile(imgpath) and os.path.isfile(maskpath):
img_paths.append(imgpath)
mask_paths.append(maskpath)
else:
print('cannot find the mask or image:', imgpath, maskpath)
return img_paths, mask_paths
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
from tqdm import tqdm
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform
class Segmentation(data.Dataset):
BASE_DIR = 'cityscapes'
def __init__(self, data_folder, mode='train', transform=None,
target_transform=None):
self.root = os.path.join(data_folder, self.BASE_DIR)
self.transform = transform
self.target_transform = target_transform
self.mode = mode
self.images, self.masks = get_city_pairs(self.root, mode)
assert (len(self.images) == len(self.masks))
        if len(self.images) == 0:
            raise RuntimeError("Found 0 images in subfolders of: " + self.root + "\n")
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
mask = Image.open(self.masks[index])#.convert("P")
mask = np.array(mask)
mask += 1
mask[mask==256] = 0
mask = Image.fromarray(mask)
        # synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
raise RuntimeError('unknown mode for dataloader: {}'.format(self.mode))
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def __len__(self):
return len(self.images)
def _val_sync_transform(self, img, mask):
"""
synchronized transformation
"""
outsize = 720
short = outsize
w, h = img.size
if w > h:
oh = short
ow = int(1.0 * w * oh / h)
else:
ow = short
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - outsize) / 2.))
y1 = int(round((h - outsize) / 2.))
img = img.crop((x1, y1, x1+outsize, y1+outsize))
mask = mask.crop((x1, y1, x1+outsize, y1+outsize))
return img, mask
def _sync_transform(self, img, mask):
# random mirror
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
base_size = 2048
crop_size = 720
        # random scale (long edge from 0.5x to 2.0x of base_size)
long_size = random.randint(int(base_size*0.5), int(base_size*2.0))
w, h = img.size
if h > w:
oh = long_size
ow = int(1.0 * w * oh / h)
short_size = ow
else:
ow = long_size
oh = int(1.0 * h * ow / w)
short_size = oh
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# random rotate -10~10, mask using NN rotate
deg = random.uniform(-10,10)
img = img.rotate(deg, resample=Image.BILINEAR)
mask = mask.rotate(deg, resample=Image.NEAREST)
# pad crop
if short_size < crop_size:
padh = crop_size - oh if oh < crop_size else 0
padw = crop_size - ow if ow < crop_size else 0
img = ImageOps.expand(img, border=(0,0,padw,padh), fill=0)
mask = ImageOps.expand(mask, border=(0,0,padw,padh), fill=0)
        # random crop to crop_size
w, h = img.size
x1 = random.randint(0, w - crop_size)
y1 = random.randint(0, h - crop_size)
img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
# gaussian blur as in PSP ?
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
return img, mask
def get_city_pairs(folder, mode='train'):
img_paths = []
mask_paths = []
if mode=='train':
img_folder = os.path.join(folder, 'leftImg8bit/train_extra')
mask_folder = os.path.join(folder, 'gtCoarse/train_extra')
else:
img_folder = os.path.join(folder, 'leftImg8bit/val')
mask_folder = os.path.join(folder, 'gtFine/val')
for root, directories, files in os.walk(img_folder):
for filename in files:
            basename, extension = os.path.splitext(filename)
if filename.endswith(".png"):
imgpath = os.path.join(root, filename)
foldername = os.path.basename(os.path.dirname(imgpath))
maskname = filename.replace('leftImg8bit','gtCoarse_trainIds')
maskpath = os.path.join(mask_folder, foldername, maskname)
if os.path.isfile(imgpath) and os.path.isfile(maskpath):
img_paths.append(imgpath)
mask_paths.append(maskpath)
else:
print('cannot find the mask or image:', imgpath, maskpath)
return img_paths, mask_paths
import os
from tqdm import tqdm, trange
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform
import random
import math
import numpy as np
from .dataset import ToLabel
"""
NUM_CHANNEL = 91
[0] background
[5] airplane
[2] bicycle
[16] bird
[9] boat
[44] bottle
[6] bus
[3] car
[17] cat
[62] chair
[21] cow
[67] dining table
[18] dog
[19] horse
[4] motorcycle
[1] person
[64] potted plant
[20] sheep
[63] couch
[7] train
[72] tv
"""
catlist = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
1, 64, 20, 63, 7, 72]
class Segmentation(data.Dataset):
def __init__(self, root, mode='train', transform=None,
target_transform=None):
from pycocotools.coco import COCO
from pycocotools import mask
if mode == 'train':
print('train set')
ann_file = os.path.join(root, 'coco/annotations/instances_train2014.json')
ids_file = os.path.join(root, 'coco/annotations/train_ids.pth')
root = os.path.join(root, 'coco/train2014')
else:
print('val set')
ann_file = os.path.join(root, 'coco/annotations/instances_val2014.json')
ids_file = os.path.join(root, 'coco/annotations/val_ids.pth')
root = os.path.join(root, 'coco/val2014')
self.train = mode
self.root = root
self.coco = COCO(ann_file)
self.coco_mask = mask
if os.path.exists(ids_file):
self.ids = torch.load(ids_file)
else:
self.new_ids = []
self.ids = list(self.coco.imgs.keys())
self.preprocess(ids_file)
self.ids = self.new_ids
self.transform = transform
self.target_transform = target_transform
def preprocess(self, ids_file):
tbar = trange(len(self.ids))
for i in tbar:
img_id = self.ids[i]
cocotarget = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
img_metadata = self.coco.loadImgs(img_id)[0]
mask = self._gen_seg_mask(cocotarget, img_metadata['height'],
img_metadata['width'])
# more than 1k pixels
if (mask > 0).sum() > 1000:
self.new_ids.append(img_id)
tbar.set_description('Doing: {}/{}, got {} qualified images'.\
format(i, len(self.ids), len(self.new_ids)))
print('number of qualified images: ', len(self.new_ids))
torch.save(self.new_ids, ids_file)
def __getitem__(self, index):
coco = self.coco
img_id = self.ids[index]
cocotarget = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
img_metadata = coco.loadImgs(img_id)[0]
path = img_metadata['file_name']
img = Image.open(os.path.join(self.root, path)).convert('RGB')
mask = Image.fromarray(
self._gen_seg_mask(cocotarget, img_metadata['height'],
img_metadata['width'])
)
        # synchronized transform; the conditional is hard-wired to True,
        # so the training transform is always applied
        if True:  # self.train == 'train':
            img, mask = self._sync_transform(img, mask)
        else:
            img, mask = self._val_sync_transform(img, mask)
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def __len__(self):
return len(self.ids)
def _val_sync_transform(self, img, mask):
outsize = 480
short = outsize
w, h = img.size
if w > h:
oh = short
ow = int(1.0 * w * oh / h)
else:
ow = short
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - outsize) / 2.))
y1 = int(round((h - outsize) / 2.))
img = img.crop((x1, y1, x1+outsize, y1+outsize))
mask = mask.crop((x1, y1, x1+outsize, y1+outsize))
return img, mask
def _sync_transform(self, img, mask):
# random mirror
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
base_size = 520
crop_size = 480
        # random scale (long edge from 0.5x to 2.0x of base_size)
long_size = random.randint(int(base_size*0.5), int(base_size*2.0))
w, h = img.size
if h > w:
oh = long_size
ow = int(1.0 * w * oh / h)
short_size = ow
else:
ow = long_size
oh = int(1.0 * h * ow / w)
short_size = oh
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# random rotate -10~10, mask using NN rotate
deg = random.uniform(-10,10)
img = img.rotate(deg, resample=Image.BILINEAR)
mask = mask.rotate(deg, resample=Image.NEAREST)
# pad crop
if short_size < crop_size:
padh = crop_size - oh if oh < crop_size else 0
padw = crop_size - ow if ow < crop_size else 0
img = ImageOps.expand(img, border=(0,0,padw,padh), fill=0)
mask = ImageOps.expand(mask, border=(0,0,padw,padh), fill=0)
# random crop 480
w, h = img.size
x1 = random.randint(0, w - crop_size)
y1 = random.randint(0, h - crop_size)
img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
# gaussian blur as in PSP ?
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
return img, mask
def _gen_seg_mask(self, target, h, w):
mask = np.zeros((h, w), dtype=np.uint8)
coco_mask = self.coco_mask
for instance in target:
rle = coco_mask.frPyObjects(instance['segmentation'], h, w)
m = coco_mask.decode(rle)
cat = instance['category_id']
if cat in catlist:
c = catlist.index(cat)
else:
continue
if len(m.shape) < 3:
mask[:, :] += (mask == 0) * (m * c)
else:
mask[:, :] += (mask == 0) * (((np.sum(m, axis=2)) > 0) * c).astype(np.uint8)
return mask
mkdir -p "$HOME"/data
cd "$HOME"/data
# augmented PASCAL VOC
wget http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz # 1.3 GB
tar -zxvf benchmark.tgz
mv benchmark_RELEASE VOCaug
# generate trainval.txt
cd VOCaug/dataset/
cp train.txt trainval.txt
cat val.txt >> trainval.txt
cd "$HOME"/data
# original PASCAL VOC 2012
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar # 2 GB
tar -xvf VOCtrainval_11-May-2012.tar
# for the PASCAL VOC test set, you need to log in and manually download it from http://host.robots.ox.ac.uk:8080/