#include "../balancing.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
int main(int argc, char* args[]) {
int n_worker = atoi(args[1]);
int n_expert = atoi(args[2]);
int cap_v = atoi(args[3]);
int tot_expert = n_worker * n_expert;
long* lec = new long[tot_expert];
for (int i = 0; i < tot_expert; ++i) {
lec[i] = i;
}
long* g_lec;
cudaMalloc(&g_lec, sizeof(long) * tot_expert);
cudaMemcpy(g_lec, lec, sizeof(long) * tot_expert, cudaMemcpyHostToDevice);
int* cap = new int[n_expert];
for (int i = 0; i < n_expert; ++i) {
cap[i] = cap_v;
}
int* g_cap;
cudaMalloc(&g_cap, sizeof(int) * n_expert);
cudaMemcpy(g_cap, cap, sizeof(int) * n_expert, cudaMemcpyHostToDevice);
long* eca = new long[tot_expert];
long* g_eca;
cudaMalloc(&g_eca, sizeof(long) * tot_expert);
auto smgr = getCudaStreamManager(0);
fmoe_cuda_limit_by_capacity_impl(g_lec, g_cap, g_eca, n_expert, n_worker, smgr);
cudaMemcpy(cap, g_cap, sizeof(int) * n_expert, cudaMemcpyDeviceToHost);
cudaMemcpy(eca, g_eca, sizeof(long) * tot_expert, cudaMemcpyDeviceToHost);
printf("%d\n", cap[0]);
for (int i = 0; i < tot_expert; ++i) {
printf("%ld %ld\n", lec[i], eca[i]);
}
}
#include "../balancing.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
int main(int argc, char* args[]) {
int n_worker = atoi(args[1]);
int n_expert = atoi(args[2]);
int batch_size = atoi(args[3]);
int tot_expert = n_worker * n_expert;
long* gate_idx = new long[batch_size];
long* n_gate_idx = new long[batch_size];
long* lec = new long[tot_expert];
memset(lec, 0, sizeof(long) * tot_expert);
for (int i = 0; i < batch_size; ++i) {
gate_idx[i] = rand() % tot_expert;
++lec[gate_idx[i]];
}
for (int i = 0; i < tot_expert; ++i) {
lec[i] >>= 1;
}
long* g_lec;
cudaMalloc(&g_lec, sizeof(long) * tot_expert);
cudaMemcpy(g_lec, lec, sizeof(long) * tot_expert, cudaMemcpyHostToDevice);
int* g_new_lec;
cudaMalloc(&g_new_lec, sizeof(int) * tot_expert);
long* g_gate_idx;
cudaMalloc(&g_gate_idx, sizeof(long) * batch_size);
cudaMemcpy(g_gate_idx, gate_idx, sizeof(long) * batch_size, cudaMemcpyHostToDevice);
auto smgr = getCudaStreamManager(0);
fmoe_cuda_prune_gate_by_capacity_impl(g_gate_idx, g_lec, g_new_lec,
batch_size, n_expert, n_worker, smgr);
cudaMemcpy(n_gate_idx, g_gate_idx, sizeof(long) * batch_size, cudaMemcpyDeviceToHost);
for (int i = 0; i < batch_size; ++i) {
printf("%ld %ld (%d)\n", gate_idx[i], n_gate_idx[i], lec[gate_idx[i]]);
}
}
#ifndef CUBLAS_WRAPPER_H
#define CUBLAS_WRAPPER_H

#include <cublas_v2.h>
#include <c10/util/Half.h>

// Type-dispatching wrappers that select the proper cuBLAS (or rocBLAS,
// when FMOE_USE_HIP is defined) GEMM routine by overload resolution.

inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle,
        cublasOperation_t transa,
        cublasOperation_t transb,
        int m, int n, int k,
        const float *alpha,
        const float *Aarray[], int lda,
        const float *Barray[], int ldb,
        const float *beta,
        float *Carray[], int ldc,
        int batchCount) {
#ifdef FMOE_USE_HIP
    return rocblas_sgemm_batched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#else
    return cublasSgemmBatched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#endif
}

inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle,
        cublasOperation_t transa,
        cublasOperation_t transb,
        int m, int n, int k,
        const double *alpha,
        const double *Aarray[], int lda,
        const double *Barray[], int ldb,
        const double *beta,
        double *Carray[], int ldc,
        int batchCount) {
#ifdef FMOE_USE_HIP
    return rocblas_dgemm_batched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#else
    return cublasDgemmBatched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#endif
}

inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle,
        cublasOperation_t transa,
        cublasOperation_t transb,
        int m, int n, int k,
        const __half *alpha,
        const __half *Aarray[], int lda,
        const __half *Barray[], int ldb,
        const __half *beta,
        __half *Carray[], int ldc,
        int batchCount) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm_batched(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half* const*)Aarray, lda,
            (const rocblas_half* const*)Barray, ldb,
            (const rocblas_half*)beta,
            (rocblas_half* const*)Carray, ldc,
            batchCount);
#else
    return cublasHgemmBatched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const float *alpha,
        const float *A, int lda,
        const float *B, int ldb,
        const float *beta,
        float *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_sgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#else
    return cublasSgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const double *alpha,
        const double *A, int lda,
        const double *B, int ldb,
        const double *beta,
        double *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_dgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#else
    return cublasDgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const __half *alpha,
        const __half *A, int lda,
        const __half *B, int ldb,
        const __half *beta,
        __half *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half*)A, lda,
            (const rocblas_half*)B, ldb,
            (const rocblas_half*)beta,
            (rocblas_half*)C, ldc);
#else
    return cublasHgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const c10::Half *alpha,
        const c10::Half *A, int lda,
        const c10::Half *B, int ldb,
        const c10::Half *beta,
        c10::Half *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half*)A, lda,
            (const rocblas_half*)B, ldb,
            (const rocblas_half*)beta,
            (rocblas_half*)C, ldc);
#else
    return cublasHgemm(handle, transa, transb, m, n, k,
            (const __half*)alpha,
            (const __half*)A, lda,
            (const __half*)B, ldb,
            (const __half*)beta,
            (__half*)C, ldc);
#endif
}

#endif  // CUBLAS_WRAPPER_H
#ifndef FMOE_UTILS_H
#define FMOE_UTILS_H

#define CHECK_CUDA(x) AT_ASSERTM(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Ceiling division, typically used to compute CUDA grid dimensions.
#define CEIL(_x_,_y_) (((_x_)-1)/(_y_)+1)

#endif  // FMOE_UTILS_H
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking
// This file is clipped from the original header file by laekov
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef HELPER_CUDA_H
#define HELPER_CUDA_H
static const char *_cudaGetErrorEnum(cudaError_t error) {
    return cudaGetErrorName(error);
}
#ifdef CUDA_DRIVER_API
// CUDA Driver API errors
static const char *_cudaGetErrorEnum(CUresult error) {
    static char unknown[] = "<unknown>";
    const char *ret = NULL;
    cuGetErrorName(error, &ret);
    return ret ? ret : unknown;
}
#endif
#ifdef FMOE_USE_HIP
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
switch (error) {
case rocblas_status_success:
return "rocblas_status_success";
case rocblas_status_invalid_handle:
return "rocblas_status_invalid_handle";
case rocblas_status_not_implemented:
return "rocblas_status_not_implemented";
case rocblas_status_invalid_pointer:
return "rocblas_status_invalid_pointer:";
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
case rocblas_status_memory_error:
return "rocblas_status_memory_error";
case rocblas_status_internal_error:
return "rocblas_status_internal_error";
case rocblas_status_perf_degraded:
return "rocblas_status_perf_degraded";
case rocblas_status_size_query_mismatch:
return "rocblas_status_size_query_mismatch";
case rocblas_status_size_increased:
return "rocblas_status_size_increased";
case rocblas_status_size_unchanged:
return "rocblas_status_size_unchanged";
case rocblas_status_invalid_value:
return "rocblas_status_invalid_value";
case rocblas_status_continue:
return "rocblas_status_continue";
}
return "<unknown>";
}
#else
// cuBLAS API errors
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
#endif
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error) {
switch (error) {
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_INVALID_DEVICE:
return "CUFFT_INVALID_DEVICE";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
case CUFFT_NO_WORKSPACE:
return "CUFFT_NO_WORKSPACE";
case CUFFT_NOT_IMPLEMENTED:
return "CUFFT_NOT_IMPLEMENTED";
case CUFFT_LICENSE_ERROR:
return "CUFFT_LICENSE_ERROR";
case CUFFT_NOT_SUPPORTED:
return "CUFFT_NOT_SUPPORTED";
}
return "<unknown>";
}
#endif
#ifdef CUSPARSEAPI
// cuSPARSE API errors
static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
switch (error) {
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
#endif
#ifdef CUSOLVER_COMMON_H_
// cuSOLVER API errors
static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
switch (error) {
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_STATUS_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_MAPPING_ERROR:
return "CUSOLVER_STATUS_MAPPING_ERROR";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSOLVER_STATUS_NOT_SUPPORTED:
return "CUSOLVER_STATUS_NOT_SUPPORTED ";
case CUSOLVER_STATUS_ZERO_PIVOT:
return "CUSOLVER_STATUS_ZERO_PIVOT";
case CUSOLVER_STATUS_INVALID_LICENSE:
return "CUSOLVER_STATUS_INVALID_LICENSE";
}
return "<unknown>";
}
#endif
#ifdef CURAND_H_
// cuRAND API errors
static const char *_cudaGetErrorEnum(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
#ifdef NVJPEGAPI
// nvJPEG API errors
static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
switch (error) {
case NVJPEG_STATUS_SUCCESS:
return "NVJPEG_STATUS_SUCCESS";
case NVJPEG_STATUS_NOT_INITIALIZED:
return "NVJPEG_STATUS_NOT_INITIALIZED";
case NVJPEG_STATUS_INVALID_PARAMETER:
return "NVJPEG_STATUS_INVALID_PARAMETER";
case NVJPEG_STATUS_BAD_JPEG:
return "NVJPEG_STATUS_BAD_JPEG";
case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
case NVJPEG_STATUS_ALLOCATOR_FAILURE:
return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
case NVJPEG_STATUS_EXECUTION_FAILED:
return "NVJPEG_STATUS_EXECUTION_FAILED";
case NVJPEG_STATUS_ARCH_MISMATCH:
return "NVJPEG_STATUS_ARCH_MISMATCH";
case NVJPEG_STATUS_INTERNAL_ERROR:
return "NVJPEG_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
#ifdef NV_NPPIDEFS_H
// NPP API errors
static const char *_cudaGetErrorEnum(NppStatus error) {
switch (error) {
case NPP_NOT_SUPPORTED_MODE_ERROR:
return "NPP_NOT_SUPPORTED_MODE_ERROR";
case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
case NPP_RESIZE_NO_OPERATION_ERROR:
return "NPP_RESIZE_NO_OPERATION_ERROR";
case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
case NPP_BAD_ARG_ERROR:
return "NPP_BAD_ARGUMENT_ERROR";
case NPP_COEFF_ERROR:
return "NPP_COEFFICIENT_ERROR";
case NPP_RECT_ERROR:
return "NPP_RECTANGLE_ERROR";
case NPP_QUAD_ERROR:
return "NPP_QUADRANGLE_ERROR";
case NPP_MEM_ALLOC_ERR:
return "NPP_MEMORY_ALLOCATION_ERROR";
case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
case NPP_INVALID_INPUT:
return "NPP_INVALID_INPUT";
case NPP_POINTER_ERROR:
return "NPP_POINTER_ERROR";
case NPP_WARNING:
return "NPP_WARNING";
case NPP_ODD_ROI_WARNING:
return "NPP_ODD_ROI_WARNING";
#else
// These are for CUDA 5.5 or higher
case NPP_BAD_ARGUMENT_ERROR:
return "NPP_BAD_ARGUMENT_ERROR";
case NPP_COEFFICIENT_ERROR:
return "NPP_COEFFICIENT_ERROR";
case NPP_RECTANGLE_ERROR:
return "NPP_RECTANGLE_ERROR";
case NPP_QUADRANGLE_ERROR:
return "NPP_QUADRANGLE_ERROR";
case NPP_MEMORY_ALLOCATION_ERR:
return "NPP_MEMORY_ALLOCATION_ERROR";
case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
case NPP_INVALID_HOST_POINTER_ERROR:
return "NPP_INVALID_HOST_POINTER_ERROR";
case NPP_INVALID_DEVICE_POINTER_ERROR:
return "NPP_INVALID_DEVICE_POINTER_ERROR";
#endif
case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
case NPP_TEXTURE_BIND_ERROR:
return "NPP_TEXTURE_BIND_ERROR";
case NPP_WRONG_INTERSECTION_ROI_ERROR:
return "NPP_WRONG_INTERSECTION_ROI_ERROR";
case NPP_NOT_EVEN_STEP_ERROR:
return "NPP_NOT_EVEN_STEP_ERROR";
case NPP_INTERPOLATION_ERROR:
return "NPP_INTERPOLATION_ERROR";
case NPP_RESIZE_FACTOR_ERROR:
return "NPP_RESIZE_FACTOR_ERROR";
case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
case NPP_MEMFREE_ERR:
return "NPP_MEMFREE_ERR";
case NPP_MEMSET_ERR:
return "NPP_MEMSET_ERR";
case NPP_MEMCPY_ERR:
return "NPP_MEMCPY_ERROR";
case NPP_MIRROR_FLIP_ERR:
return "NPP_MIRROR_FLIP_ERR";
#else
case NPP_MEMFREE_ERROR:
return "NPP_MEMFREE_ERROR";
case NPP_MEMSET_ERROR:
return "NPP_MEMSET_ERROR";
case NPP_MEMCPY_ERROR:
return "NPP_MEMCPY_ERROR";
case NPP_MIRROR_FLIP_ERROR:
return "NPP_MIRROR_FLIP_ERROR";
#endif
case NPP_ALIGNMENT_ERROR:
return "NPP_ALIGNMENT_ERROR";
case NPP_STEP_ERROR:
return "NPP_STEP_ERROR";
case NPP_SIZE_ERROR:
return "NPP_SIZE_ERROR";
case NPP_NULL_POINTER_ERROR:
return "NPP_NULL_POINTER_ERROR";
case NPP_CUDA_KERNEL_EXECUTION_ERROR:
return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
case NPP_NOT_IMPLEMENTED_ERROR:
return "NPP_NOT_IMPLEMENTED_ERROR";
case NPP_ERROR:
return "NPP_ERROR";
case NPP_SUCCESS:
return "NPP_SUCCESS";
case NPP_WRONG_INTERSECTION_QUAD_WARNING:
return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
case NPP_MISALIGNED_DST_ROI_WARNING:
return "NPP_MISALIGNED_DST_ROI_WARNING";
case NPP_AFFINE_QUAD_INCORRECT_WARNING:
return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
case NPP_DOUBLE_SIZE_WARNING:
return "NPP_DOUBLE_SIZE_WARNING";
case NPP_WRONG_INTERSECTION_ROI_WARNING:
return "NPP_WRONG_INTERSECTION_ROI_WARNING";
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
/* These are 6.0 or higher */
case NPP_LUT_PALETTE_BITSIZE_ERROR:
return "NPP_LUT_PALETTE_BITSIZE_ERROR";
case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
case NPP_QUALITY_INDEX_ERROR:
return "NPP_QUALITY_INDEX_ERROR";
case NPP_CHANNEL_ORDER_ERROR:
return "NPP_CHANNEL_ORDER_ERROR";
case NPP_ZERO_MASK_VALUE_ERROR:
return "NPP_ZERO_MASK_VALUE_ERROR";
case NPP_NUMBER_OF_CHANNELS_ERROR:
return "NPP_NUMBER_OF_CHANNELS_ERROR";
case NPP_COI_ERROR:
return "NPP_COI_ERROR";
case NPP_DIVISOR_ERROR:
return "NPP_DIVISOR_ERROR";
case NPP_CHANNEL_ERROR:
return "NPP_CHANNEL_ERROR";
case NPP_STRIDE_ERROR:
return "NPP_STRIDE_ERROR";
case NPP_ANCHOR_ERROR:
return "NPP_ANCHOR_ERROR";
case NPP_MASK_SIZE_ERROR:
return "NPP_MASK_SIZE_ERROR";
case NPP_MOMENT_00_ZERO_ERROR:
return "NPP_MOMENT_00_ZERO_ERROR";
case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
case NPP_THRESHOLD_ERROR:
return "NPP_THRESHOLD_ERROR";
case NPP_CONTEXT_MATCH_ERROR:
return "NPP_CONTEXT_MATCH_ERROR";
case NPP_FFT_FLAG_ERROR:
return "NPP_FFT_FLAG_ERROR";
case NPP_FFT_ORDER_ERROR:
return "NPP_FFT_ORDER_ERROR";
case NPP_SCALE_RANGE_ERROR:
return "NPP_SCALE_RANGE_ERROR";
case NPP_DATA_TYPE_ERROR:
return "NPP_DATA_TYPE_ERROR";
case NPP_OUT_OFF_RANGE_ERROR:
return "NPP_OUT_OFF_RANGE_ERROR";
case NPP_DIVIDE_BY_ZERO_ERROR:
return "NPP_DIVIDE_BY_ZERO_ERROR";
case NPP_RANGE_ERROR:
return "NPP_RANGE_ERROR";
case NPP_NO_MEMORY_ERROR:
return "NPP_NO_MEMORY_ERROR";
case NPP_ERROR_RESERVED:
return "NPP_ERROR_RESERVED";
case NPP_NO_OPERATION_WARNING:
return "NPP_NO_OPERATION_WARNING";
case NPP_DIVIDE_BY_ZERO_WARNING:
return "NPP_DIVIDE_BY_ZERO_WARNING";
#endif
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
/* These are 7.0 or higher */
case NPP_OVERFLOW_ERROR:
return "NPP_OVERFLOW_ERROR";
case NPP_CORRUPTED_DATA_ERROR:
return "NPP_CORRUPTED_DATA_ERROR";
#endif
}
return "<unknown>";
}
#endif
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                static_cast<unsigned int>(result), _cudaGetErrorEnum(result),
                func);
        exit(EXIT_FAILURE);
    }
}
// This will output the proper CUDA error strings in the event
// that a CUDA host call returns an error
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#endif // HELPER_CUDA_H
#ifndef TIMER_HH
#define TIMER_HH
/*
* This part of code is not used.
#include <chrono>
inline double getDuration(std::chrono::time_point<std::chrono::system_clock> a,
std::chrono::time_point<std::chrono::system_clock> b) {
return std::chrono::duration<double>(b - a).count();
}
#define timestamp(__var__) auto __var__ = std::chrono::system_clock::now();
*/
#endif // TIMER_HH
FastMoE
===

[Release notes](release-note.md)
| [Slack invitation link](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw)

## Introduction

FastMoE is an easy-to-use and efficient system for training MoE models, built on PyTorch.

## Installation

### Prerequisites

PyTorch with CUDA support is required. The current version of FastMoE has been tested on PyTorch v1.8.0 with CUDA 10. The system is also designed to support older versions of PyTorch.

If the model-parallel feature of FastMoE is to be used, an NCCL library that supports point-to-point communication (i.e. version `2.7.5` or newer) is also required.

### Installing

FastMoE contains a set of customized PyTorch operators with C components. Simply run `python setup.py install` to install FastMoE.

FastMoE's distributed model-parallel feature is disabled by default. To enable it, set the environment variable `USE_NCCL=1` when running the command above, e.g. `USE_NCCL=1 python setup.py install`.

Note that since the PyTorch framework usually ships only the runtime part of NCCL, an additional NCCL development package matching the PyTorch version has to be installed in the build environment. The [official PyTorch Docker images](https://hub.docker.com/r/pytorch/pytorch) are recommended, as their environments are relatively clean. If you prefer to set up the environment by hand, an appropriate NCCL development package can be downloaded from the [NCCL legacy downloads page](https://developer.nvidia.com/nccl/nccl-legacy-downloads).

## Usage

### FMoE-fying a Transformer model

Transformer is currently the most popular model to which MoE is applied. FastMoE can turn an ordinary Transformer model into an MoE model with a single function call, as follows.

For example, in [Megatron-LM](https://github.com/nvidia/megatron-lm), adding the code below replaces each MLP layer of the Transformer with an MoE network composed of multiple MLP experts.
```python
model = ...
from fmoe.megatron import fmoefy
model = fmoefy(model, num_experts=<number of experts per worker>)
train(model, ...)
```
A more detailed example of using the `fmoefy` function in Megatron-LM can be found [here](../examples/megatron).

### Using FastMoE as a network module

A Transformer model built with FastMoE can be found in [this example](../examples/transformer-xl). The simplest usage is to replace the `MLP` layers with `FMoE` layers, as sketched below.
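
For reference, here is a minimal sketch of such a replacement. The `FMoETransformerMLP` module and its `num_expert` / `d_model` / `d_hidden` arguments are assumed from this release's API; check them against your installed version.

```python
import torch
from fmoe import FMoETransformerMLP

class Block(torch.nn.Module):
    """One Transformer block whose feed-forward part is an MoE layer."""
    def __init__(self, d_model=512, n_head=8):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(d_model, n_head)
        # An MoE of 4 expert MLPs instead of one dense two-layer MLP.
        self.mlp = FMoETransformerMLP(num_expert=4, d_model=d_model,
                                      d_hidden=4 * d_model)

    def forward(self, x):
        x = x + self.attn(x, x, x)[0]
        x = x + self.mlp(x)  # the gate routes each token to an expert
        return x
```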
### Using FastMoE in distributed training

FastMoE supports both data parallelism and model parallelism.

#### Data parallelism

In FastMoE's data-parallel mode, both the gate network and the expert networks are replicated on each compute unit. The figure below shows the forward pass of a 2-way data-parallel MoE model with three experts.

<p align="center">
<img src="fastmoe_data_parallel.png" width="600">
</p>

No extra code is needed for data parallelism. FastMoE works seamlessly with both PyTorch's `DataParallel` and `DistributedDataParallel` modules. The only drawback of this mode is that the number of experts is limited by the memory size of a single compute unit (e.g. a GPU).
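
As a sketch (assuming a standard `torch.distributed` process group has been initialized; `build_model` and `local_rank` are placeholders for your own setup):

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

# Every rank holds a full replica of the model, including all experts,
# so plain DDP gradient averaging is sufficient.
model = build_model().cuda(local_rank)
model = DDP(model, device_ids=[local_rank])
```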
#### Model parallelism

In FastMoE's model-parallel mode, the gate network is still replicated on each compute unit, but the expert networks are placed separately on the compute units. Thus, by introducing extra communication, FastMoE allows many more experts to be trained simultaneously, with the limit on their number growing in proportion to the number of compute units.

The figure below shows a model with six experts trained with 2-way model parallelism. Note that experts 1-3 are placed on the first compute unit, while experts 4-6 are placed on the second one.

<p align="center">
<img src="fastmoe_model_parallel.png" width="600">
</p>

FastMoE's model-parallel mode requires a dedicated parallel strategy, which neither PyTorch nor Megatron-LM supports. Therefore, the `fmoe.DistributedGroupedDataParallel` module has to be used in place of PyTorch's DDP module.
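
A sketch of the replacement (`build_moe_model` is a placeholder; the module name is as documented above):

```python
from fmoe import DistributedGroupedDataParallel

# Experts differ across ranks, so their gradients must only be reduced
# within the proper communication groups; FastMoE's customized DDP
# module handles this, unlike torch.nn.parallel.DistributedDataParallel.
model = build_moe_model().cuda()
model = DistributedGroupedDataParallel(model)
```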
## Q&A / Discussion

If you have any questions about using FastMoE, or if you are interested in contributing to FastMoE, feel free to join our [Slack workspace](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw).
## v0.3.0

### FMoE core

* Previous `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch and process a slice of the input. `mp_group` will be deprecated in our next release.
* ROCm supported.
* `FMoELinear` is moved to a stand-alone file.

### Grouped data parallel

* Support any group name by their relative tag name.

### Load balancing

* A brand-new balancing strategy, SWIPE. Contributed by authors of a (currently unpublished) paper.
* A property `has_loss` is added to each gate, in order to identify whether balance loss should be collected.

### Megatron-LM support

* Experts are partitioned by tensor model parallelism in `mp_group`, instead of expert parallelism.
* Support arbitrary customized gates in `MegatronMLP`.
* Move the patches to a stand-alone file.

### Tests

* Move util functions into `test_ddp.py`.

## v0.2.1

### Load balancing

* Fix gradient for balance loss.

### Misc

* Typos.
* Update benchmark interface.
* Remove some redundant code for performance improvement.
* Enable `USE_NCCL` by default.
* Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.

### Megatron adaption

* Patch for numerical correctness of gradient clipping.
* Support for pipeline parallelism.

## v0.2.0

### Load balancing

* A brand-new gate module with capacity-related utilities.
* GShard's and Switch Transformer's balance strategies are implemented as integrated gates.
* Balance loss is enabled.
* Balance monitor is provided.

### Checkpointing

* MoE models can be loaded and saved by fmoe's checkpointing module.

### Performance

* FP16 training performance is improved.

### Misc

* CUDA code directory is reconstructed.
* More tests are added.

## v0.1.2

### Compilation

- Remove dependency on the CUDA examples repository.

### Distributed

- Fix a bug related to PyTorch v1.8.0. FastMoE can now operate on multiple GPUs
on multiple nodes with PyTorch v1.8.0.

### Misc

- Fix tons of typos.
- Format the code.

## v0.1.1

### Distributed

- Broadcast data-parallel parameters before training.

### Megatron adaption

- Initialize `FMoELinear` parameters with a different seed on each model-parallel rank, even when the same random seed is used in Megatron.
- Use the proper communicators for model parallelism and data parallelism.

### Transformer-XL example

- Improve scripts.

### Misc

- Logo and Slack workspace link.
- Document in Chinese.
- Figures to explain how FastMoE works.

## v0.1.0

### Functions

- A model-injection-style easy-to-use user interface for Megatron-LM.
- Support both data parallel and model parallel, and a hybrid of the two.
- Provide a new customized DDP module to synchronize in different comm groups.
- Support customized `nn.Module` as an expert.

### Document and infrastructure

- Use PyTest.
- Set up PyLint.
- Installation and usage guide.
- Explanation of functions and code structure in code.

### Performance

- A benchmark to compare FastMoE and the old PyTorch implementation.
FastMoE works with different versions of
[Megatron-LM](https://github.com/nvidia/megatron-lm).
See `fmoe/megatron/utils.py` for arguments of FastMoE.
An example patch is provided for the `v2.2` release. It can be applied directly
to add FastMoE support if you are using Megatron-LM v2.2; otherwise, you may
need to enable FastMoE in your codebase manually.
The patch includes the following modifications.
### Add arguments to Megatron's argparser
In `megatron/arguments.py`, add `_add_fmoe_args` to the parser.
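
The corresponding change is condensed below (a sketch of the same hunk that appears in the full diff later in this document):

```python
# megatron/arguments.py
from fmoe.megatron import add_fmoe_args as _add_fmoe_args

def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse all arguments."""
    ...
    parser = _add_realm_args(parser)
    parser = _add_fmoe_args(parser)  # register FastMoE's command-line flags
    ...
```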
### Patch checkpoint
In `megatron/training.py`, replace `load_checkpoint` and `save_checkpoint` with
the functions of the same name from `fmoe.megatron.checkpointing`.
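
In import form, the replacement looks like the following (the example patch later in this document imports from `fmoe.megatron.checkpoint`; follow whichever module path your FastMoE version provides):

```python
# megatron/training.py
# from megatron.checkpointing import load_checkpoint, save_checkpoint
from fmoe.megatron.checkpoint import load_checkpoint
from fmoe.megatron.checkpoint import save_checkpoint
```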
### Building the model in FastMoE style
In `megatron/training.py`, the `fmoe.megatron.fmoefy` function is used as an
entry point that, in one step, introduces FastMoE layers to replace the MLP
layers in the transformer language model.
```python
from fmoe.megatron import fmoefy
model = fmoefy(model, num_experts=4)
```
Note that the `fmoefy` function currently only takes a standard Megatron-LM
top-level raw model as input, i.e. the MLP layers should be accessible at
`model.language_model.transformer.layers[i].mlp`.
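
A quick sanity check for this requirement (a hypothetical helper, only to illustrate the expected attribute path):

```python
def can_fmoefy(model):
    # fmoefy expects Megatron-LM's layout, with the MLPs reachable at
    # model.language_model.transformer.layers[i].mlp
    try:
        layers = model.language_model.transformer.layers
    except AttributeError:
        return False
    return all(hasattr(layer, 'mlp') for layer in layers)
```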
### Using FastMoE's model parallelization

In `megatron/training.py`, the `LocalDDP` module is replaced by the one in
`fmoe.megatron` to enable the sophisticated data-parallel strategies that can
parallelize the experts across both the data-parallel group and the (tensor)
model-parallel group.
```python
# from megatron.model import DistributedDataParallel as LocalDDP
from fmoe.megatron import DistributedDataParallel as LocalDDP
```
### Fix gradient clipping
Megatron-LM uses gradient normalization, which is incompatible with FastMoE:
an incorrect gradient norm leads to inconsistent parameter updates across
ranks. Apply `clip-grad-v2.2.patch` to fix the issue.
Note that only the 2-norm is implemented in the patch. If another norm type is
used, remember to implement it accordingly.
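
Conceptually, the patch computes the global norm in two parts: the norm of the replicated (non-expert) gradients, on which all ranks already agree, plus the norm of the expert gradients, which must be summed across ranks because each rank holds different experts. A plain-PyTorch sketch of that combination follows (the real patch uses `amp_C.multi_tensor_l2norm` for speed and also reduces the dense part across model-parallel ranks, which is omitted here):

```python
import torch
import torch.distributed as dist

def moe_grad_norm(grads_for_norm, grads_in_moe, norm_type=2.0):
    # Replicated parameters: identical on all ranks, no reduction needed.
    total = sum(torch.norm(g, norm_type) ** norm_type for g in grads_for_norm)
    # Expert parameters: each rank holds different experts, so their
    # contributions must be accumulated across the whole world.
    moe = sum(torch.norm(g, norm_type) ** norm_type for g in grads_in_moe)
    moe = torch.as_tensor(moe, device='cuda')
    dist.all_reduce(moe)
    return (total + moe) ** (1.0 / norm_type)
```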
### Train as usual
Start training with FastMoE by using the scripts provided by Megatron-LM.
diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py
index e8d0d02..fd6660a 100644
--- a/megatron/optimizer/clip_grads.py
+++ b/megatron/optimizer/clip_grads.py
@@ -52,6 +52,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# - should not be a replica due to tensor model parallelism
grads = []
grads_for_norm = []
+ grads_in_moe = []
for param in parameters:
grad_not_none = param.grad is not None
is_not_shared = not hasattr(param, 'shared') or not param.shared
@@ -63,7 +64,10 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
assert param.grad.type() == 'torch.cuda.FloatTensor'
grads.append(grad)
if grad_not_none and is_not_shared and is_not_tp_duplicate:
- grads_for_norm.append(grad)
+ if hasattr(param, 'dp_comm') and param.dp_comm in ('none'):
+ grads_in_moe.append(grad)
+ else:
+ grads_for_norm.append(grad)
# Norm parameters.
max_norm = float(max_norm)
@@ -72,6 +76,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# Calculate norm.
if norm_type == inf:
+ # TODO: moe
total_norm = max(grad.abs().max() for grad in grads_for_norm)
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
# Take max across all model-parallel GPUs.
@@ -96,7 +101,18 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# we need the pow(norm-type).
total_norm = grad_norm ** norm_type
+ grad_norm, _ = multi_tensor_applier(
+ amp_C.multi_tensor_l2norm,
+ dummy_overflow_buf,
+ [grads_in_moe],
+ False # no per-parameter norm
+ )
+ grad_norm = grad_norm ** norm_type
+ torch.distributed.all_reduce(grad_norm)
+ total_norm += grad_norm
+
else:
+ # TODO: moe
for grad in grads_for_norm:
grad_norm = torch.norm(grad, norm_type)
total_norm += grad_norm ** norm_type
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 26a7cec..0acfb22 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -21,6 +21,8 @@ import os
import torch
from megatron import fused_kernels
+from fmoe.megatron import add_fmoe_args as _add_fmoe_args
+
def parse_args(extra_args_provider=None, defaults={},
ignore_unknown_args=False):
"""Parse all arguments."""
@@ -40,6 +42,7 @@ def parse_args(extra_args_provider=None, defaults={},
parser = _add_data_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_realm_args(parser)
+ parser = _add_fmoe_args(parser)
# Custom arguments.
if extra_args_provider is not None:
diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
index 9d42260..2583db2 100644
--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -177,6 +177,8 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
param)
if hasattr(param, 'shared'):
main_param.shared = param.shared
+ if hasattr(param, 'dp_comm'):
+ main_param.dp_comm = param.dp_comm
# Replace the optimizer params with the new fp32 copy.
param_group['params'][i] = main_param
fp32_from_fp16_params_this_group.append(main_param)
diff --git a/megatron/training.py b/megatron/training.py
index 56d1c7c..f825bf3 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -35,20 +35,24 @@ from megatron import update_num_microbatches
from megatron import mpu
from megatron import print_rank_0
from megatron import print_rank_last
-from megatron.checkpointing import load_checkpoint
-from megatron.checkpointing import save_checkpoint
+# from megatron.checkpointing import load_checkpoint
+from fmoe.megatron.checkpoint import load_checkpoint
+# from megatron.checkpointing import save_checkpoint
+from fmoe.megatron.checkpoint import save_checkpoint
from megatron.model import FP16Module
from megatron.optimizer import get_megatron_optimizer
from megatron.initialize import initialize_megatron
from megatron.initialize import write_args_to_tensorboard
from megatron.learning_rates import AnnealingLR
-from megatron.model import DistributedDataParallel as LocalDDP
+# from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
from megatron.data.data_loaders import build_pretraining_data_loader
from megatron.utils import report_memory
+from fmoe.megatron import DistributedDataParallel as LocalDDP
+from fmoe.megatron import add_balance_log
def print_datetime(string):
"""Note that this call will sync across all ranks."""
@@ -102,6 +106,13 @@ def pretrain(train_valid_test_dataset_provider, model_provider,
args = get_args()
timers = get_timers()
+ # Initialize FastMoE
+ if args.fmoefy:
+ from fmoe.megatron import patch_forward_step, patch_model_provider
+
+ forward_step_func = patch_forward_step(forward_step_func)
+ model_provider = patch_model_provider(model_provider)
+
# Model, optimizer, and learning rate.
timers('model and optimizer').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
@@ -643,7 +654,7 @@ def train_step(forward_step_func, data_iterator,
def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
- loss_scale, report_memory_flag, skipped_iter):
+ loss_scale, report_memory_flag, skipped_iter, model):
"""Log training information such as losses, timing, ...."""
args = get_args()
timers = get_timers()
@@ -725,6 +736,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
args.consumed_train_samples)
timers.write(timers_to_log, writer, iteration,
normalizer=total_iterations)
+ if args.fmoefy and args.balance_strategy and args.balance_strategy != 'naive':
+ add_balance_log(model, writer, iteration)
if iteration % args.log_interval == 0:
elapsed_time = timers('interval time').elapsed()
@@ -816,7 +829,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
report_memory_flag = training_log(loss_dict, total_loss_dict,
optimizer.param_groups[0]['lr'],
iteration, loss_scale,
- report_memory_flag, skipped_iter)
+ report_memory_flag, skipped_iter, model)
# Autoresume
if args.adlr_autoresume and \
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This directory contains an example based on Zihang Dai et al.'s open-source
Transformer-XL [implementation](https://github.com/kimiyoung/transformer-xl) to
demonstrate the usage of FastMoE's layers.
The code is released under the Apache-2.0 license. Here, only the PyTorch part
of the code is used, with modifications in the `mem_transformer.py` file to
enable MoE training.
## Introduction
This directory contains our pytorch implementation of Transformer-XL. Note that our state-of-the-art results reported in the paper were obtained by training the model on a large-scale TPU cluster, and our pytorch codebase currently does not support distributed training. Here we provide two sets of hyperparameters and scripts:
- `*large.sh` are for the SoTA setting with large models which might not be directly runnable on a local GPU machine.
- `*base.sh` are for the base models which can be run on a few GPUs.
The pytorch implementation produces similar results to the TF codebase under the same settings in our preliminary experiments.
## Prerequisite
- Pytorch 0.4: `conda install pytorch torchvision -c pytorch`
## Data Preparation
`bash getdata.sh`
## Training and Evaluation
#### Replicate the "bpc = 1.06" result on `enwik8` with a 12-layer Transformer-XL
- Make sure the machine has **4 GPUs**, each with **at least 11GB of memory**
- Training
`bash run_enwik8_base.sh train --work_dir PATH_TO_WORK_DIR`
- Evaluation
`bash run_enwik8_base.sh eval --work_dir PATH_TO_WORK_DIR`
#### Replicate the "PPL = 24.03" result on `wikitext-103` with Transformer-XL
- Make sure the machine has **4 GPUs**, each with **at least 11GB of memory**
- Training
`bash run_wt103_base.sh train --work_dir PATH_TO_WORK_DIR`
- Evaluation
`bash run_wt103_base.sh eval --work_dir PATH_TO_WORK_DIR`
#### Other options:
- `--batch_chunk`: this option allows one to trade speed for memory. For `batch_chunk > 1`, the program splits each training batch into `batch_chunk` sub-batches and performs forward and backward on each sub-batch sequentially, with the gradients accumulated and divided by `batch_chunk`. Hence, the memory usage is proportionally lower, while the computation time is correspondingly higher (see the sketch after this list).
- `--div_val`: when using adaptive softmax and embedding, the embedding dimension is divided by `div_val` from bin $i$ to bin $i+1$. This saves both GPU memory and the parameter budget.
- `--fp16` and `--dynamic-loss-scale`: Run in pseudo-fp16 mode (fp16 storage, fp32 math) with dynamic loss scaling.
- Note: to explore the `--fp16` option, please make sure the `apex` package is installed (https://github.com/NVIDIA/apex/).
- To see performance without the recurrence mechanism, simply use `mem_len=0` in all your scripts.
- To see performance of a standard Transformer without relative positional encodings or recurrence mechanisms, use `attn_type=2` and `mem_len=0`.
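
As referenced in the `--batch_chunk` item above, a sketch of the accumulation scheme (a hypothetical training-step fragment, not the repository's actual code):

```python
def train_step(model, data, target, optimizer, criterion, batch_chunk):
    # Split the [bptt x bsz] batch along the batch dimension and accumulate
    # gradients, trading speed for a smaller activation-memory footprint.
    optimizer.zero_grad()
    for sub_data, sub_target in zip(data.chunk(batch_chunk, 1),
                                    target.chunk(batch_chunk, 1)):
        loss = criterion(model(sub_data), sub_target)
        (loss / batch_chunk).backward()  # divide so the sum matches one full batch
    optimizer.step()
```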
#### Other datasets:
- `Text8` character-level language modeling: check out `run_text8_base.sh`
- `lm1b` word-level language modeling: check out `run_lm1b_base.sh`
import os, sys
import glob
from collections import Counter, OrderedDict
import numpy as np
import torch
from utils.vocabulary import Vocab
class LMOrderedIterator(object):
    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device

        # Work out how cleanly we can divide the dataset into bsz parts.
        self.n_step = data.size(0) // bsz

        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, self.n_step * bsz)

        # Evenly divide the data across the bsz batches.
        self.data = data.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        if bptt is None: bptt = self.bptt
        seq_len = min(bptt, self.data.size(0) - 1 - i)

        end_idx = i + seq_len
        beg_idx = max(0, i - self.ext_len)

        data = self.data[beg_idx:end_idx]
        target = self.data[i+1:i+1+seq_len]

        return data, target, seq_len

    def get_fixlen_iter(self, start=0):
        for i in range(start, self.data.size(0) - 1, self.bptt):
            yield self.get_batch(i)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        max_len = self.bptt + max_deviation * std
        i = start
        while True:
            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
            data, target, seq_len = self.get_batch(i, bptt)
            i += seq_len
            yield data, target, seq_len
            if i >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()


class LMShuffledIterator(object):
    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        # index iterator
        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
            else np.array(range(len(self.data)))

        # sentence iterator
        for idx in epoch_indices:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            valid_batch = True

            for i in range(self.bsz):
                n_filled = 0
                try:
                    while n_filled < self.bptt:
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sent_stream)
                        # number of new tokens to fill in
                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
                            streams[i][:n_new]
                        target[n_filled:n_filled+n_new, i] = \
                            streams[i][1:n_new+1]
                        streams[i] = streams[i][n_new:]
                        n_filled += n_new
                except StopIteration:
                    valid_batch = False
                    break

            if not valid_batch:
                return

            data = data.to(self.device)
            target = target.to(self.device)

            yield data, target, self.bptt

            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        sent_stream = self.get_sent_stream()

        for batch in self.stream_iterator(sent_stream):
            yield batch


class LMMultiFileIterator(LMShuffledIterator):
    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
                 shuffle=False):

        self.paths = paths
        self.vocab = vocab

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(sents)
        sent_stream = iter(sents)

        return sent_stream

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.paths)

        for path in self.paths:
            # sent_stream is an iterator
            sent_stream = self.get_sent_stream(path)
            for batch in self.stream_iterator(sent_stream):
                yield batch


class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
            self.vocab.count_file(os.path.join(path, 'train.txt'))
            self.vocab.count_file(os.path.join(path, 'valid.txt'))
            self.vocab.count_file(os.path.join(path, 'test.txt'))
        elif self.dataset == 'wt103':
            self.vocab.count_file(os.path.join(path, 'train.txt'))
        elif self.dataset == 'lm1b':
            train_path_pattern = os.path.join(
                path, '1-billion-word-language-modeling-benchmark-r13output',
                'training-monolingual.tokenized.shuffled', 'news.en-*')
            train_paths = glob.glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called

        self.vocab.build_vocab()

        if self.dataset in ['ptb', 'wt2', 'wt103']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True)
        elif self.dataset in ['enwik8', 'text8']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
        elif self.dataset == 'lm1b':
            self.train = train_paths
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)

    def get_iterator(self, split, *args, **kwargs):
        if split == 'train':
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
            elif self.dataset == 'lm1b':
                kwargs['shuffle'] = True
                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
        elif split in ['valid', 'test']:
            data = self.valid if split == 'valid' else self.test
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(data, *args, **kwargs)
            elif self.dataset == 'lm1b':
                data_iter = LMShuffledIterator(data, *args, **kwargs)

        return data_iter


def get_lm_corpus(datadir, dataset):
    fn = os.path.join(datadir, 'cache.pt')
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset {}...'.format(dataset))
        kwargs = {}
        if dataset in ['wt103', 'wt2']:
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = False
        elif dataset == 'ptb':
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = True
        elif dataset == 'lm1b':
            kwargs['special'] = []
            kwargs['lower_case'] = False
            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
        elif dataset in ['enwik8', 'text8']:
            pass

        corpus = Corpus(datadir, dataset, **kwargs)
        torch.save(corpus, fn)

    return corpus


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='unit test')
    parser.add_argument('--datadir', type=str, default='../data/text8',
                        help='location of the data corpus')
    parser.add_argument('--dataset', type=str, default='text8',
                        choices=['ptb', 'wt2', 'wt103', 'lm1b', 'enwik8', 'text8'],
                        help='dataset name')
    args = parser.parse_args()

    corpus = get_lm_corpus(args.datadir, args.dataset)
    print('Vocab size : {}'.format(len(corpus.vocab.idx2sym)))
# coding: utf-8
import argparse
import time
import math
import os, sys
import torch
from data_utils import get_lm_corpus
from mem_transformer import MemTransformerLM
from utils.exp_utils import get_logger
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument('--data', type=str, default='../data/wikitext-103',
                    help='location of the data corpus')
parser.add_argument('--dataset', type=str, default='wt103',
                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
                    help='dataset name')
parser.add_argument('--split', type=str, default='all',
                    choices=['all', 'valid', 'test'],
                    help='which split to evaluate')
parser.add_argument('--batch_size', type=int, default=10,
                    help='batch size')
parser.add_argument('--tgt_len', type=int, default=5,
                    help='number of tokens to predict')
parser.add_argument('--ext_len', type=int, default=0,
                    help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=0,
                    help='length of the retained previous heads')
parser.add_argument('--clamp_len', type=int, default=-1,
                    help='max positional embedding index')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--work_dir', type=str, required=True,
                    help='path to the work_dir')
parser.add_argument('--no_log', action='store_true',
                    help='do not log the eval result')
parser.add_argument('--same_length', action='store_true',
                    help='set same length attention with masking')
args = parser.parse_args()
assert args.ext_len >= 0, 'extended context length must be non-negative'

device = torch.device("cuda" if args.cuda else "cpu")

# Get logger
logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
                     log_=not args.no_log)

# Load dataset
corpus = get_lm_corpus(args.data, args.dataset)
ntokens = len(corpus.vocab)

va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)

# Load the best saved model.
with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
    model = torch.load(f)
model.backward_compatible()
model = model.to(device)

logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
    args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
if args.clamp_len > 0:
    model.clamp_len = args.clamp_len
if args.same_length:
    model.same_length = True

###############################################################################
# Evaluation code
###############################################################################
def evaluate(eval_iter):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_len, total_loss = 0, 0.
    start_time = time.time()
    with torch.no_grad():
        mems = tuple()
        for idx, (data, target, seq_len) in enumerate(eval_iter):
            ret = model(data, target, *mems)
            loss, mems = ret[0], ret[1:]
            loss = loss.mean()
            total_loss += seq_len * loss.item()
            total_len += seq_len
        total_time = time.time() - start_time
    logging('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / (idx+1)))
    return total_loss / total_len

# Run on test data.
if args.split == 'all':
    test_loss = evaluate(te_iter)
    valid_loss = evaluate(va_iter)
elif args.split == 'valid':
    valid_loss = evaluate(va_iter)
    test_loss = None
elif args.split == 'test':
    test_loss = evaluate(te_iter)
    valid_loss = None

def format_log(loss, split):
    if args.dataset in ['enwik8', 'text8']:
        log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format(
            split, loss, loss / math.log(2))
    else:
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
    return log_str

log_str = ''
if valid_loss is not None:
    log_str += format_log(valid_loss, 'valid')
if test_loss is not None:
    log_str += format_log(test_loss, 'test')

logging('=' * 100)
logging(log_str)
logging('=' * 100)