#include "../balancing.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
int main(int argc, char* args[]) {
int n_worker = atoi(args[1]);
int n_expert = atoi(args[2]);
int cap_v = atoi(args[3]);
int tot_expert = n_worker * n_expert;
long* lec = new long[tot_expert];
for (int i = 0; i < tot_expert; ++i) {
lec[i] = i;
}
long* g_lec;
cudaMalloc(&g_lec, sizeof(long) * tot_expert);
cudaMemcpy(g_lec, lec, sizeof(long) * tot_expert, cudaMemcpyHostToDevice);
int* cap = new int[n_expert];
for (int i = 0; i < n_expert; ++i) {
cap[i] = cap_v;
}
int* g_cap;
cudaMalloc(&g_cap, sizeof(int) * n_expert);
cudaMemcpy(g_cap, cap, sizeof(int) * n_expert, cudaMemcpyHostToDevice);
long* eca = new long[tot_expert];
long* g_eca;
cudaMalloc(&g_eca, sizeof(long) * tot_expert);
auto smgr = getCudaStreamManager(0);
fmoe_cuda_limit_by_capacity_impl(g_lec, g_cap, g_eca, n_expert, n_worker, smgr);
cudaMemcpy(cap, g_cap, sizeof(int) * n_expert, cudaMemcpyDeviceToHost);
cudaMemcpy(eca, g_eca, sizeof(long) * tot_expert, cudaMemcpyDeviceToHost);
printf("%d\n", cap[0]);
for (int i = 0; i < tot_expert; ++i) {
printf("%ld %ld\n", lec[i], eca[i]);
}
}
#include "../balancing.cuh"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
int main(int argc, char* args[]) {
int n_worker = atoi(args[1]);
int n_expert = atoi(args[2]);
int batch_size = atoi(args[3]);
int tot_expert = n_worker * n_expert;
long* gate_idx = new long[batch_size];
long* n_gate_idx = new long[batch_size];
long* lec = new long[tot_expert];
memset(lec, 0, sizeof(long) * tot_expert);
for (int i = 0; i < batch_size; ++i) {
gate_idx[i] = rand() % tot_expert;
++lec[gate_idx[i]];
}
for (int i = 0; i < tot_expert; ++i) {
lec[i] >>= 1;
}
long* g_lec;
cudaMalloc(&g_lec, sizeof(long) * tot_expert);
cudaMemcpy(g_lec, lec, sizeof(long) * tot_expert, cudaMemcpyHostToDevice);
int* g_new_lec;
cudaMalloc(&g_new_lec, sizeof(int) * tot_expert);
long* g_gate_idx;
cudaMalloc(&g_gate_idx, sizeof(long) * batch_size);
cudaMemcpy(g_gate_idx, gate_idx, sizeof(long) * batch_size, cudaMemcpyHostToDevice);
auto smgr = getCudaStreamManager(0);
fmoe_cuda_prune_gate_by_capacity_impl(g_gate_idx, g_lec, g_new_lec,
batch_size, n_expert, n_worker, smgr);
cudaMemcpy(n_gate_idx, g_gate_idx, sizeof(long) * batch_size, cudaMemcpyDeviceToHost);
for (int i = 0; i < batch_size; ++i) {
printf("%ld %ld (%d)\n", gate_idx[i], n_gate_idx[i], lec[gate_idx[i]]);
}
}
#ifndef CUBLAS_WRAPPER_H
#define CUBLAS_WRAPPER_H

#include <cublas_v2.h>
#include <c10/util/Half.h>

// Type-dispatching wrappers that select the proper cuBLAS (or rocBLAS,
// when FMOE_USE_HIP is defined) GEMM routine by overload resolution.

inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle,
        cublasOperation_t transa,
        cublasOperation_t transb,
        int m, int n, int k,
        const float *alpha,
        const float *Aarray[], int lda,
        const float *Barray[], int ldb,
        const float *beta,
        float *Carray[], int ldc,
        int batchCount) {
#ifdef FMOE_USE_HIP
    return rocblas_sgemm_batched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#else
    return cublasSgemmBatched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#endif
}

inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle,
        cublasOperation_t transa,
        cublasOperation_t transb,
        int m, int n, int k,
        const double *alpha,
        const double *Aarray[], int lda,
        const double *Barray[], int ldb,
        const double *beta,
        double *Carray[], int ldc,
        int batchCount) {
#ifdef FMOE_USE_HIP
    return rocblas_dgemm_batched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#else
    return cublasDgemmBatched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#endif
}

inline cublasStatus_t cublasXgemmBatched(cublasHandle_t handle,
        cublasOperation_t transa,
        cublasOperation_t transb,
        int m, int n, int k,
        const __half *alpha,
        const __half *Aarray[], int lda,
        const __half *Barray[], int ldb,
        const __half *beta,
        __half *Carray[], int ldc,
        int batchCount) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm_batched(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half* const*)Aarray, lda,
            (const rocblas_half* const*)Barray, ldb,
            (const rocblas_half*)beta,
            (rocblas_half* const*)Carray, ldc,
            batchCount);
#else
    return cublasHgemmBatched(handle, transa, transb, m, n, k,
            alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const float *alpha,
        const float *A, int lda,
        const float *B, int ldb,
        const float *beta,
        float *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_sgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#else
    return cublasSgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const double *alpha,
        const double *A, int lda,
        const double *B, int ldb,
        const double *beta,
        double *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_dgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#else
    return cublasDgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const __half *alpha,
        const __half *A, int lda,
        const __half *B, int ldb,
        const __half *beta,
        __half *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half*)A, lda,
            (const rocblas_half*)B, ldb,
            (const rocblas_half*)beta,
            (rocblas_half*)C, ldc);
#else
    return cublasHgemm(handle, transa, transb, m, n, k,
            alpha, A, lda, B, ldb, beta, C, ldc);
#endif
}

inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
        cublasOperation_t transa, cublasOperation_t transb,
        int m, int n, int k,
        const c10::Half *alpha,
        const c10::Half *A, int lda,
        const c10::Half *B, int ldb,
        const c10::Half *beta,
        c10::Half *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half*)A, lda,
            (const rocblas_half*)B, ldb,
            (const rocblas_half*)beta,
            (rocblas_half*)C, ldc);
#else
    return cublasHgemm(handle, transa, transb, m, n, k,
            (const __half*)alpha,
            (const __half*)A, lda,
            (const __half*)B, ldb,
            (const __half*)beta,
            (__half*)C, ldc);
#endif
}

#endif  // CUBLAS_WRAPPER_H
#ifndef FMOE_UTILS_H
#define FMOE_UTILS_H

#define CHECK_CUDA(x) AT_ASSERTM(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Ceiling division, typically used to compute CUDA grid dimensions.
#define CEIL(_x_,_y_) (((_x_)-1)/(_y_)+1)

#endif  // FMOE_UTILS_H
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking
// This file is clipped from the original header file by laekov
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef HELPER_CUDA_H
#define HELPER_CUDA_H
static const char *_cudaGetErrorEnum(cudaError_t error) {
    return cudaGetErrorName(error);
}
#ifdef CUDA_DRIVER_API
// CUDA Driver API errors
static const char *_cudaGetErrorEnum(CUresult error) {
    static char unknown[] = "<unknown>";
    const char *ret = NULL;
    cuGetErrorName(error, &ret);
    return ret ? ret : unknown;
}
#endif
#ifdef FMOE_USE_HIP
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
switch (error) {
case rocblas_status_success:
return "rocblas_status_success";
case rocblas_status_invalid_handle:
return "rocblas_status_invalid_handle";
case rocblas_status_not_implemented:
return "rocblas_status_not_implemented";
case rocblas_status_invalid_pointer:
return "rocblas_status_invalid_pointer:";
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
case rocblas_status_memory_error:
return "rocblas_status_memory_error";
case rocblas_status_internal_error:
return "rocblas_status_internal_error";
case rocblas_status_perf_degraded:
return "rocblas_status_perf_degraded";
case rocblas_status_size_query_mismatch:
return "rocblas_status_size_query_mismatch";
case rocblas_status_size_increased:
return "rocblas_status_size_increased";
case rocblas_status_size_unchanged:
return "rocblas_status_size_unchanged";
case rocblas_status_invalid_value:
return "rocblas_status_invalid_value";
case rocblas_status_continue:
return "rocblas_status_continue";
}
return "<unknown>";
}
#else
// cuBLAS API errors
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
#endif
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error) {
switch (error) {
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
case CUFFT_INCOMPLETE_PARAMETER_LIST:
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
case CUFFT_INVALID_DEVICE:
return "CUFFT_INVALID_DEVICE";
case CUFFT_PARSE_ERROR:
return "CUFFT_PARSE_ERROR";
case CUFFT_NO_WORKSPACE:
return "CUFFT_NO_WORKSPACE";
case CUFFT_NOT_IMPLEMENTED:
return "CUFFT_NOT_IMPLEMENTED";
case CUFFT_LICENSE_ERROR:
return "CUFFT_LICENSE_ERROR";
case CUFFT_NOT_SUPPORTED:
return "CUFFT_NOT_SUPPORTED";
}
return "<unknown>";
}
#endif
#ifdef CUSPARSEAPI
// cuSPARSE API errors
static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
switch (error) {
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
#endif
#ifdef CUSOLVER_COMMON_H_
// cuSOLVER API errors
static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
switch (error) {
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_STATUS_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_MAPPING_ERROR:
return "CUSOLVER_STATUS_MAPPING_ERROR";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSOLVER_STATUS_NOT_SUPPORTED:
return "CUSOLVER_STATUS_NOT_SUPPORTED ";
case CUSOLVER_STATUS_ZERO_PIVOT:
return "CUSOLVER_STATUS_ZERO_PIVOT";
case CUSOLVER_STATUS_INVALID_LICENSE:
return "CUSOLVER_STATUS_INVALID_LICENSE";
}
return "<unknown>";
}
#endif
#ifdef CURAND_H_
// cuRAND API errors
static const char *_cudaGetErrorEnum(curandStatus_t error) {
switch (error) {
case CURAND_STATUS_SUCCESS:
return "CURAND_STATUS_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
#ifdef NVJPEGAPI
// nvJPEG API errors
static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
switch (error) {
case NVJPEG_STATUS_SUCCESS:
return "NVJPEG_STATUS_SUCCESS";
case NVJPEG_STATUS_NOT_INITIALIZED:
return "NVJPEG_STATUS_NOT_INITIALIZED";
case NVJPEG_STATUS_INVALID_PARAMETER:
return "NVJPEG_STATUS_INVALID_PARAMETER";
case NVJPEG_STATUS_BAD_JPEG:
return "NVJPEG_STATUS_BAD_JPEG";
case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
case NVJPEG_STATUS_ALLOCATOR_FAILURE:
return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
case NVJPEG_STATUS_EXECUTION_FAILED:
return "NVJPEG_STATUS_EXECUTION_FAILED";
case NVJPEG_STATUS_ARCH_MISMATCH:
return "NVJPEG_STATUS_ARCH_MISMATCH";
case NVJPEG_STATUS_INTERNAL_ERROR:
return "NVJPEG_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
#ifdef NV_NPPIDEFS_H
// NPP API errors
static const char *_cudaGetErrorEnum(NppStatus error) {
switch (error) {
case NPP_NOT_SUPPORTED_MODE_ERROR:
return "NPP_NOT_SUPPORTED_MODE_ERROR";
case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
case NPP_RESIZE_NO_OPERATION_ERROR:
return "NPP_RESIZE_NO_OPERATION_ERROR";
case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
case NPP_BAD_ARG_ERROR:
return "NPP_BAD_ARGUMENT_ERROR";
case NPP_COEFF_ERROR:
return "NPP_COEFFICIENT_ERROR";
case NPP_RECT_ERROR:
return "NPP_RECTANGLE_ERROR";
case NPP_QUAD_ERROR:
return "NPP_QUADRANGLE_ERROR";
case NPP_MEM_ALLOC_ERR:
return "NPP_MEMORY_ALLOCATION_ERROR";
case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
case NPP_INVALID_INPUT:
return "NPP_INVALID_INPUT";
case NPP_POINTER_ERROR:
return "NPP_POINTER_ERROR";
case NPP_WARNING:
return "NPP_WARNING";
case NPP_ODD_ROI_WARNING:
return "NPP_ODD_ROI_WARNING";
#else
// These are for CUDA 5.5 or higher
case NPP_BAD_ARGUMENT_ERROR:
return "NPP_BAD_ARGUMENT_ERROR";
case NPP_COEFFICIENT_ERROR:
return "NPP_COEFFICIENT_ERROR";
case NPP_RECTANGLE_ERROR:
return "NPP_RECTANGLE_ERROR";
case NPP_QUADRANGLE_ERROR:
return "NPP_QUADRANGLE_ERROR";
case NPP_MEMORY_ALLOCATION_ERR:
return "NPP_MEMORY_ALLOCATION_ERROR";
case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
case NPP_INVALID_HOST_POINTER_ERROR:
return "NPP_INVALID_HOST_POINTER_ERROR";
case NPP_INVALID_DEVICE_POINTER_ERROR:
return "NPP_INVALID_DEVICE_POINTER_ERROR";
#endif
case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
case NPP_TEXTURE_BIND_ERROR:
return "NPP_TEXTURE_BIND_ERROR";
case NPP_WRONG_INTERSECTION_ROI_ERROR:
return "NPP_WRONG_INTERSECTION_ROI_ERROR";
case NPP_NOT_EVEN_STEP_ERROR:
return "NPP_NOT_EVEN_STEP_ERROR";
case NPP_INTERPOLATION_ERROR:
return "NPP_INTERPOLATION_ERROR";
case NPP_RESIZE_FACTOR_ERROR:
return "NPP_RESIZE_FACTOR_ERROR";
case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
case NPP_MEMFREE_ERR:
return "NPP_MEMFREE_ERR";
case NPP_MEMSET_ERR:
return "NPP_MEMSET_ERR";
case NPP_MEMCPY_ERR:
return "NPP_MEMCPY_ERROR";
case NPP_MIRROR_FLIP_ERR:
return "NPP_MIRROR_FLIP_ERR";
#else
case NPP_MEMFREE_ERROR:
return "NPP_MEMFREE_ERROR";
case NPP_MEMSET_ERROR:
return "NPP_MEMSET_ERROR";
case NPP_MEMCPY_ERROR:
return "NPP_MEMCPY_ERROR";
case NPP_MIRROR_FLIP_ERROR:
return "NPP_MIRROR_FLIP_ERROR";
#endif
case NPP_ALIGNMENT_ERROR:
return "NPP_ALIGNMENT_ERROR";
case NPP_STEP_ERROR:
return "NPP_STEP_ERROR";
case NPP_SIZE_ERROR:
return "NPP_SIZE_ERROR";
case NPP_NULL_POINTER_ERROR:
return "NPP_NULL_POINTER_ERROR";
case NPP_CUDA_KERNEL_EXECUTION_ERROR:
return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
case NPP_NOT_IMPLEMENTED_ERROR:
return "NPP_NOT_IMPLEMENTED_ERROR";
case NPP_ERROR:
return "NPP_ERROR";
case NPP_SUCCESS:
return "NPP_SUCCESS";
case NPP_WRONG_INTERSECTION_QUAD_WARNING:
return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
case NPP_MISALIGNED_DST_ROI_WARNING:
return "NPP_MISALIGNED_DST_ROI_WARNING";
case NPP_AFFINE_QUAD_INCORRECT_WARNING:
return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
case NPP_DOUBLE_SIZE_WARNING:
return "NPP_DOUBLE_SIZE_WARNING";
case NPP_WRONG_INTERSECTION_ROI_WARNING:
return "NPP_WRONG_INTERSECTION_ROI_WARNING";
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
/* These are 6.0 or higher */
case NPP_LUT_PALETTE_BITSIZE_ERROR:
return "NPP_LUT_PALETTE_BITSIZE_ERROR";
case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
case NPP_QUALITY_INDEX_ERROR:
return "NPP_QUALITY_INDEX_ERROR";
case NPP_CHANNEL_ORDER_ERROR:
return "NPP_CHANNEL_ORDER_ERROR";
case NPP_ZERO_MASK_VALUE_ERROR:
return "NPP_ZERO_MASK_VALUE_ERROR";
case NPP_NUMBER_OF_CHANNELS_ERROR:
return "NPP_NUMBER_OF_CHANNELS_ERROR";
case NPP_COI_ERROR:
return "NPP_COI_ERROR";
case NPP_DIVISOR_ERROR:
return "NPP_DIVISOR_ERROR";
case NPP_CHANNEL_ERROR:
return "NPP_CHANNEL_ERROR";
case NPP_STRIDE_ERROR:
return "NPP_STRIDE_ERROR";
case NPP_ANCHOR_ERROR:
return "NPP_ANCHOR_ERROR";
case NPP_MASK_SIZE_ERROR:
return "NPP_MASK_SIZE_ERROR";
case NPP_MOMENT_00_ZERO_ERROR:
return "NPP_MOMENT_00_ZERO_ERROR";
case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
case NPP_THRESHOLD_ERROR:
return "NPP_THRESHOLD_ERROR";
case NPP_CONTEXT_MATCH_ERROR:
return "NPP_CONTEXT_MATCH_ERROR";
case NPP_FFT_FLAG_ERROR:
return "NPP_FFT_FLAG_ERROR";
case NPP_FFT_ORDER_ERROR:
return "NPP_FFT_ORDER_ERROR";
case NPP_SCALE_RANGE_ERROR:
return "NPP_SCALE_RANGE_ERROR";
case NPP_DATA_TYPE_ERROR:
return "NPP_DATA_TYPE_ERROR";
case NPP_OUT_OFF_RANGE_ERROR:
return "NPP_OUT_OFF_RANGE_ERROR";
case NPP_DIVIDE_BY_ZERO_ERROR:
return "NPP_DIVIDE_BY_ZERO_ERROR";
case NPP_RANGE_ERROR:
return "NPP_RANGE_ERROR";
case NPP_NO_MEMORY_ERROR:
return "NPP_NO_MEMORY_ERROR";
case NPP_ERROR_RESERVED:
return "NPP_ERROR_RESERVED";
case NPP_NO_OPERATION_WARNING:
return "NPP_NO_OPERATION_WARNING";
case NPP_DIVIDE_BY_ZERO_WARNING:
return "NPP_DIVIDE_BY_ZERO_WARNING";
#endif
#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
/* These are 7.0 or higher */
case NPP_OVERFLOW_ERROR:
return "NPP_OVERFLOW_ERROR";
case NPP_CORRUPTED_DATA_ERROR:
return "NPP_CORRUPTED_DATA_ERROR";
#endif
}
return "<unknown>";
}
#endif
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                static_cast<unsigned int>(result), _cudaGetErrorEnum(result),
                func);
        exit(EXIT_FAILURE);
    }
}
// This will output the proper CUDA error strings in the event
// that a CUDA host call returns an error
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#endif // HELPER_CUDA_H
#ifndef TIMER_HH
#define TIMER_HH
/*
* This part of code is not used.
#include <chrono>
inline double getDuration(std::chrono::time_point<std::chrono::system_clock> a,
std::chrono::time_point<std::chrono::system_clock> b) {
return std::chrono::duration<double>(b - a).count();
}
#define timestamp(__var__) auto __var__ = std::chrono::system_clock::now();
*/
#endif // TIMER_HH
FastMoE
===

[Release notes](release-note.md)
| [Slack invitation link](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw)

## Introduction

FastMoE is an easy-to-use and efficient system for training MoE models, built on PyTorch.

## Installation

### Prerequisites

PyTorch with CUDA support is required. The current version of FastMoE has been tested on PyTorch v1.8.0 with CUDA 10. The system is also designed to support older versions of PyTorch.

If the model-parallel feature of FastMoE is to be used, an NCCL library that supports point-to-point communication (i.e. version `2.7.5` or newer) is also required.

### Installing

FastMoE contains a set of customized PyTorch operators with C components. Simply run `python setup.py install` to install FastMoE.

FastMoE's distributed model-parallel feature is disabled by default. To enable it, set the environment variable `USE_NCCL=1` when running the command above, e.g. `USE_NCCL=1 python setup.py install`.

Note that since the PyTorch framework usually ships only the runtime part of NCCL, an additional NCCL development package matching the PyTorch version has to be installed in the build environment. The [official PyTorch Docker images](https://hub.docker.com/r/pytorch/pytorch) are recommended, as their environments are relatively clean. If you prefer to set up the environment by hand, an appropriate NCCL development package can be downloaded from the [NCCL legacy downloads page](https://developer.nvidia.com/nccl/nccl-legacy-downloads).

## Usage

### FMoE-fying a Transformer model

Transformer is currently the most popular model to which MoE is applied. FastMoE can turn an ordinary Transformer model into an MoE model with a single function call, as follows.

For example, in [Megatron-LM](https://github.com/nvidia/megatron-lm), adding the code below replaces each MLP layer of the Transformer with an MoE network composed of multiple MLP experts.
```python
model = ...
from fmoe.megatron import fmoefy
model = fmoefy(model, num_experts=<number of experts per worker>)
train(model, ...)
```
A more detailed example of using the `fmoefy` function in Megatron-LM can be found [here](../examples/megatron).

### Using FastMoE as a network module

A Transformer model built with FastMoE can be found in [this example](../examples/transformer-xl). The simplest usage is to replace the `MLP` layers with `FMoE` layers, as sketched below.
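
For reference, here is a minimal sketch of such a replacement. The `FMoETransformerMLP` module and its `num_expert` / `d_model` / `d_hidden` arguments are assumed from this release's API; check them against your installed version.

```python
import torch
from fmoe import FMoETransformerMLP

class Block(torch.nn.Module):
    """One Transformer block whose feed-forward part is an MoE layer."""
    def __init__(self, d_model=512, n_head=8):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(d_model, n_head)
        # An MoE of 4 expert MLPs instead of one dense two-layer MLP.
        self.mlp = FMoETransformerMLP(num_expert=4, d_model=d_model,
                                      d_hidden=4 * d_model)

    def forward(self, x):
        x = x + self.attn(x, x, x)[0]
        x = x + self.mlp(x)  # the gate routes each token to an expert
        return x
```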
### Using FastMoE in distributed training

FastMoE supports both data parallelism and model parallelism.

#### Data parallelism

In FastMoE's data-parallel mode, both the gate network and the expert networks are replicated on each compute unit. The figure below shows the forward pass of a 2-way data-parallel MoE model with three experts.

<p align="center">
<img src="fastmoe_data_parallel.png" width="600">
</p>

No extra code is needed for data parallelism. FastMoE works seamlessly with both PyTorch's `DataParallel` and `DistributedDataParallel` modules. The only drawback of this mode is that the number of experts is limited by the memory size of a single compute unit (e.g. a GPU).
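
As a sketch (assuming a standard `torch.distributed` process group has been initialized; `build_model` and `local_rank` are placeholders for your own setup):

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

# Every rank holds a full replica of the model, including all experts,
# so plain DDP gradient averaging is sufficient.
model = build_model().cuda(local_rank)
model = DDP(model, device_ids=[local_rank])
```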
#### Model parallelism

In FastMoE's model-parallel mode, the gate network is still replicated on each compute unit, but the expert networks are placed separately on the compute units. Thus, by introducing extra communication, FastMoE allows many more experts to be trained simultaneously, with the limit on their number growing in proportion to the number of compute units.

The figure below shows a model with six experts trained with 2-way model parallelism. Note that experts 1-3 are placed on the first compute unit, while experts 4-6 are placed on the second one.

<p align="center">
<img src="fastmoe_model_parallel.png" width="600">
</p>

FastMoE's model-parallel mode requires a dedicated parallel strategy, which neither PyTorch nor Megatron-LM supports. Therefore, the `fmoe.DistributedGroupedDataParallel` module has to be used in place of PyTorch's DDP module.
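
A sketch of the replacement (`build_moe_model` is a placeholder; the module name is as documented above):

```python
from fmoe import DistributedGroupedDataParallel

# Experts differ across ranks, so their gradients must only be reduced
# within the proper communication groups; FastMoE's customized DDP
# module handles this, unlike torch.nn.parallel.DistributedDataParallel.
model = build_moe_model().cuda()
model = DistributedGroupedDataParallel(model)
```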
## Q&A / Discussion

If you have any questions about using FastMoE, or if you are interested in contributing to FastMoE, feel free to join our [Slack workspace](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw).
## v0.3.0

### FMoE core

* Previous `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch and process a slice of the input. `mp_group` will be deprecated in our next release.
* ROCm supported.
* `FMoELinear` is moved to a stand-alone file.

### Grouped data parallel

* Support any group name by their relative tag name.

### Load balancing

* A brand-new balancing strategy, SWIPE. Contributed by authors of a (currently unpublished) paper.
* A property `has_loss` is added to each gate, in order to identify whether balance loss should be collected.

### Megatron-LM support

* Experts are partitioned by tensor model parallelism in `mp_group`, instead of expert parallelism.
* Support arbitrary customized gates in `MegatronMLP`.
* Move the patches to a stand-alone file.

### Tests

* Move util functions into `test_ddp.py`.

## v0.2.1

### Load balancing

* Fix gradient for balance loss.

### Misc

* Typos.
* Update benchmark interface.
* Remove some redundant code for performance improvement.
* Enable `USE_NCCL` by default.
* Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.

### Megatron adaption

* Patch for numerical correctness of gradient clipping.
* Support for pipeline parallelism.

## v0.2.0

### Load balancing

* A brand-new gate module with capacity-related utilities.
* GShard's and Switch Transformer's balance strategies are implemented as integrated gates.
* Balance loss is enabled.
* Balance monitor is provided.

### Checkpointing

* MoE models can be loaded and saved by fmoe's checkpointing module.

### Performance

* FP16 training performance is improved.

### Misc

* CUDA code directory is reconstructed.
* More tests are added.

## v0.1.2

### Compilation

- Remove dependency on the CUDA examples repository.

### Distributed

- Fix a bug related to PyTorch v1.8.0. FastMoE can now operate on multiple GPUs
on multiple nodes with PyTorch v1.8.0.

### Misc

- Fix tons of typos.
- Format the code.

## v0.1.1

### Distributed

- Broadcast data-parallel parameters before training.

### Megatron adaption

- Initialize `FMoELinear` parameters with a different seed on each model-parallel rank, even when the same random seed is used in Megatron.
- Use the proper communicators for model parallelism and data parallelism.

### Transformer-XL example

- Improve scripts.

### Misc

- Logo and Slack workspace link.
- Document in Chinese.
- Figures to explain how FastMoE works.

## v0.1.0

### Functions

- A model-injection-style easy-to-use user interface for Megatron-LM.
- Support both data parallel and model parallel, and a hybrid of the two.
- Provide a new customized DDP module to synchronize in different comm groups.
- Support customized `nn.Module` as an expert.

### Document and infrastructure

- Use PyTest.
- Set up PyLint.
- Installation and usage guide.
- Explanation of functions and code structure in code.

### Performance

- A benchmark to compare FastMoE and the old PyTorch implementation.
FastMoE works with different versions of
[Megatron-LM](https://github.com/nvidia/megatron-lm).
See `fmoe/megatron/utils.py` for arguments of FastMoE.
An example patch is provided for the `v2.2` release. It can be applied directly
to add FastMoE support if you are using Megatron-LM v2.2; otherwise, you may
need to enable FastMoE in your codebase manually.
The patch includes the following modifications.
### Add arguments to Megatron's argparser
In `megatron/arguments.py`, add `_add_fmoe_args` to the parser.
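
The corresponding change is condensed below (a sketch of the same hunk that appears in the full diff later in this document):

```python
# megatron/arguments.py
from fmoe.megatron import add_fmoe_args as _add_fmoe_args

def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse all arguments."""
    ...
    parser = _add_realm_args(parser)
    parser = _add_fmoe_args(parser)  # register FastMoE's command-line flags
    ...
```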
### Patch checkpoint
In `megatron/training.py`, replace `load_checkpoint` and `save_checkpoint` with
the functions of the same name from `fmoe.megatron.checkpointing`.
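
In import form, the replacement looks like the following (the example patch later in this document imports from `fmoe.megatron.checkpoint`; follow whichever module path your FastMoE version provides):

```python
# megatron/training.py
# from megatron.checkpointing import load_checkpoint, save_checkpoint
from fmoe.megatron.checkpoint import load_checkpoint
from fmoe.megatron.checkpoint import save_checkpoint
```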
### Building the model in FastMoE style
In `megatron/training.py`, the `fmoe.megatron.fmoefy` function is used as an
entry point that, in one step, introduces FastMoE layers to replace the MLP
layers in the transformer language model.
```python
from fmoe.megatron import fmoefy
model = fmoefy(model, num_experts=4)
```
Note that the `fmoefy` function currently only takes a standard Megatron-LM
top-level raw model as input, i.e. the MLP layers should be accessible at
`model.language_model.transformer.layers[i].mlp`.
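
A quick sanity check for this requirement (a hypothetical helper, only to illustrate the expected attribute path):

```python
def can_fmoefy(model):
    # fmoefy expects Megatron-LM's layout, with the MLPs reachable at
    # model.language_model.transformer.layers[i].mlp
    try:
        layers = model.language_model.transformer.layers
    except AttributeError:
        return False
    return all(hasattr(layer, 'mlp') for layer in layers)
```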
### Using FastMoE's model parallelization

In `megatron/training.py`, the `LocalDDP` module is replaced by the one in
`fmoe.megatron` to enable the sophisticated data-parallel strategies that can
parallelize the experts across both the data-parallel group and the (tensor)
model-parallel group.
```python
# from megatron.model import DistributedDataParallel as LocalDDP
from fmoe.megatron import DistributedDataParallel as LocalDDP
```
### Fix gradient clipping
Megatron-LM uses gradient normalization, which is incompatible with FastMoE:
an incorrect gradient norm leads to inconsistent parameter updates across
ranks. Apply `clip-grad-v2.2.patch` to fix the issue.
Note that only the 2-norm is implemented in the patch. If another norm type is
used, remember to implement it accordingly.
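
Conceptually, the patch computes the global norm in two parts: the norm of the replicated (non-expert) gradients, on which all ranks already agree, plus the norm of the expert gradients, which must be summed across ranks because each rank holds different experts. A plain-PyTorch sketch of that combination follows (the real patch uses `amp_C.multi_tensor_l2norm` for speed and also reduces the dense part across model-parallel ranks, which is omitted here):

```python
import torch
import torch.distributed as dist

def moe_grad_norm(grads_for_norm, grads_in_moe, norm_type=2.0):
    # Replicated parameters: identical on all ranks, no reduction needed.
    total = sum(torch.norm(g, norm_type) ** norm_type for g in grads_for_norm)
    # Expert parameters: each rank holds different experts, so their
    # contributions must be accumulated across the whole world.
    moe = sum(torch.norm(g, norm_type) ** norm_type for g in grads_in_moe)
    moe = torch.as_tensor(moe, device='cuda')
    dist.all_reduce(moe)
    return (total + moe) ** (1.0 / norm_type)
```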
### Train as usual
Start training with FastMoE by using the scripts provided by Megatron-LM.
diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py
index e8d0d02..fd6660a 100644
--- a/megatron/optimizer/clip_grads.py
+++ b/megatron/optimizer/clip_grads.py
@@ -52,6 +52,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# - should not be a replica due to tensor model parallelism
grads = []
grads_for_norm = []
+ grads_in_moe = []
for param in parameters:
grad_not_none = param.grad is not None
is_not_shared = not hasattr(param, 'shared') or not param.shared
@@ -63,7 +64,10 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
assert param.grad.type() == 'torch.cuda.FloatTensor'
grads.append(grad)
if grad_not_none and is_not_shared and is_not_tp_duplicate:
- grads_for_norm.append(grad)
+ if hasattr(param, 'dp_comm') and param.dp_comm in ('none'):
+ grads_in_moe.append(grad)
+ else:
+ grads_for_norm.append(grad)
# Norm parameters.
max_norm = float(max_norm)
@@ -72,6 +76,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# Calculate norm.
if norm_type == inf:
+ # TODO: moe
total_norm = max(grad.abs().max() for grad in grads_for_norm)
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
# Take max across all model-parallel GPUs.
@@ -96,7 +101,18 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# we need the pow(norm-type).
total_norm = grad_norm ** norm_type
+ grad_norm, _ = multi_tensor_applier(
+ amp_C.multi_tensor_l2norm,
+ dummy_overflow_buf,
+ [grads_in_moe],
+ False # no per-parameter norm
+ )
+ grad_norm = grad_norm ** norm_type
+ torch.distributed.all_reduce(grad_norm)
+ total_norm += grad_norm
+
else:
+ # TODO: moe
for grad in grads_for_norm:
grad_norm = torch.norm(grad, norm_type)
total_norm += grad_norm ** norm_type
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 26a7cec..0acfb22 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -21,6 +21,8 @@ import os
import torch
from megatron import fused_kernels
+from fmoe.megatron import add_fmoe_args as _add_fmoe_args
+
def parse_args(extra_args_provider=None, defaults={},
ignore_unknown_args=False):
"""Parse all arguments."""
@@ -40,6 +42,7 @@ def parse_args(extra_args_provider=None, defaults={},
parser = _add_data_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_realm_args(parser)
+ parser = _add_fmoe_args(parser)
# Custom arguments.
if extra_args_provider is not None:
diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py
index 9d42260..2583db2 100644
--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -177,6 +177,8 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
param)
if hasattr(param, 'shared'):
main_param.shared = param.shared
+ if hasattr(param, 'dp_comm'):
+ main_param.dp_comm = param.dp_comm
# Replace the optimizer params with the new fp32 copy.
param_group['params'][i] = main_param
fp32_from_fp16_params_this_group.append(main_param)
diff --git a/megatron/training.py b/megatron/training.py
index 56d1c7c..f825bf3 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -35,20 +35,24 @@ from megatron import update_num_microbatches
from megatron import mpu
from megatron import print_rank_0
from megatron import print_rank_last
-from megatron.checkpointing import load_checkpoint
-from megatron.checkpointing import save_checkpoint
+# from megatron.checkpointing import load_checkpoint
+from fmoe.megatron.checkpoint import load_checkpoint
+# from megatron.checkpointing import save_checkpoint
+from fmoe.megatron.checkpoint import save_checkpoint
from megatron.model import FP16Module
from megatron.optimizer import get_megatron_optimizer
from megatron.initialize import initialize_megatron
from megatron.initialize import write_args_to_tensorboard
from megatron.learning_rates import AnnealingLR
-from megatron.model import DistributedDataParallel as LocalDDP
+# from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
from megatron.data.data_loaders import build_pretraining_data_loader
from megatron.utils import report_memory
+from fmoe.megatron import DistributedDataParallel as LocalDDP
+from fmoe.megatron import add_balance_log
def print_datetime(string):
"""Note that this call will sync across all ranks."""
@@ -102,6 +106,13 @@ def pretrain(train_valid_test_dataset_provider, model_provider,
args = get_args()
timers = get_timers()
+ # Initialize FastMoE
+ if args.fmoefy:
+ from fmoe.megatron import patch_forward_step, patch_model_provider
+
+ forward_step_func = patch_forward_step(forward_step_func)
+ model_provider = patch_model_provider(model_provider)
+
# Model, optimizer, and learning rate.
timers('model and optimizer').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
@@ -643,7 +654,7 @@ def train_step(forward_step_func, data_iterator,
def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
- loss_scale, report_memory_flag, skipped_iter):
+ loss_scale, report_memory_flag, skipped_iter, model):
"""Log training information such as losses, timing, ...."""
args = get_args()
timers = get_timers()
@@ -725,6 +736,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
args.consumed_train_samples)
timers.write(timers_to_log, writer, iteration,
normalizer=total_iterations)
+ if args.fmoefy and args.balance_strategy and args.balance_strategy != 'naive':
+ add_balance_log(model, writer, iteration)
if iteration % args.log_interval == 0:
elapsed_time = timers('interval time').elapsed()
@@ -816,7 +829,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
report_memory_flag = training_log(loss_dict, total_loss_dict,
optimizer.param_groups[0]['lr'],
iteration, loss_scale,
- report_memory_flag, skipped_iter)
+ report_memory_flag, skipped_iter, model)
# Autoresume
if args.adlr_autoresume and \
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This directory contains an example based on Zihang Dai et al.'s open-source
Transformer-XL [implementation](https://github.com/kimiyoung/transformer-xl) to
demonstrate the usage of FastMoE's layers.
The code is released under the Apache-2.0 license. Here, only the PyTorch part
of the code is used, with modifications in the `mem_transformer.py` file to
enable MoE training.
## Introduction
This directory contains our pytorch implementation of Transformer-XL. Note that our state-of-the-art results reported in the paper were obtained by training the model on a large-scale TPU cluster, and our pytorch codebase currently does not support distributed training. Here we provide two sets of hyperparameters and scripts:
- `*large.sh` are for the SoTA setting with large models which might not be directly runnable on a local GPU machine.
- `*base.sh` are for the base models which can be run on a few GPUs.
The pytorch implementation produces similar results to the TF codebase under the same settings in our preliminary experiments.
## Prerequisite
- Pytorch 0.4: `conda install pytorch torchvision -c pytorch`
## Data Preparation
`bash getdata.sh`
## Training and Evaluation
#### Replicate the "bpc = 1.06" result on `enwik8` with a 12-layer Transformer-XL
- Make sure the machine has **4 GPUs**, each with **at least 11GB of memory**
- Training
`bash run_enwik8_base.sh train --work_dir PATH_TO_WORK_DIR`
- Evaluation
`bash run_enwik8_base.sh eval --work_dir PATH_TO_WORK_DIR`
#### Replicate the "PPL = 24.03" result on `wikitext-103` with Transformer-XL
- Make sure the machine has **4 GPUs**, each with **at least 11GB of memory**
- Training
`bash run_wt103_base.sh train --work_dir PATH_TO_WORK_DIR`
- Evaluation
`bash run_wt103_base.sh eval --work_dir PATH_TO_WORK_DIR`
#### Other options:
- `--batch_chunk`: this option allows one to trade speed for memory. For `batch_chunk > 1`, the program splits each training batch into `batch_chunk` sub-batches and performs forward and backward on each sub-batch sequentially, with the gradients accumulated and divided by `batch_chunk`. Hence, the memory usage is proportionally lower, while the computation time is correspondingly higher (see the sketch after this list).
- `--div_val`: when using adaptive softmax and embedding, the embedding dimension is divided by `div_val` from bin $i$ to bin $i+1$. This saves both GPU memory and the parameter budget.
- `--fp16` and `--dynamic-loss-scale`: Run in pseudo-fp16 mode (fp16 storage, fp32 math) with dynamic loss scaling.
- Note: to explore the `--fp16` option, please make sure the `apex` package is installed (https://github.com/NVIDIA/apex/).
- To see performance without the recurrence mechanism, simply use `mem_len=0` in all your scripts.
- To see performance of a standard Transformer without relative positional encodings or recurrence mechanisms, use `attn_type=2` and `mem_len=0`.
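
As referenced in the `--batch_chunk` item above, a sketch of the accumulation scheme (a hypothetical training-step fragment, not the repository's actual code):

```python
def train_step(model, data, target, optimizer, criterion, batch_chunk):
    # Split the [bptt x bsz] batch along the batch dimension and accumulate
    # gradients, trading speed for a smaller activation-memory footprint.
    optimizer.zero_grad()
    for sub_data, sub_target in zip(data.chunk(batch_chunk, 1),
                                    target.chunk(batch_chunk, 1)):
        loss = criterion(model(sub_data), sub_target)
        (loss / batch_chunk).backward()  # divide so the sum matches one full batch
    optimizer.step()
```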
#### Other datasets:
- `Text8` character-level language modeling: check out `run_text8_base.sh`
- `lm1b` word-level language modeling: check out `run_lm1b_base.sh`
import os, sys
import glob
from collections import Counter, OrderedDict
import numpy as np
import torch
from utils.vocabulary import Vocab
class LMOrderedIterator(object):
    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device

        # Work out how cleanly we can divide the dataset into bsz parts.
        self.n_step = data.size(0) // bsz

        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, self.n_step * bsz)

        # Evenly divide the data across the bsz batches.
        self.data = data.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        if bptt is None: bptt = self.bptt
        seq_len = min(bptt, self.data.size(0) - 1 - i)

        end_idx = i + seq_len
        beg_idx = max(0, i - self.ext_len)

        data = self.data[beg_idx:end_idx]
        target = self.data[i+1:i+1+seq_len]

        return data, target, seq_len

    def get_fixlen_iter(self, start=0):
        for i in range(start, self.data.size(0) - 1, self.bptt):
            yield self.get_batch(i)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        max_len = self.bptt + max_deviation * std
        i = start
        while True:
            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
            data, target, seq_len = self.get_batch(i, bptt)
            i += seq_len
            yield data, target, seq_len
            if i >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()


class LMShuffledIterator(object):
    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        # index iterator
        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
            else np.array(range(len(self.data)))

        # sentence iterator
        for idx in epoch_indices:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            valid_batch = True

            for i in range(self.bsz):
                n_filled = 0
                try:
                    while n_filled < self.bptt:
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sent_stream)
                        # number of new tokens to fill in
                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
                            streams[i][:n_new]
                        target[n_filled:n_filled+n_new, i] = \
                            streams[i][1:n_new+1]
                        streams[i] = streams[i][n_new:]
                        n_filled += n_new
                except StopIteration:
                    valid_batch = False
                    break

            if not valid_batch:
                return

            data = data.to(self.device)
            target = target.to(self.device)

            yield data, target, self.bptt

            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        sent_stream = self.get_sent_stream()

        for batch in self.stream_iterator(sent_stream):
            yield batch


class LMMultiFileIterator(LMShuffledIterator):
    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
                 shuffle=False):

        self.paths = paths
        self.vocab = vocab

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(sents)
        sent_stream = iter(sents)

        return sent_stream

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.paths)

        for path in self.paths:
            # sent_stream is an iterator
            sent_stream = self.get_sent_stream(path)
            for batch in self.stream_iterator(sent_stream):
                yield batch


class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
            self.vocab.count_file(os.path.join(path, 'train.txt'))
            self.vocab.count_file(os.path.join(path, 'valid.txt'))
            self.vocab.count_file(os.path.join(path, 'test.txt'))
        elif self.dataset == 'wt103':
            self.vocab.count_file(os.path.join(path, 'train.txt'))
        elif self.dataset == 'lm1b':
            train_path_pattern = os.path.join(
                path, '1-billion-word-language-modeling-benchmark-r13output',
                'training-monolingual.tokenized.shuffled', 'news.en-*')
            train_paths = glob.glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called

        self.vocab.build_vocab()

        if self.dataset in ['ptb', 'wt2', 'wt103']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True)
        elif self.dataset in ['enwik8', 'text8']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
        elif self.dataset == 'lm1b':
            self.train = train_paths
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)

    def get_iterator(self, split, *args, **kwargs):
        if split == 'train':
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
            elif self.dataset == 'lm1b':
                kwargs['shuffle'] = True
                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
        elif split in ['valid', 'test']:
            data = self.valid if split == 'valid' else self.test
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(data, *args, **kwargs)
            elif self.dataset == 'lm1b':
                data_iter = LMShuffledIterator(data, *args, **kwargs)

        return data_iter


def get_lm_corpus(datadir, dataset):
    fn = os.path.join(datadir, 'cache.pt')
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset {}...'.format(dataset))
        kwargs = {}
        if dataset in ['wt103', 'wt2']:
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = False
        elif dataset == 'ptb':
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = True
        elif dataset == 'lm1b':
            kwargs['special'] = []
            kwargs['lower_case'] = False
            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
        elif dataset in ['enwik8', 'text8']:
            pass

        corpus = Corpus(datadir, dataset, **kwargs)
        torch.save(corpus, fn)

    return corpus


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='unit test')
    parser.add_argument('--datadir', type=str, default='../data/text8',
                        help='location of the data corpus')
    parser.add_argument('--dataset', type=str, default='text8',
                        choices=['ptb', 'wt2', 'wt103', 'lm1b', 'enwik8', 'text8'],
                        help='dataset name')
    args = parser.parse_args()

    corpus = get_lm_corpus(args.datadir, args.dataset)
    print('Vocab size : {}'.format(len(corpus.vocab.idx2sym)))
# coding: utf-8
import argparse
import time
import math
import os, sys
import torch
from data_utils import get_lm_corpus
from mem_transformer import MemTransformerLM
from utils.exp_utils import get_logger
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument('--data', type=str, default='../data/wikitext-103',
                    help='location of the data corpus')
parser.add_argument('--dataset', type=str, default='wt103',
                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
                    help='dataset name')
parser.add_argument('--split', type=str, default='all',
                    choices=['all', 'valid', 'test'],
                    help='which split to evaluate')
parser.add_argument('--batch_size', type=int, default=10,
                    help='batch size')
parser.add_argument('--tgt_len', type=int, default=5,
                    help='number of tokens to predict')
parser.add_argument('--ext_len', type=int, default=0,
                    help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=0,
                    help='length of the retained previous heads')
parser.add_argument('--clamp_len', type=int, default=-1,
                    help='max positional embedding index')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--work_dir', type=str, required=True,
                    help='path to the work_dir')
parser.add_argument('--no_log', action='store_true',
                    help='do not log the eval result')
parser.add_argument('--same_length', action='store_true',
                    help='set same length attention with masking')
args = parser.parse_args()
assert args.ext_len >= 0, 'extended context length must be non-negative'

device = torch.device("cuda" if args.cuda else "cpu")

# Get logger
logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
                     log_=not args.no_log)

# Load dataset
corpus = get_lm_corpus(args.data, args.dataset)
ntokens = len(corpus.vocab)

va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)

# Load the best saved model.
with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
    model = torch.load(f)
model.backward_compatible()
model = model.to(device)

logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
    args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
if args.clamp_len > 0:
    model.clamp_len = args.clamp_len
if args.same_length:
    model.same_length = True

###############################################################################
# Evaluation code
###############################################################################
def evaluate(eval_iter):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_len, total_loss = 0, 0.
    start_time = time.time()
    with torch.no_grad():
        mems = tuple()
        for idx, (data, target, seq_len) in enumerate(eval_iter):
            ret = model(data, target, *mems)
            loss, mems = ret[0], ret[1:]
            loss = loss.mean()
            total_loss += seq_len * loss.item()
            total_len += seq_len
        total_time = time.time() - start_time
    logging('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / (idx+1)))
    return total_loss / total_len

# Run on test data.
if args.split == 'all':
    test_loss = evaluate(te_iter)
    valid_loss = evaluate(va_iter)
elif args.split == 'valid':
    valid_loss = evaluate(va_iter)
    test_loss = None
elif args.split == 'test':
    test_loss = evaluate(te_iter)
    valid_loss = None

def format_log(loss, split):
    if args.dataset in ['enwik8', 'text8']:
        log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format(
            split, loss, loss / math.log(2))
    else:
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
    return log_str

log_str = ''
if valid_loss is not None:
    log_str += format_log(valid_loss, 'valid')
if test_loss is not None:
    log_str += format_log(test_loss, 'test')

logging('=' * 100)
logging(log_str)
logging('=' * 100)