"lib/runtime/examples/vscode:/vscode.git/clone" did not exist on "e5db9e8634e242ec7965c522ee0bde3034c4a8a0"
Commit ff295599 authored by sangwzh's avatar sangwzh
Browse files

update macros for torch2.1 and import fastpt

parent 43ff1d4f
......@@ -23,6 +23,13 @@ pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/dt
pip install torch* (下载的torch的whl包)
```
#### 源码编译安装
torch2.1下,首先安装fastpt工具包,下载地址:http://10.6.10.68:8000/debug/fastpt/
执行
```shell
pip install fastpt*.whl
```
```shell
pip install setuptools wheel
```
......@@ -40,4 +47,4 @@ pip install dist/colossalai*
## 参考
- [README_ORIGIN](README_ORIGIN.md)
- [README_zh-Hans](README_zh-Hans.md)
\ No newline at end of file
- [README_zh-Hans](README_zh-Hans.md)
......@@ -4,7 +4,7 @@
*/
#include "cublas_wrappers.h"
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP) && !defined(HIPBLAS_H)
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const float *A,
......
......@@ -14,7 +14,7 @@
#endif
#include <stdio.h>
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP) && !defined(HIPBLAS_H)
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *beta, const float *A,
......
......@@ -35,7 +35,7 @@ class FeedForward {
float alpha = T(1.);
float beta = T(0.);
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP)&& !defined(HIPBLAS_H)
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
out, rocblas_gemm_algo(rocblas_gemm_algo_standard));
......@@ -51,7 +51,7 @@ class FeedForward {
T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
bool compute_bias = true) {
float alpha = (T)1.0, beta = (T)0.0;
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP)&& !defined(HIPBLAS_H)
cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
weights_grad, rocblas_gemm_algo(rocblas_gemm_algo_standard));
......
......@@ -49,7 +49,7 @@ class StridedBatchGemm {
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP)&& !defined(HIPBLAS_H)
cublas_strided_batched_gemm(
handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
_buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
......@@ -77,7 +77,7 @@ class StridedBatchGemm {
(_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
// Calculate d_A.
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP)&& !defined(HIPBLAS_H)
cublas_strided_batched_gemm(
handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
(_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
......@@ -102,7 +102,7 @@ class StridedBatchGemm {
stride_c = _config.n * _config.k;
// Calculate d_B.
#ifdef COLOSSAL_HIP
#if defined(COLOSSAL_HIP)&& !defined(HIPBLAS_H)
cublas_strided_batched_gemm(
handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
_buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
......
{
"custom_map" : {
"#if TORCH_VERSION_MINOR >= 13":"#if TORCH_VERSION_MINOR >= 13 || TORCH_VERSION_MAJOR >= 2",
"cublasGemmAlgo_t":"hipblasGemmAlgo_t",
"CUDA_R_32F":"HIPBLAS_R_32F",
"CUDA_R_16F":"HIPBLAS_R_16F"
}
}
......@@ -189,6 +189,7 @@ if build_cuda_ext or build_hip_ext:
try:
import torch
from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension
from fastpt import CUDAExtension
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
......@@ -220,6 +221,7 @@ if build_hip_ext:
'nvcc': ['-O3'] + version_dependent_macros + hip_macros + extra_cuda_flags})
from torch.utils.hipify import hipify_python
from fastpt import hipify_python
hipify_python.hipify(
project_directory=this_dir,
output_directory=this_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment