Commit cfb0ec1a authored by lishen01's avatar lishen01
Browse files

Merge branch 'develop-pytorch1.13.1' into 'develop'

Develop pytorch1.13.1

See merge request aicomponent/warpctc!1
parents 5e65c1c3 c9a60a98
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="On explicit save action" serverName="10.6.10.62" preserveTimestamps="false" deleteMissingItems="true" createEmptyFolders="true" filePermissions="420" folderPermissions="493" confirmBeforeUploading="false" confirmBeforeDeletion="false" autoUploadExternalChanges="true">
<option name="confirmBeforeDeletion" value="false" />
<option name="confirmBeforeUploading" value="false" />
<serverData>
<paths name="10.6.10.61">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.10.62">
<serverdata>
<mappings>
<mapping deploy="/public/home/lishen/warpctc/warpctc_dcu" local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.10.69">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="10.6.6.220">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_89_docker (c96f91f6-1c12-4fb0-b3d8-cc68ccf3f77c)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_90">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_93">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="lishen_95">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ON_EXPLICIT_SAVE" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/warpctc_dcu.iml" filepath="$PROJECT_DIR$/.idea/warpctc_dcu.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="CPP_MODULE" version="4">
<component name="FacetManager">
<facet type="Python" name="Python facet">
<configuration sdkName="Python 3.7" />
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Python 3.7 interpreter library" level="application" />
</component>
</module>
\ No newline at end of file
# PyTorch bindings for Warp-ctc
# DLIB
[![Build Status](https://travis-ci.org/SeanNaren/warp-ctc.svg?branch=pytorch_bindings)](https://travis-ci.org/SeanNaren/warp-ctc)
## 环境配置
This is an extension onto the original repo found [here](https://github.com/baidu-research/warp-ctc).
使用DCU编译之前,需要准备编译环境。参考
[environment prepare](environment_prepare.md)
## Installation
## 使用源码安装
Install [PyTorch](https://github.com/pytorch/pytorch#installation) v0.4.
### 编译环境准备(以dtk-23.04版本为例)
`WARP_CTC_PATH` should be set to the location of a built WarpCTC
(i.e. `libwarpctc.so`). This defaults to `../build`, so from within a
new warp-ctc clone you could build WarpCTC like this:
- 拉取代码
```bash
git clone https://github.com/SeanNaren/warp-ctc.git
cd warp-ctc
mkdir build; cd build
cmake ..
make
```
```
git clone -b develop http://developer.hpccube.com/codes/aicomponent/warpctc.git
```
- [开发者社区](https://developer.hpccube.com/tool/#sdk) DCU Toolkit 中下载 DTK-23.04 解压至 /opt/ 路径下,并建立软链接
```
cd /opt && ln -s dtk-23.04 dtk
```
- 导入环境变量以及安装必要依赖库
```shell
source /opt/dtk/env.sh
```
Now install the bindings:
```bash
### 编译安装
#### 编译 Python API
- 使用python安装
```shell
cd pytorch_binding
python setup.py install
```
If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch):
```bash
cd ../pytorch_binding
python setup.py install
cd ../build
cp libwarpctc.dylib /Users/$WHOAMI/anaconda3/lib
- 使用python编译whl包
```shell
cd pytorch_binding
python setup.py bdist_wheel
```
This will resolve the library not loaded error. This can be easily modified to work with other python installs if needed.
Example to use the bindings below.
```python
import torch
from warpctc_pytorch import CTCLoss
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True) # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()
### 测试
- 验证warpctc的loss正确性(CPU和GPU的一致性)
```shell
cd pytorch_binding/tests
python3 test_gpu.py
```
## Documentation
- 验证warpctc的loss的GPU加速效果
```shell
cd pytorch_binding/tests
python3 test_gpu_speed.py
```
CTCLoss(size_average=False, length_average=False)
# size_average (bool): normalize the loss by the batch size (default: False)
# length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)
forward(acts, labels, act_lens, label_lens)
# acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
# labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
# act_lens: Tensor of size (batch) containing size of each output sequence from the network
# label_lens: Tensor of (batch) containing label length of each example
```
\ No newline at end of file
# DLIB
## 环境配置
使用DCU编译之前,需要准备编译环境。参考
[environment prepare](environment_prepare.md)
## 使用源码安装
### 编译环境准备(以dtk-23.04版本为例)
- 拉取代码
```
git clone -b develop http://developer.hpccube.com/codes/aicomponent/warpctc.git
```
- [开发者社区](https://developer.hpccube.com/tool/#sdk) DCU Toolkit 中下载 DTK-23.04 解压至 /opt/ 路径下,并建立软链接
```
cd /opt && ln -s dtk-23.04 dtk
```
- 导入环境变量以及安装必要依赖库
```shell
source /opt/dtk/env.sh
```
### 编译安装
#### 编译 Python API
- 使用python安装
```shell
cd pytorch_binding
python setup.py install
```
- 使用python编译whl包
```shell
cd pytorch_binding
python setup.py bdist_wheel
```
### 测试
- 验证warpctc的loss正确性(CPU和GPU的一致性)
```shell
cd pytorch_binding/tests
python3 test_gpu.py
```
- 验证warpctc的loss的GPU加速效果
```shell
cd pytorch_binding/tests
python3 test_gpu_speed.py
```
# PyTorch bindings for Warp-ctc
[![Build Status](https://travis-ci.org/SeanNaren/warp-ctc.svg?branch=pytorch_bindings)](https://travis-ci.org/SeanNaren/warp-ctc)
This is an extension onto the original repo found [here](https://github.com/baidu-research/warp-ctc).
## Installation
Install [PyTorch](https://github.com/pytorch/pytorch#installation) v0.4.
`WARP_CTC_PATH` should be set to the location of a built WarpCTC
(i.e. `libwarpctc.so`). This defaults to `../build`, so from within a
new warp-ctc clone you could build WarpCTC like this:
```bash
git clone https://github.com/SeanNaren/warp-ctc.git
cd warp-ctc
mkdir build; cd build
cmake ..
make
```
Now install the bindings:
```bash
cd pytorch_binding
python setup.py install
```
If you try the above and get a dlopen error on OSX with anaconda3 (as recommended by pytorch):
```bash
cd ../pytorch_binding
python setup.py install
cd ../build
cp libwarpctc.dylib /Users/$WHOAMI/anaconda3/lib
```
This will resolve the library not loaded error. This can be easily modified to work with other python installs if needed.
Example to use the bindings below.
```python
import torch
from warpctc_pytorch import CTCLoss
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True) # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()
```
## Documentation
```
CTCLoss(size_average=False, length_average=False)
# size_average (bool): normalize the loss by the batch size (default: False)
# length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)
forward(acts, labels, act_lens, label_lens)
# acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
# labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
# act_lens: Tensor of size (batch) containing size of each output sequence from the network
# label_lens: Tensor of (batch) containing label length of each example
```
\ No newline at end of file
......@@ -130,15 +130,14 @@ void compute_alpha_kernel(const ProbT *probs, const int *label_sizes,
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
// Set the first row of alpha neg_inf - it is much more efficient to do it
// here than outside
#pragma unroll
// Set the first row of alpha neg_inf - it is much more efficient to do it here than outside
//#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
//#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
......@@ -272,8 +271,8 @@ void compute_betas_and_grad_kernel(const ProbT *probs, const int *label_sizes,
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S - 1;
// Setup shared memory buffers
#pragma unroll
// // Setup shared memory buffers
//#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
......@@ -290,7 +289,7 @@ void compute_betas_and_grad_kernel(const ProbT *probs, const int *label_sizes,
int key[VT];
int gather_val[VT];
#pragma unroll
//#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
......
import os
import subprocess
from pathlib import Path
import torch
# Absolute path of the directory containing this file; used as the cwd for
# the git subprocess calls and as the anchor for version.txt / version.py.
ROOT_DIR = Path(__file__).parent.resolve()
def _run_cmd(cmd, shell=False):
    """Run a command in the repository root and return its stripped stdout.

    Args:
        cmd: argument list, or a command string when ``shell=True``.
        shell: forwarded to ``subprocess.check_output``.

    Returns:
        The command's stdout decoded as UTF-8 and stripped, or ``None`` if
        the command fails for any reason (missing binary, non-zero exit).
    """
    try:
        out = subprocess.check_output(
            cmd, cwd=ROOT_DIR, stderr=subprocess.DEVNULL, shell=shell)
        # Decode as UTF-8, not ASCII: git output (branch names, tags, paths)
        # may legally contain non-ASCII bytes, which would raise
        # UnicodeDecodeError under an ascii decode.
        return out.decode("utf-8").strip()
    except Exception:
        # Best-effort helper: every caller treats None as "unavailable".
        return None
def _get_version():
    """Return the package version string.

    Reads ``version.txt`` next to this file when present, otherwise falls
    back to ``'0.1'``. A non-empty ``BUILD_VERSION`` environment variable
    overrides either value.
    """
    version_file = ROOT_DIR / "version.txt"
    if os.path.exists(version_file):
        with open(version_file, "r") as f:
            version = f.read().strip()
    else:
        version = '0.1'
    override = os.getenv("BUILD_VERSION")
    if override:
        version = override
    return version
def _make_version_file(version, sha, abi, dtk, torch_version, branch):
    """Write ``warpctc_pytorch/version.py`` with build metadata.

    Args:
        version: base package version (e.g. ``'0.1'``).
        sha: short git commit hash, or ``None`` when unavailable.
        abi: value of ``_GLIBCXX_USE_CXX11_ABI`` reported by gcc.
        dtk: DTK/ROCm toolkit version (major+minor digits).
        torch_version: full PyTorch version string; truncated to major.minor.
        branch: current git branch name.

    Returns:
        The combined ``dcu_version`` string that was written to the file.
    """
    if sha is None:
        sha = "Unknown"
    torch_mm = '.'.join(torch_version.split('.')[:2])
    dcu_version = f"{version}+{sha}.abi{abi}.dtk{dtk}.torch{torch_mm}"
    entries = [
        f"version = '{version}'",
        f"git_hash = '{sha}'",
        f"git_branch = '{branch}'",
        f"abi = 'abi{abi}'",
        f"dtk = '{dtk}'",
        f"torch_version = '{torch_mm}'",
        f"dcu_version = '{dcu_version}'",
    ]
    version_path = ROOT_DIR / "warpctc_pytorch" / "version.py"
    with open(version_path, "w") as f:
        f.write("\n".join(entries) + "\n")
    return dcu_version
def _get_pytorch_version():
if "PYTORCH_VERSION" in os.environ:
return f"{os.environ['PYTORCH_VERSION']}"
return torch.__version__
def get_version(ROCM_HOME):
    """Assemble the full DCU version string and write the version file.

    Collects git metadata (short SHA, branch, tag), the PyTorch version,
    the C++11 ABI flag of the local gcc, and the DTK/ROCm version found
    under ``ROCM_HOME``, then delegates to ``_make_version_file``.

    Args:
        ROCM_HOME: path to the ROCm/DTK installation root.

    Returns:
        The combined ``dcu_version`` string produced by ``_make_version_file``.
    """
    sha = _run_cmd(["git", "rev-parse", "HEAD"])
    # _run_cmd returns None outside a git checkout — guard before slicing,
    # or sha[:7] raises TypeError. _make_version_file handles sha=None.
    sha = sha[:7] if sha else None
    branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    tag = _run_cmd(["git", "describe", "--tags", "--exact-match", "@"])
    print("-- Git branch:", branch)
    print("-- Git SHA:", sha)
    print("-- Git tag:", tag)
    torch_version = _get_pytorch_version()
    print("-- PyTorch:", torch_version)
    version = _get_version()
    print("-- Building version", version)
    # With shell=True the command must be a single string; the original
    # passed a one-element list, which only works by POSIX accident.
    abi = _run_cmd(
        "echo '#include <string>' | gcc -x c++ -E -dM - "
        "| fgrep _GLIBCXX_USE_CXX11_ABI | awk '{print $3}'",
        shell=True)
    print("-- _GLIBCXX_USE_CXX11_ABI:", abi)
    dtk = _run_cmd(["cat", os.path.join(ROCM_HOME, '.info/rocm_version')])
    # dtk is None when the rocm_version file is absent (non-DCU host) —
    # guard before split, or dtk.split raises AttributeError.
    dtk = ''.join(dtk.split('.')[:2]) if dtk else "Unknown"
    print("-- DTK:", dtk)
    return _make_version_file(version, sha, abi, dtk, torch_version, branch)
......@@ -9,6 +9,7 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtensio
from setuptools import find_packages, setup
from setuptools.command.build_ext import build_ext
from pkg_resources import packaging # type: ignore[attr-defined]
from get_version import get_version
def _find_rocm_home() -> Optional[str]:
......@@ -291,10 +292,6 @@ class BuildReleaseExtension(BuildExtension):
build_ext.build_extensions(self)
def get_version():
return "0.1"
def get_extensions():
extensions = []
include_dirs = []
......@@ -331,7 +328,7 @@ def get_extensions():
def main():
setup(
name='warpctc_pytorch',
version="0.1",
version=get_version(_find_rocm_home()),
description='Torch fuseop Computer Vision Foundation',
keywords='computer vision',
packages=find_packages(),
......
......@@ -7,9 +7,7 @@
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ATen/cuda/CUDAEvent.h"
#include <THC/THCGeneral.h>
extern THCState* state;
#include <ATen/cuda/ThrustAllocator.h>
#endif
#include "ctc.h"
......@@ -91,7 +89,7 @@ int gpu_ctc(torch::Tensor probs,
probs_size, minibatch_size,
options, &gpu_size_bytes);
void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
void* gpu_workspace = c10::cuda::CUDACachingAllocator::raw_alloc(gpu_size_bytes);
compute_ctc_loss(probs_ptr, grads_ptr,
labels_ptr, label_sizes_ptr,
......@@ -99,7 +97,8 @@ int gpu_ctc(torch::Tensor probs,
minibatch_size, costs_ptr,
gpu_workspace, options);
THCudaFree(state, (void *) gpu_workspace);
c10::cuda::CUDACachingAllocator::raw_delete((void *) gpu_workspace);
return 1;
}
#endif
......
#pragma once
/*
int gpu_ctc(THCudaTensor *probs,
THCudaTensor *grads,
THIntTensor *labels_ptr,
THIntTensor *label_sizes_ptr,
THIntTensor *sizes,
int minibatch_size,
THFloatTensor *costs,
int blank_label);
*/
int gpu_ctc(torch::Tensor probs,
torch::Tensor grads,
torch::Tensor labels,
......
0.1
\ No newline at end of file
......@@ -87,3 +87,9 @@ class CTCLoss(Module):
_assert_no_grad(label_lens)
return self.ctc(acts, labels, act_lens, label_lens, self.size_average,
self.length_average, self.blank)
try:
from .version import version, git_hash, git_branch, dtk, abi, torch_version, dcu_version # noqa: F401
__version__, __dcu_version__ = version, dcu_version
except ImportError:
pass
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment