Commit 2c6c0f28 authored by flyingdown's avatar flyingdown
Browse files

add README_HIP

fix test for torch 1.10.0
parent 2d8b3600
Pipeline #257 failed with stages
in 0 seconds
...@@ -148,3 +148,4 @@ cython_debug/ ...@@ -148,3 +148,4 @@ cython_debug/
*.hip *.hip
*_hip.* *_hip.*
*hip* *hip*
!README_HIP.md
# APEX
## 安装
### System Requirements
- Linux.
- Python 3.7, 3.8, 3.9
- (**推荐**) Upgrade pip
```
python3 -m pip install --upgrade pip #--user
```
### 使用pip安装(以dtk-23.04版本为例)
可以在[光合开发者社区](https://developer.hpccube.com/tool/#sdk) AI 生态包中获取最新的 apex Release 版本(需对应 DCU Toolkit 版本与 python 版本)
```bash
python3 -m pip install apex-0.1+git2d8b360.abi0.dtk2304-cp37-cp37m-linux_x86_64.whl
```
### 使用源码安装
#### 编译环境准备(以dtk-23.04版本为例)
- 拉取 apex 代码
```
git clone -b dtk-23.04 http://developer.hpccube.com/codes/aicomponent/apex.git
```
- 在[开发者社区](https://developer.hpccube.com/tool/#sdk) DCU Toolkit 中下载 DTK-23.04,解压至 /opt/ 路径下,并建立软链接
```
cd /opt && ln -s dtk-23.04 dtk
```
- 在[光合开发者社区](https://developer.hpccube.com/tool/#sdk) AI 生态包中获取对应的 pytorch Release 版本(需对应 DCU Toolkit 版本与 python 版本)
```bash
python3 -m pip install torch-1.13.1a0+git4c8a1fe.abi0.dtk2304-cp37-cp37m-linux_x86_64.whl
```
- 导入环境变量以及安装必要依赖库
```bash
source /opt/dtk/env.sh
export PYTORCH_ROCM_ARCH="gfx906;gfx926"
MAX_JOBS=16
sha=`git rev-parse HEAD`
sed -i "/version=/{s/\(.*=\)['\"]\(.*\)['\"]/\1'\2\+git${sha:0:7}\.abi0.dtk23.04'/}" setup.py
pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
pip3 install wheel -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
```
#### 编译安装
- 执行编译命令
```shell
cd apex
CXX=hipcc CC=hipcc python3 setup.py --cpp_ext --cuda_ext bdist_wheel
pip install dist/apex*
```
...@@ -228,6 +228,11 @@ class TestCheckpointing(unittest.TestCase): ...@@ -228,6 +228,11 @@ class TestCheckpointing(unittest.TestCase):
continue continue
model = MyModel().to('cuda') model = MyModel().to('cuda')
torch_ver = torch.__version__.split('a0')[0]
optimizer = None
if torch_ver == '1.10.0':
optimizer = optim.Adam(model.parameters(), lr=1e-3)
else:
optimizer = optim.Adam(model.parameters(), lr=1e-3, capturable=True) optimizer = optim.Adam(model.parameters(), lr=1e-3, capturable=True)
model, optimizer = amp.initialize( model, optimizer = amp.initialize(
model, optimizer, opt_level=opt_level, verbosity=0) model, optimizer, opt_level=opt_level, verbosity=0)
......
...@@ -92,6 +92,11 @@ class TestFusedAdam(TestFusedOptimizer): ...@@ -92,6 +92,11 @@ class TestFusedAdam(TestFusedOptimizer):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
torch_ver = torch.__version__.split('a0')[0]
if torch_ver == '1.10.0':
self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay': 0, 'amsgrad': False}
else:
self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay': 0, 'amsgrad': False, "capturable": True} 'weight_decay': 0, 'amsgrad': False, "capturable": True}
self.tst_options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, self.tst_options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
...@@ -180,6 +185,12 @@ class TestFusedAdam(TestFusedOptimizer): ...@@ -180,6 +185,12 @@ class TestFusedAdam(TestFusedOptimizer):
def test_adam_option(self): def test_adam_option(self):
nelem = 1 nelem = 1
torch_ver = torch.__version__.split('a0')[0]
adam_option = None
if torch_ver == '1.10.0':
adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
'weight_decay':0, 'amsgrad':False}
else:
adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06, adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
'weight_decay':0, 'amsgrad':False, 'capturable':True} 'weight_decay':0, 'amsgrad':False, 'capturable':True}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment