Commit c0705977 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

init

parent d3982d85
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# -- Project information
import os
import sys
# import pytorch_sphinx_theme
from m2r import MdInclude
from recommonmark.transform import AutoStructify
from sphinx.builders.html import StandaloneHTMLBuilder
# Put the repository root (two levels above this directory) on sys.path so
# that autodoc can import the modules it documents.
sys.path.insert(0, os.path.abspath('../..'))

# Load ``__version__`` by executing version.py in this module's namespace,
# which avoids importing the package itself. The file is opened with an
# explicit encoding so the build does not depend on the locale default.
version_file = '../../version.py'
with open(version_file, 'r', encoding='utf-8') as f:
    exec(compile(f.read(), version_file, 'exec'), globals())
# Raises KeyError early if version.py did not define ``__version__``.
__version__ = globals()['__version__']

# -- Project information -----------------------------------------------------
project = 'DCU'
copyright = '2024-now, DCU User'
author = 'DCU User'

# The short X.Y version
version = __version__
# The full version, including alpha/beta/rc tags
release = __version__
# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
# needs_sphinx = '1.0'

# Sphinx extension module names, as strings. They can be extensions shipped
# with Sphinx (named 'sphinx.ext.*') or custom ones.
extensions = [
    'sphinx.ext.duration',
    'sphinx.ext.doctest',
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.autosectionlabel',
    'sphinx_tabs.tabs',
    'sphinx_markdown_tables',
    'myst_parser',
    'sphinx_copybutton',
    'sphinxcontrib.mermaid',
]  # yapf: disable

# Intersphinx lets this documentation cross-reference objects (functions,
# classes, terms) documented in other projects. Each setting is defined once;
# the earlier duplicate assignments were removed.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3/', None),
    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
}
intersphinx_disabled_domains = ['std']

# Modules autodoc should mock instead of importing (none at the moment).
autodoc_mock_imports = []

# Prefix automatically generated section labels with the document name.
autosectionlabel_prefix_document = True

# Paths that contain templates, relative to this directory.
templates_path = ['_templates']

# Source file suffixes and the parser type used for each. Markdown files are
# handled by the 'myst_parser' extension listed above, so the obsolete
# ``source_parsers`` mapping to recommonmark is no longer set: a second
# Markdown parser registration conflicts with MyST.
source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}
# The master toctree document is left at the Sphinx default.
# master_doc = 'index'

# Language of the content that Sphinx generates automatically. It is also
# used for content translation via gettext catalogs, in which case it is
# usually passed on the command line instead.
language = 'zh_CN'

# Patterns, relative to the source directory, of files and directories that
# are skipped when looking for source files. The same patterns also apply to
# html_static_path and html_extra_path.
exclude_patterns = [
    '_build',
    'Thumbs.db',
    '.DS_Store',
]

# Pygments style used for syntax highlighting.
pygments_style = 'sphinx'

# -- Options for HTML output -------------------------------------------------

html_theme = 'sphinx_rtd_theme'
# Alternative theme, currently disabled:
# html_theme = 'pytorch_sphinx_theme'
# html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]

# Theme-specific options for the disabled theme above, kept for reference:
# html_theme_options = {
#     'logo_url': 'https://dcu.readthedocs.io/zh-cn/latest/',
#     'menu': [{
#         'name': 'GitHub',
#         'url': 'https://github.com/lacacy/DCU'
#     }],
#     'menu_lang': 'cn',
# }

# Directories with custom static files (such as style sheets), relative to
# this directory. They are copied after the builtin static files, so a file
# named "default.css" overwrites the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/readthedocs.css']

# MyST options, currently disabled (``:::`` fences and heading anchors):
# myst_enable_extensions = ['colon_fence']
# myst_heading_anchors = 5

# Custom sidebar templates would map document names to template names; the
# theme defaults are used while this stays disabled.
# html_sidebars = {}

# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for the HTML help builder.
htmlhelp_basename = 'DCU Doc'
# --------------------------------------------------------------------------------------------
# -- Options for LaTeX output ------------------------------------------------
#! latex_elements = {
# # The paper size ('letterpaper' or 'a4paper').
# #
# # 'papersize': 'letterpaper',
# # The font size ('10pt', '11pt' or '12pt').
# #
# # 'pointsize': '10pt',
# # Additional stuff for the LaTeX preamble.
# #
# # 'preamble': '',
# # Latex figure (float) alignment
# #
# # 'figure_align': 'htbp',
# }
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
# !latex_documents = [
# (master_doc, 'DCU.tex', 'DCU Documentation',
# 'DCU Contributors', 'manual'),
# ]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
# !man_pages = [(master_doc, 'DCU', 'DCU Documentation', [author], 1)]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
# !texinfo_documents = [
# (master_doc, 'DCU', 'DCU Documentation', author, 'DCU',
# 'One line description of project.', 'Miscellaneous'),
# ]
# -- Options for EPUB output -------------------------------------------------

# Files that should not be packed into the epub file.
epub_exclude_files = ['search.html']

# URL display mode for the epub builder.
epub_show_urls = 'footnote'

# Bibliographic Dublin Core title; reuses the project name.
epub_title = project

# Unique identifier of the text (an ISBN number or the project homepage)
# and its unique identification; both are left unset.
# epub_identifier = ''
# epub_uid = ''
# --------------------------------------------------------------
# set priority when building html
# StandaloneHTMLBuilder.supported_image_types = [
# 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
# ]
# -- Extension configuration -------------------------------------------------

# sphinx_copybutton: the prompt text is a regular expression with two
# alternatives, the ">>> " prompt and the "... " continuation prompt.
copybutton_prompt_is_regexp = True
copybutton_prompt_text = '|'.join((r'>>> ', r'\.\.\. '))
# def setup(app):
# app.add_config_value('no_underscore_emphasis', False, 'env')
# app.add_config_value('m2r_parse_relative_links', False, 'env')
# app.add_config_value('m2r_anonymous_references', False, 'env')
# app.add_config_value('m2r_disable_inline_math', False, 'env')
# app.add_directive('mdinclude', MdInclude)
# app.add_config_value('recommonmark_config', {
# 'auto_toc_tree_section': 'Contents',
# 'enable_eval_rst': True,
# }, True)
# app.add_transform(AutoStructify)
## 下载中心:
- [**驱动下载地址**](https://cancon.hpccube.com:65024/6/main) → latest 驱动→ rock-xxx-xxx.aio.run
- [**DTK下载地址**](https://cancon.hpccube.com:65024/1/main) → latest → 对应的操作系统 → DTK-version-OS-version-x86_64.tar.gz
- [**工具包地址(DCU直通、Kubernetes插件、HyQual压力测试、工具包文档)**](https://cancon.hpccube.com:65024/5/main)
- [**DAS生态包下载地址**](https://cancon.hpccube.com:65024/4/main/)
- [**光源地址**](https://sourcefind.cn/#/main-page)
\ No newline at end of file
# FAQ-cuda以及hip移植常见问题处理经验
### 问题一、纹理内存报错:
```text
/data/wkx/develop/ppl.cv/src/ppl/cv/cuda/warp.hpp:31:8: error: no template named 'texture'
static texture<float4, cudaTextureType2D,
       ^
```
#### **解决方法:**
**CUDA 的 texture 类型在较新版本(CUDA 12
及以上)中已被弃用或移除**。旧版 CUDA(如 CUDA 10 或 11)中可以使用
texture\<T, \...\> 这种全局变量声明方式,但在 **CUDA 12+
中,这种语法不再支持**,必须改用 **cudaTextureObject\_t +
cudaResourceDesc/cudaTextureDesc** 的方式来创建纹理对象
使用DCU的cuda-11.8编译老旧代码即可顺利通过;
### 问题二、 launch bounds (256) 报错:
Launch params (1024, 1, 1) are larger than launch bounds (256) for
kernel \_ZL12rms\_norm\_f32ILi1024EEvPKfPfif please add launch\_bounds
to kernel define or use \--gpu-max-threads-per-block recompile program !
#### 解决方法:
解决方法1:
1. 所有的核函数 \_\_global\_\_ 替换为 \_\_global\_\_
\_\_launch\_bounds\_\_(1024)
解决方法2:
nvcc或者hip编译增加: \--gpu-max-threads-per-block=1024
### 问题三、asm 代码,内联汇编代码编译报错;
![4XDLESZEAAQE6](media/image1.png){width="5.763888888888889in"
height="1.8840080927384077in"}
#### 解决方法:
内嵌 PTX 功能开启需要主动加"-fnline-asm-ptx"选项。
![LMVLGSZEABAFG](media/image2.png){width="5.763888888888889in"
height="2.996674321959755in"}
### 问题四、 cuda应用不转码适配找不到 math.h 头文件
![CISLKSZEACABE](media/image3.png){width="5.763888888888889in"
height="1.9728018372703413in"}
#### 解决方法:
cmake 编译中增加的 -isystem /usr/include 与 nvcc
编译器同时使用会存在冲突。
开启打印,关注编译过程的 完整头文件、库文件的依赖,去掉 -isystem
/usr/include 即可编译成功。
make VERBOSE=1 \<project\>
### 问题五、使用开源的pycuda 无法编译 cu文件
#### 解决方法:
参考这个,更改下 compiler.py 适配 hip 编译;
[https://ontrack.hygon.cn/browse/CSD-10705](https://ontrack.hygon.cn/browse/CSD-10705)
### 问题六、如何针对一个文件夹的cu代码进行转码
详细可以参考:
![ppt](media/image4.png){width="0.1527777777777778in"
height="0.1527777777777778in"}[DCU应用移植介绍-程顺延](https://www.kdocs.cn/l/cmD2M59DD2vk)
#### 解决方法:
1. hipconvertinplace-perl.sh \<cuda代码文件夹\>
cuda 文件夹下原有的代码,转码后以 org-name.h/cu.prehip
形式存储在当前目录
由于要使用hip编译, 因此所有的 cu 后缀, 修改为 hip 或者 cpp;
### 问题七、hip转码后部分宏定义不规范不会被转换,可能导致出现问题:
#### 解决方法:
- CublasHandleManager.h
```cpp
#if !defined(ROCM_SYMLINK_HIPBLAS_H)
#error hipblas.h must be included at the very top of any file including CublasHandleManager.h
#endif
```

从 `CUBLAS_V2_H_` 更改为 `ROCM_SYMLINK_HIPBLAS_H`
### 问题八、 math\_constants.h 找不到:
#### 解决方法:
DTK的cuda下有 math\_constants.h 会被别的工程依赖;
hip下不存在对应的代码,可以直接拷贝 math\_constants.h 到工程中使用;
math\_constants.h 仅仅是一些数学值的定义;
### 问题九、转码后部分hip核函数不识别 min:
#### 解决方法:
EddyMatrixKernels.cpp 中不支持 min 的问题解决
```cpp
__global__ void QR(// Input
                   const float *K,     // Row-first matrices to decompose
                   unsigned int m,     // Number of rows of K
                   unsigned int n,     // Number of columns of K
                   unsigned int nmat,  // Number of matrices
                   // Output
                   float *Qt,          // nmat mxm Q matrices
                   float *R)           // nmat mxn R matrices
{
  extern __shared__ float scratch[];

  if (blockIdx.x < nmat && threadIdx.x < m) {
    unsigned int id = threadIdx.x;
    // unsigned int ntpm = min(m,blockDim.x); // Number of threads per matrix
    unsigned int ntpm = (m < blockDim.x) ? m : blockDim.x;
    float *v = scratch;
    float *w = &scratch[m];
    const float *lK = &K[blockIdx.x*m*n];
    float *lQt = &Qt[blockIdx.x*m*m];
    float *lR = &R[blockIdx.x*m*n];
    qr_single(lK,m,n,v,w,id,ntpm,lQt,lR);
  }
  return;
}
```
### 问题十、使用 DTK-25.04 之后的软件栈编译报头文件错:
#### 解决方法:
尽量尝试使用 `-std=c++17` 或 `-std=c++14`
### 问题十一、g++ 编译 hipRuntime(hipMalloc、hipMemcpy)等接口代码,编译报错:
#### 解决方法:
编译时增加宏定义,
\_\_HIP\_PLATFORM\_AMD\_\_
链接依赖增加 -l galaxyhip
## 操作系统版本兼容列表
**注意**:
> 使用`iso`镜像安装操作系统时,请勿允许任何操作系统的更新行为, 否则会带来内核版本的升级,导致安装失败; <br>
> 可参考 `构建DCU基础环境` 的 `2.3`、`3.2` 小节中常用操作系统安装步骤里的锁核操作;
**操作系统**:
开发者社区推荐操作系统:
| 操作系统 | 版本 | 内核 |
| ------- | --- | ---- |
| Centos | 7.6 | 3.10.0-957.el7.x86_64 |
| Centos | 8.5 | 4.18.0-348.el8.x86_64 |
| Rocky | 8.6 | 4.18.0-348.el8.x86_64 |
| Rocky | 9.2 | 4.18.0-348.el8.x86_64 |
| Ubuntu | 20.04.1 | 5.4.0-42-generic |
| Ubuntu | 22.04 | 5.15.0-25-generic |
| NFS | 3.2 | 3.10.0-957.nfs.5.x86_64 |
| NFS | 4.0 | 4.19.113-14.1.nfs4.x86_64 |
| NFS | 4.0-Desktop | 5.4.0-49-generic |
| UOS | 1021e | 4.19.90-2109.1.0.0108.up2.uel20.x86_64 |
| Kylin | v10 SP2 | 4.19.90-24.4.v2101.ky10.x86_64 |
| Anolis | 8.4 | 4.19.91-23.4.an8.x86_64 |
| Anolis | 8.6 | 4.19.91-26.an8.x86_64 |
| openEuler | 22.03 | 5.10.0-60.18.0.50.oe2203.x86_64 |
| BCLinux | 8.2 | 4.19.0-240.23.11.el8_2.bclinux.x86_64 |
[操作系统兼容性列表包含(兼容性等级等)](https://docs.qq.com/sheet/DVHdTZHB3RVZOVENI?tab=dklqmf)
---
## 支持的DCU型号
- Z100
- Z100L
- K100
- K100_AI
## DCU软件介绍:
- **DTK**:
> DCU加速卡软件工具包:包括函数库、编译环境、管理工具、性能分析工具等。
- **DAS (DCU AI Software Stack)**:
> 目前主要以python的whl形式在光合开发者社区进行发布。
- 算子层
- 框架层
- 扩展组件层
## **兼容性(必读)**
### 加速卡与DTK的兼容性
| DCU型号 | DTK版本 | 注意 |
| ------- | ------- | ------- |
| Z100 | DTK >=21.04 | 推荐使用 DTK >= 23.10 |
| Z100L | DTK >=21.04 | 推荐使用 DTK >= 23.10 |
| K100 | DTK >=23.10 | |
| K100-AI | DTK >=24.04 | |
### DTK和DAS(AI生态包)兼容性
| DTK版本 | DAS版本 | 注意 |
| ------- | ------- | ------- |
|DTK-24.04.1 | DAS1.1 | 见下述 DAS1.1 使用注意 |
|DTK-24.04 | DAS1.0 | |
**注意:**
> 不兼容的版本可能出现严重的环境问题
---
- DAS1.1 使用注意:
- 不支持 `ubuntu18.04`、`Centos7.6` 等 `glibc <= 2.31` 的操作系统
- glibc 版本查看方式: `ldd --version`
- 如果遇到问题, 建议通过 docker 使用 glibc 高版本的容器系统;
---
欢迎来到 DCU 的环境安装教程!
====================================
资源下载:
-------------
* `驱动下载 <https://cancon.hpccube.com:65024/6/main>`_ → latest 驱动 → rock-xxx-xxx.aio.run
* `DTK下载 <https://cancon.hpccube.com:65024/1/main>`_ → latest → 对应的操作系统 → DTK-version-OS-version-x86_64.tar.gz
* `工具包地址(DCU直通、Kubernetes插件、HyQual压力测试、工具包文档) <https://cancon.hpccube.com:65024/5/main>`_
* `DAS生态包下载 <https://cancon.hpccube.com:65024/4/main/>`_
* `光源地址 <https://sourcefind.cn/#/main-page>`_
文档
-------------
.. toctree::
:maxdepth: 2
:caption: 基础介绍
get_started.md
.. toctree::
:maxdepth: 1
:caption: 资源下载地址
download.md
.. toctree::
:maxdepth: 1
:caption: 构建DCU基础环境
./install_dcu_on_os/base_install_intro.md
./install_dcu_on_os/centos.md
./install_dcu_on_os/ubuntu.md
.. toctree::
:maxdepth: 2
:caption: 快速使用 DCU
Anaconda_Docker.md
.. toctree::
:maxdepth: 2
:caption: hy-smi 使用介绍
Hy-SMI.md
.. toctree::
:maxdepth: 2
:caption: 从NV的GPU迁移到DCU
NV_GPU_TO_DCU.md
.. toctree::
:maxdepth: 2
:caption: CUDA 与 HIP 移植常见问题
faq_cuda_hip.md
索引与表格
==================
* :ref:`genindex`
* :ref:`search`
\ No newline at end of file
## 1. 开发者社区 DCU 环境安装手册
该文档主要针对 DCU 加速卡,提供基础软件环境安装部署以及基础测试的参考指导。
建议参考如下文档进行安装DCU基础环境:
[**点击,进入开发者社区环境搭建文档**](https://cancon.hpccube.com:65024/1/main/latest/Document) → DTK 开发环境安装部署手册.pdf
\ No newline at end of file
## **2 DCU基础环境完整教程-Centos7.6**:
### 2.1 **非root用户安装注意事项:**
- 确保非root用户已加入`video`组,以便能够使用DCU。
```shell
# 对于有sudo权限的非root用户
sudo usermod -aG video $USER
# 对于无sudo权限的用户,由root执行
usermod -aG video <userid>
```
### 2.2. **操作系统设置**:
确保启动项中不包含nomodeset选项,如果内核以nomodeset选项启动,则驱动可能无法成功加载。
- 需要保证系统纯净,命令行运行 `lsmod | grep amdgpu` 为空
- 关闭 selinux(可选)
修改`/etc/selinux/config`,设置`SELINUX=disabled`
- 关闭 firewalld(可选)
```bash
systemctl stop firewalld
systemctl disable firewalld
```
### 2.3. **关闭内核自动更新:**
- 编辑`/etc/yum.conf`,在`[main]`部分添加:
```
exclude=kernel*
exclude=centos-release*
```
### 2.4. **更新yum源:**
- 替换为中科大源,针对CentOS 7.6的示例, (注意使用双引号):
```shell
# 先设置 CentOS 的小版本号;否则双引号中的 $minorver 会被 shell 展开为空字符串
minorver=7.6.1810
sed -e "s|^mirrorlist=|#mirrorlist=|g" -e "s|^#baseurl=http://mirror.centos.org/centos/\$releasever|baseurl=https://mirrors.ustc.edu.cn/centos-vault/$minorver|g" -i.bak /etc/yum.repos.d/CentOS-*.repo
```
- 替换 `CentOS-CR.repo` 并且 `enable`(安装python3需要)
```shell
# $minorver 必须已设置为 CentOS 的小版本号,否则会被 shell 展开为空字符串
minorver=7.6.1810
sed -i "s|^baseurl=http://mirror.centos.org/centos/\$releasever|baseurl=https://mirrors.ustc.edu.cn/centos-vault/$minorver|g;s|enabled=0|enabled=1|g" /etc/yum.repos.d/CentOS-CR.repo
```
- 对 Centos7 配置 SCLo 源(安装devtoolset需要):
编辑 `/etc/yum.repos.d/CentOS-SCLo.repo` 配置文件;
```shell
vi /etc/yum.repos.d/CentOS-SCLo.repo
```
在 `/etc/yum.repos.d/CentOS-SCLo.repo` 写入以下内容, `Esc + :wq` 保存退出
```shell
[centos-sclo-sclo]
name=CentOS-7 - SCLo sclo
baseurl=https://mirrors.ustc.edu.cn/centos/7/sclo/$basearch/sclo/
gpgcheck=0
enabled=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo
[centos-sclo-rh]
name=CentOS-7 - SCLo rh
baseurl=https://mirrors.ustc.edu.cn/centos/7/sclo/$basearch/rh/
gpgcheck=0
enabled=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-SCLo
```
- 对 Centos7 配置 epel 源(安装cmake3需要)
编辑 `/etc/yum.repos.d/epel-7.repo` 配置文件;
```shell
vi /etc/yum.repos.d/epel-7.repo
```
在/etc/yum.repos.d/epel-7.repo写入以下内容, `Esc + :wq` 保存退出;
```shell
[epel]
name=Extra Packages for Enterprise Linux 7 - $basearch
baseurl=http://mirrors.aliyun.com/epel/7/$basearch
failovermethod=priority
enabled=1
gpgcheck=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
[epel-debuginfo]
name=Extra Packages for Enterprise Linux 7 - $basearch – Debug
baseurl=http://mirrors.aliyun.com/epel/7/$basearch/debug
failovermethod=priority
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
gpgcheck=0
[epel-source]
name=Extra Packages for Enterprise Linux 7 - $basearch – Source
baseurl=http://mirrors.aliyun.com/epel/7/SRPMS
failovermethod=priority
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
gpgcheck=0
```
- 更新cache
```shell
yum clean all
yum makecache
```
### 2.5. **安装相关依赖:**
- 联网执行以下命令安装必要的依赖包:
```shell
#安装 DCU 加速卡驱动所需的依赖包命令
yum install -y gcc gcc-c++ rpm-build autoconf kernel-devel-$(uname -r) kernel-headers-$(uname -r)
#安装cmake3
yum install -y cmake3
ln -s /usr/bin/cmake3 /usr/bin/cmake
#安装 DTK 开发环境所需的依赖包命令
yum install -y centos-release-scl
yum install -y gcc gcc-c++ gcc-gfortran elfutils elfutils-devel make rpm-build devtoolset-7
yum install -y libbabeltrace-devel libbabeltrace pciutils-devel libpciaccess-devel
yum install -y numactl-devel elfutils-libelf-devel mesa-libGL-devel
yum install -y epel-release cmake3 pciutils-libs pciutils-devel
yum install -y perl-File-Which perl-File-BaseDir perl-File-Copy-Recursive perl-File-Listing java-1.8.0-openjdk
yum install -y git python python-pip python-devel python-wheel python3 python3-pip python3-devel python3-wheel sqlite-devel libibverbs
yum install -y redhat-lsb-core gettext gettext-devel protobuf
yum install -y perl-Digest perl-Digest-MD5 perl-Data-Dumper vim-common curl libcurl libcurl-devel
yum install -y doxygen graphviz texlive texlive-xtab texlive-multirow texlive-sectsty texlive-tocloft
yum install -y texlive-adjustbox deltarpm tcl automake
```
- 离线安装依赖:
<br>
依赖离线包:rpm_DTK2404_centos7.6_3.10.0-957.tar
<br>
链接:https://pan.baidu.com/s/1jnWfddL4lHWVQb3btD34Iw?pwd=wj6q
<br>
提取码:wj6q
<br>
下载好导入服务器,进行解压:
<br>
1. `vi /etc/yum.repos.d/localyum.repo`,输入如下内容,请根据解压的路径,修改baseurl的内容,下面配置是在root路径下解压的依赖包;
```shell
[local-repo]
name=local-repo
baseurl=file:///root/centos7-dtk24.04
enabled=1
gpgcheck=0
```
2. 开始安装离线依赖包
```shell
sudo yum clean all
#以安装cmake3为例
yum --disablerepo="*" --enablerepo="local-repo" install cmake3
```
### 2.6. **校验系统配置:**
| 设备名称 | 设备码 |
| ----- | ----- |
| Z100L | 1d94:55b7 |
| K100 | 1d94:62b7 |
| K100_AI | 1d94:6210 |
- 查看DCU设备
```shell
# Z100L
root@sugontest79:/mnt#lspci -nn |grep -i 55b7
------------------------------------------------------------------------------------------------------
07:00.0 Display controller [0380]: Chengdu Haiguang IC Design Co., Ltd. ZIFANG [1d94:55b7] (rev 01)
0a:00.0 Display controller [0380]: Chengdu Haiguang IC Design Co., Ltd. ZIFANG [1d94:55b7] (rev 01)
------------------------------------------------------------------------------------------------------
# K100
root@sugontest79:/mnt#lspci -nn |grep -i 62b7
------------------------------------------------------------------------------------------------------
07:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:62b7] (rev 01)
0a:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:62b7] (rev 01)
------------------------------------------------------------------------------------------------------
# K100-AI
root@sugontest79:/mnt#lspci -nn |grep -i 6210
------------------------------------------------------------------------------------------------------
07:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:6210] (rev 01)
0a:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:6210] (rev 01)
------------------------------------------------------------------------------------------------------
```
> 注:输出信息可能和截图不完全一致。
- 配置环境变量
<br>
创建文件 `/etc/profile.d/devtoolset-7.sh`,内容如下:
<br>
`source /opt/rh/devtoolset-7/enable`
<br>
退出当前登录会话重新登录,或者执行source /etc/profile.d/devtoolset-7.sh。
### 2.7. **安装驱动:**
**注意:**
> DTK和rock驱动有对应关系,可参考[dcu-环境安装手册](#DCU环境安装手册),推荐安装最新版本的驱动。<br>
> 安装驱动之前需要安装基础包(cmake、gcc等多种基础依赖包),请先参考`DCU环境安装手册`完成基础环境包的安装。
**驱动下载地址**: [https://cancon.hpccube.com:65024/6/main](https://cancon.hpccube.com:65024/6/main) → latest 驱动→ rock-xxx-xxx.aio.run
1. 安装 DCU 加速卡驱动
```bash
chmod 755 rock-5.7.1-6.2.13-V1.0.1a.aio.run
./rock-5.7.1-6.2.13-V1.0.1a.aio.run
```
2. 如果安装过程更新了`vbios`, 则需要重启机器
```bash
reboot
```
3. 查看验证是否安装成功
```bash
# 出现如下类似结果则安装成功
[root@b04r3n02 ~]# lsmod | grep hydcu
hydcu 1435342 0
hydcu_sched 34432 1 hydcu
hyttm 61919 1 hydcu
hykcl 46567 3 hydcu_sched,hydcu,hyttm
hy_extra 32140 3 hydcu_sched,hydcu,hykcl
amd_iommu_v2 18821 1 hydcu
drm_kms_helper 179394 3 ast,hydcu,hykcl
drm 429744 8 ast,ttm,hydcu,hykcl,hyttm,drm_kms_helper
```
4. 卸载驱动步骤:
<br>
如遇到异常情况或需要更新版本,先执行驱动卸载。
<br>
```shell
rpm -qa | grep rock #查询安装的驱动版本
rmmod hydcu
rpm -e <上一步查询到的rock包名> #例如 rock-5.7.1-6.2.18-1.x86_64,以实际查询结果为准
```
### 2.8. **安装DTK:**
**DTK下载地址**: [https://cancon.hpccube.com:65024/1/main](https://cancon.hpccube.com:65024/1/main) → latest → 对应的操作系统 → DTK-version-OS-version-x86_64.tar.gz
1. 安装
```bash
# 解压安装
tar xvf DTK-24.04.1-CentOS7.6-x86_64.tar.gz -C /opt
# 创建软连接
ln -s /opt/dtk-24.04.1 /opt/dtk
```
3. 设置 DTK 环境变量
> DTK 压缩文件中提供了设置环境变量脚本 env.sh。可以通过 source /opt/dtk/env.sh 的方式临时加载环境变量。为避免多次配置,常用以下方式加载环境变量
```bash
echo "source /opt/dtk/env.sh">> ~/.bashrc
# 激活环境变量
source ~/.bashrc
```
4. 验证 DCU 环境
```bash
# 查看并执行 hy-smi 或者 rocm-smi 指令查看 dcu 基本信息
[root@h01r4n04~]# rocm-smi
# 出现如下内容, 则安装成功
===================System Management Interface =================
==========================================================
DCU Temp AvgPwr Fan Perf PwrCap VRAM% DCU%
0 50.0c 55.0W 0.0% auto 450.0W 0% 0%
1 50.0c 58.0W 0.0% auto 450.0W 0% 0%
2 49.0c 58.0W 0.0% auto 450.0W 0% 0%
3 49.0c 55.0W 0.0% auto 450.0W 0% 0%
==========================================================
======================End of SMI Log========================
```
### 2.9. **验证安装结果:**
1. 使用`rocminfo`命令检查ROCm系统状态
<br>
终端输入如下内容:
<br>
```shell
rocminfo | grep gfx
# 其中Z100/Z100L为gfx906,K100为gfx926,K100_AI为gfx928; 有输出即说明驱动和DTK安装成功
------------------------------------------------------------------------------------------------------------
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
```
<br>
2. 运行 `hy-smi` 或 `rocm-smi` 来监控DCU的状态和性能指标;
```shell
# 查看并执行 hy-smi 或者 rocm-smi 指令查看 dcu 基本信息
[root@h01r4n04~]# rocm-smi
# 出现如下内容, 则安装成功
============================ System Management Interface =============================
======================================================================================
DCU Temp AvgPwr Perf PwrCap VRAM% DCU% Mode
0 42.0C 39.0W auto 280.0W 0% 0% Normal
1 41.0C 39.0W auto 280.0W 0% 0% Normal
2 41.0C 36.0W auto 280.0W 0% 0% Normal
3 40.0C 38.0W auto 280.0W 0% 0% Normal
4 40.0C 39.0W auto 280.0W 0% 0% Normal
5 41.0C 41.0W auto 280.0W 0% 0% Normal
6 42.0C 37.0W auto 280.0W 0% 0% Normal
7 41.0C 36.0W auto 280.0W 0% 0% Normal
======================================================================================
=================================== End of SMI Log ===================================
```
## **3 DCU基础环境完整教程-ubuntu**:
### 3.1. **安装系统过程中注意:**
> Ubuntu20.04.1(5.4.0-42-generic)和Ubuntu22.04.1(5.15.0-25-generic)已在 DCU 上进行全量验证,本文以Ubuntu20.04.1(5.4.0-42-generic)安装部署为例,如果是其他版本或者kernel不一致可能导致驱动无效
1. 安装时不要连接网络(直接断掉网线或者在安装系统时将网络disable),否则会自动升级内核(<font color="red">即使选择了不更新操作</font>)。
安装系统时将网络disable;
![disable_net](../imgs/ubuntu/disable_net.png)
2. 安装过程中, 涉及到更新的选项都选择不更新;
### 3.2. **安装完成系统后的配置**
1. 安装完系统后确认内核版本是否符合兼容性要求;如果不符合(例如下面输出的 `5.4.0-173-generic` 不在兼容列表中),请按后面的建议逐项确认:
```shell
root@test79:/mnt# uname -r
5.4.0-173-generic
```
建议确认
- 使用的操作系统版本是否在兼容性列表;
- 是否在安装操作系统中禁用网络,选择不更新的相关配置;
2. 确保非root用户已加入`video`组,以便能够使用DCU;
- 确保非root用户已加入`video`组,以便能够使用DCU, shell 命令修改;
```shell
# 对于有sudo权限的非root用户
sudo usermod -aG video $USER
# 对于无sudo权限的用户,由root执行
usermod -aG video <userid>
```
- 或者修改 `/etc/group` 文件,将用户(图中示例为 test 用户)添加到对应的组;
![disable_net](../imgs/ubuntu/render.png)
退出重新登录。
3. 关闭内核自动更新:
- 查看安装内核:
```shell
dpkg --list | grep linux-image
dpkg --list | grep linux-headers
dpkg --list | grep linux-modules
# 输出结果:-----------------------------------------------------------------------------------------------------------
ii linux-image-5.4.0-173-generic 5.4.0-173.191 amd64 Signed kernel image generic
hi linux-image-5.4.0-42-generic 5.4.0-42.46 amd64 Signed kernel image generic
ii linux-image-generic 5.4.0.173.171 amd64 Generic Linux kernel image
```
- 禁止内核更新方法1:
```shell
sudo vi /etc/apt/apt.conf.d/10periodic
sudo vi /etc/apt/apt.conf.d/20auto-upgrades
# 后面部分全部改成 “0”
# 修改后内容 ----------------------------------------
# 10periodic
APT::Periodic::Update-Package-Lists "0";
APT::Periodic::Download-Upgradeable-Packages "0";
APT::Periodic::AutocleanInterval "0";
# 20auto-upgrades
APT::Periodic::Update-Package-Lists "0";
APT::Periodic::Unattended-Upgrade "0";
```
- 禁止内核更新方法2:
直接使用hold参数,固定内核版本:
```shell
sudo apt-mark hold linux-image-5.4.0-42-generic
sudo apt-mark hold linux-headers-5.4.0-42-generic
sudo apt-mark hold linux-modules-extra-5.4.0-42-generic
```
查询 Ubuntu系统被锁定不更新的软件包状态(hold)
```shell
sudo dpkg --get-selections | grep hold
```
### 3.3. **更新国内软件下载源:**
1. 备份源文件:
```shell
sudo cp /etc/apt/sources.list /etc/apt/sources.list.bak
```
2. 修改源文件sources.list: 将原文件内容全部注释或删掉,添加以下内容;
```shell
# 当前使用阿里源, 如果想使用其他源, 可以自行在网上搜索其他源的配置文件,并替换掉以下内容;
# 注意换源的时候需要和操作系统的版本号作匹配
deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
```
如果是ubuntu-22.04可以直接使用如下方式, 更换为华为源;
```shell
# 修改为华为源
sudo sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list
sudo sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list
```
3. 更新:
```shell
sudo apt-get update
```
### 3.4. **安装相关依赖:**
- 联网执行以下命令安装必要的依赖包:
```shell
# 安装 DCU 加速卡驱动所需的依赖包命令
sudo apt-get install -y cmake gcc autoconf linux-kernel-headers kernel-package automake linux-modules-extra-`uname -r` linux-image-`uname -r` linux-headers-`uname -r`
# 安装 DTK 开发环境所需的依赖包命令
sudo apt-get install -y make gcc g++ cmake git wget gfortran elfutils libdrm-dev
sudo apt-get install -y kmod libtinfo5 sqlite3 libsqlite3-dev libelf-dev libibverbs1 libgtk2.0-0
sudo apt-get install -y libnuma-dev libgl1-mesa-dev rpm rsync mesa-common-dev apt-utils
sudo apt-get install -y cmake libpci-dev pciutils libpciaccess-dev libbabeltrace-dev pkg-config
sudo apt-get install -y libfile-which-perl libfile-basedir-perl libfile-copy-recursive-perl libfile-listing-perl
sudo apt-get install -y python3 python3-pip python3-dev python3-wheel
sudo apt-get install -y gettext gettext-base libprotobuf-dev tcl
sudo apt-get install -y libio-digest-perl libdigest-md5-file-perl libdata-dumper-simple-perl vim curl libcurlpp-dev
sudo apt-get install -y doxygen graphviz texlive libncurses5 msgpack*
sudo apt install mlocate
```
- 离线安装依赖:
<br>
依赖离线包:deb_DTK2404_Ubuntu20.04.1_5.4.0-42-generic.tar
<br>
链接:https://pan.baidu.com/s/1jnWfddL4lHWVQb3btD34Iw?pwd=wj6q
<br>
提取码:wj6q
<br>
下载好导入服务器,进行解压:
<br>
1. `vi /etc/apt/sources.list.d/myrepo.list`,输入如下内容,注意修改解压的路径,下面配置是在/data路径下解压的;
```shell
vi /etc/apt/sources.list.d/myrepo.list
#输入如下内容,注意修改解压的路径,下面配置是在/data路径下解压的
deb [trusted=yes] file:///data/my-debian-packages ./
#保存退出即可
chmod +r /data
chown -R _apt:root /data/my-debian-packages
chown -R man:root /var/cache/man
#更新系统的apt缓存,使其能够识别并使用新添加的本地私有源
apt update
apt-get install udev
apt install mlocate
```
更新了之后,就可以通过`apt install`安装依赖包了。
### 3.5. **校验系统配置:**
| 设备名称 | 设备码 |
| ----- | ----- |
| Z100L | 1d94:55b7 |
| K100 | 1d94:62b7 |
| K100_AI | 1d94:6210 |
- 查看DCU设备
```shell
# Z100L
root@sugontest79:/mnt#lspci -nn |grep -i 55b7
------------------------------------------------------------------------------------------------------
07:00.0 Display controller [0380]: Chengdu Haiguang IC Design Co., Ltd. ZIFANG [1d94:55b7] (rev 01)
0a:00.0 Display controller [0380]: Chengdu Haiguang IC Design Co., Ltd. ZIFANG [1d94:55b7] (rev 01)
------------------------------------------------------------------------------------------------------
# K100
root@sugontest79:/mnt#lspci -nn |grep -i 62b7
------------------------------------------------------------------------------------------------------
07:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:62b7] (rev 01)
0a:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:62b7] (rev 01)
------------------------------------------------------------------------------------------------------
# K100_AI
root@sugontest79:/mnt#lspci -nn |grep -i 6210
------------------------------------------------------------------------------------------------------
07:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:6210] (rev 01)
0a:00.0 Co-processor [0b40]: Chengdu Haiguang IC Design Co., Ltd. KONGMING [1d94:6210] (rev 01)
------------------------------------------------------------------------------------------------------
```
> 注:实际输出信息可能与上述示例不完全一致。
### 3.6. **安装驱动:**
**注意:**
> - DTK和rock驱动有对应关系,可参考[dcu-环境安装手册](#DCU环境安装手册),推荐安装最新版本。
> - 安装驱动之前需要安装基础包,包括cmake、gcc等多种基础依赖包,请先参考`DCU环境安装手册`完成基础环境包的安装。
**驱动下载地址**: [https://cancon.hpccube.com:65024/6/main](https://cancon.hpccube.com:65024/6/main) → latest 驱动→ rock-xxx-xxx.aio.run
1. 安装 DCU 加速卡驱动
```bash
chmod 755 rock-5.7.1-6.2.13-V1.0.1a.aio.run
./rock-5.7.1-6.2.13-V1.0.1a.aio.run
```
2. 如果安装过程更新了`vbios`, 则需要重启机器
```bash
reboot
```
3. 查看验证是否安装成功
```bash
# 出现如下类似结果则安装成功
[root@b04r3n02 ~]# lsmod | grep hydcu
hydcu 1435342 0
hydcu_sched 34432 1 hydcu
hyttm 61919 1 hydcu
hykcl 46567 3 hydcu_sched,hydcu,hyttm
hy_extra 32140 3 hydcu_sched,hydcu,hykcl
amd_iommu_v2 18821 1 hydcu
drm_kms_helper 179394 3 ast,hydcu,hykcl
drm 429744 8 ast,ttm,hydcu,hykcl,hyttm,drm_kms_helper
```
4. 卸载驱动步骤:
<br>
如遇到异常情况或需要更新版本,先执行驱动卸载。
<br>
```shell
rpm -qa | grep rock #查询安装的驱动版本
rmmod hydcu
rpm -e rock-5.7.1-6.2.18-1.x86_64 #卸载驱动,包名以上面查询到的实际版本为准
```
### 3.7. **安装DTK:**
**DTK下载地址**: [https://cancon.hpccube.com:65024/1/main](https://cancon.hpccube.com:65024/1/main) → latest → 对应的操作系统 → DTK-version-OS-version-x86_64.tar.gz
1. 安装
```bash
# 解压安装
tar xvf DTK-24.04.1-CentOS7.6-x86_64.tar.gz -C /opt
# 创建软连接
ln -s /opt/dtk-24.04.1 /opt/dtk
```
2. 设置 DTK 环境变量
> DTK 压缩文件中提供了设置环境变量脚本 env.sh。可以通过 source /opt/dtk/env.sh 的方式临时加载环境变量。为避免多次配置,常用以下方式加载环境变量:
```bash
echo "source /opt/dtk/env.sh">> ~/.bashrc
# 激活环境变量
source ~/.bashrc
```
3. 验证 DCU 环境
```bash
# 执行 hy-smi 或者 rocm-smi 指令查看 DCU 基本信息
[root@h01r4n04~]# rocm-smi
# 出现如下内容, 则安装成功
===================System Management Interface =================
==========================================================
DCU Temp AvgPwr Fan Perf PwrCap VRAM% DCU%
0 50.0c 55.0W 0.0% auto 450.0W 0% 0%
1 50.0c 58.0W 0.0% auto 450.0W 0% 0%
2 49.0c 58.0W 0.0% auto 450.0W 0% 0%
3 49.0c 55.0W 0.0% auto 450.0W 0% 0%
==========================================================
======================End of SMI Log========================
```
### 3.8. **验证安装结果:**
1. 使用`rocminfo`命令检查ROCm系统状态
<br>
终端输入如下内容:
<br>
```shell
rocminfo | grep gfx
# 其中Z100/Z100L为gfx906,K100为gfx926,K100_AI为gfx928; 有输出即说明驱动和DTK安装成功
------------------------------------------------------------------------------------------------------------
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-
```
<br>
2. 运行`hy-smi`或`rocm-smi`来监控DCU的状态和性能指标;
```shell
# 执行 hy-smi 或者 rocm-smi 指令查看 DCU 基本信息
[root@h01r4n04~]# rocm-smi
# 出现如下内容, 则安装成功
============================ System Management Interface =============================
======================================================================================
DCU Temp AvgPwr Perf PwrCap VRAM% DCU% Mode
0 42.0C 39.0W auto 280.0W 0% 0% Normal
1 41.0C 39.0W auto 280.0W 0% 0% Normal
2 41.0C 36.0W auto 280.0W 0% 0% Normal
3 40.0C 38.0W auto 280.0W 0% 0% Normal
4 40.0C 39.0W auto 280.0W 0% 0% Normal
5 41.0C 41.0W auto 280.0W 0% 0% Normal
6 42.0C 37.0W auto 280.0W 0% 0% Normal
7 41.0C 36.0W auto 280.0W 0% 0% Normal
======================================================================================
=================================== End of SMI Log ===================================
```
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"
[project]
name = "lumache"
authors = [{name = "DCU user", email = "wkx1025@foxmail.com"}]
dynamic = ["version", "description"]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
# Version string of the package.
__version__ = '0.2.0'
# Alias bound to the same string as __version__.
short_version = __version__
def parse_version_info(version_str: str) -> Tuple:
    """Parse a version string into a tuple of its components.

    Each dot-separated segment is handled as follows:

    * an all-digit segment becomes an ``int``;
    * a segment containing ``rc`` (e.g. ``'0rc1'``) contributes the integer
      found before ``rc`` (if any) followed by the string ``'rc<suffix>'``;
    * any other segment is ignored.

    Args:
        version_str (str): A version string such as ``'0.2.0'`` or
            ``'1.0.0rc1'``.

    Returns:
        tuple: Integers and ``'rc...'`` strings, in order of appearance.

    Raises:
        ValueError: If the text before ``rc`` is non-empty and is not a
            valid integer literal.
    """
    components = []
    for segment in version_str.split('.'):
        if segment.isdigit():
            components.append(int(segment))
        elif 'rc' in segment:
            pieces = segment.split('rc')
            # A dot-separated pre-release tag such as '1.0.rc1' leaves nothing
            # before 'rc'; skip the number instead of failing on int('').
            if pieces[0]:
                components.append(int(pieces[0]))
            components.append(f'rc{pieces[1]}')
    return tuple(components)
# Parsed form of __version__, computed when the module is loaded.
version_info = parse_version_info(__version__)
\ No newline at end of file
my-docs @ 3612c11e
Subproject commit 3612c11ea62b09e1ffd3387b7cc4649a8c0f6df2
# Welcome to MkDocs
For full documentation visit [mkdocs.org](https://www.mkdocs.org).
## Commands
* `mkdocs new [dir-name]` - Create a new project.
* `mkdocs serve` - Start the live-reloading docs server.
* `mkdocs build` - Build the documentation site.
* `mkdocs -h` - Print help message and exit.
## Project layout
mkdocs.yml # The configuration file.
docs/
index.md # The documentation homepage.
... # Other markdown pages, images and other files.
mkdocs
mkdocs-material
mike
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment