Commit 3fa19ffc authored by aiss's avatar aiss
Browse files

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!3
parents c25a91b6 25d5540b
This diff is collapsed.
# DeepSpeed
## 安装
DeepSpeed 支持
+ Python 3.7.
+ Python 3.8.
+ Python 3.9.
### 使用pip安装
DeepSpeed whl包下载目录:[https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04](https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04)
根据对应的pytorch版本和python版本,下载对应deepspeed的whl包
```shell
pip install deepspeed*  # 下载的deepspeed的whl包
```
### 使用源码安装
编译之前,需要先安装对应版本的python,安装相应的第三方包依赖项,并配置DTK环境变量(以CentOS 7.x为例)。
pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04)
根据python版本,下载对应pytorch的whl包。如果是基于pytorch1.13,需要注释掉op_builder/builder.py中第659行附近的如下代码:
```bash
#sources[i] = str(src.relative_to(curr_file))
```
安装依赖项:
```bash
# 安装三方包的源
yum install epel-release -y
# 安装相关依赖项
yum install libffi-devel -y
yum -y install openssl openssl-devel
yum install -y libaio-devel
yum install -y libaio
# 配置libiomp5.so库:可以复用系统下现有的so库或者自行安装,并指定该动态库的位置。例如export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH
# 若python内未包含相关项,需基于上面安装的三方包重新源码编译python,再配置python环境
python3 -m pip install --upgrade pip setuptools
pip3 install wheel -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install ninja -i https://pypi.tuna.tsinghua.edu.cn/simple
```
下载DTK并配置环境变量:
```bash
# DTK tar包下载目录:光合社区/资源工具/DCU Toolkit/DTK23.04(https://cancon.hpccube.com:65024/1/main/DTK-23.04),根据系统选择对应DTK的tar包,并解压至/opt目录。
# 如果使用的是dtk23.04之前的版本,可以参考以下图片的方式修改torch中的hipify文件
export ROCM_PATH=/opt/dtk-23.04
source /opt/dtk-23.04/env.sh
```
![logo](hipify_20230511113250.png)
编译deepspeed
```bash
# 下载源码
git clone -b ds-v0.9.2-rocm http://developer.hpccube.com/codes/aicomponent/deepspeed.git
cd deepspeed
sh requirements/run_pip.sh
DS_BUILD_STRING=.dtk22.10.1.torch1.10 DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel
```
安装deepspeed
```bash
# deepspeed的whl包会在dist文件夹生成
pip3 install ./dist/deepspeed*
```
## Note
+ 若使用 pip install 下载安装过慢,可添加国内源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
+ deepspeed共设置两种版本号查询方式:`__version__` 和 `__dcu_version__`,分别标识主版本号(与官网版本一致)和基于dcu适配的内部版本号。例如:
```bash
#编译后的whl包
[root@26388537c721 deepspeed-v0.9.2-release]# ls dist/
deepspeed-0.9.2+8cfd4af.dtk22.10.1.torch1.10-cp37-cp37m-linux_x86_64.whl
deepspeed-0.9.2+8cfd4af.dtk22.10.1.torch1.10-py3.7-linux-x86_64.egg
#查询deepspeed主版本号
[root@26388537c721 deepspeed-v0.9.2-release]# python3 -c "import deepspeed as ds; print(ds.__version__)"
0.9.2
#查询deepspeed基于dcu的内部版本号
[root@26388537c721 deepspeed-v0.9.2-release]# python3 -c "import deepspeed as ds; print(ds.__dcu_version__)"
0.9.2+8cfd4af.dtk22.10.1.torch1.10
```
This diff is collapsed.
...@@ -36,7 +36,7 @@ from .runtime import DeepSpeedOptimizer, ZeROOptimizer ...@@ -36,7 +36,7 @@ from .runtime import DeepSpeedOptimizer, ZeROOptimizer
from .pipe import PipelineModule from .pipe import PipelineModule
from .git_version_info import version, git_hash, git_branch from .git_version_info import version, dcu_version, git_hash, git_branch
def _parse_version(version_str): def _parse_version(version_str):
...@@ -47,6 +47,8 @@ def _parse_version(version_str): ...@@ -47,6 +47,8 @@ def _parse_version(version_str):
# Export version information # Export version information
__version__ = version __version__ = version
#aiss
__dcu_version__ = dcu_version
__version_major__, __version_minor__, __version_patch__ = _parse_version(__version__) __version_major__, __version_minor__, __version_patch__ = _parse_version(__version__)
__git_hash__ = git_hash __git_hash__ = git_hash
__git_branch__ = git_branch __git_branch__ = git_branch
......
...@@ -15,6 +15,7 @@ except ModuleNotFoundError: ...@@ -15,6 +15,7 @@ except ModuleNotFoundError:
version = "0.0.0" version = "0.0.0"
git_hash = '[none]' git_hash = '[none]'
git_branch = '[none]' git_branch = '[none]'
dcu_version = '[none]'
from .ops.op_builder.all_ops import ALL_OPS from .ops.op_builder.all_ops import ALL_OPS
installed_ops = dict.fromkeys(ALL_OPS.keys(), False) installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
......
...@@ -29,7 +29,8 @@ class FusedAdamBuilder(CUDAOpBuilder): ...@@ -29,7 +29,8 @@ class FusedAdamBuilder(CUDAOpBuilder):
return args + self.version_dependent_macros() return args + self.version_dependent_macros()
def nvcc_args(self): def nvcc_args(self):
nvcc_flags = ['-O3'] + self.version_dependent_macros() nvcc_flags = super().nvcc_args()
nvcc_flags += ['-O3'] + self.version_dependent_macros()
if not self.is_rocm_pytorch(): if not self.is_rocm_pytorch():
nvcc_flags.extend( nvcc_flags.extend(
['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
......
...@@ -29,7 +29,8 @@ class FusedLambBuilder(CUDAOpBuilder): ...@@ -29,7 +29,8 @@ class FusedLambBuilder(CUDAOpBuilder):
return args + self.version_dependent_macros() return args + self.version_dependent_macros()
def nvcc_args(self): def nvcc_args(self):
nvcc_flags = ['-O3'] + self.version_dependent_macros() nvcc_flags = super().nvcc_args()
nvcc_flags += ['-O3'] + self.version_dependent_macros()
if self.is_rocm_pytorch(): if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
......
#!/bin/bash #!/bin/bash
export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH
export DS_BUILD_VERSION=dtk22.10.1 #export DS_BUILD_VERSION=dtk22.10.1
export LD_LIBRARY_PATH=/usr/local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH
DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel
...@@ -114,6 +114,7 @@ if torch_available and not torch.cuda.is_available(): ...@@ -114,6 +114,7 @@ if torch_available and not torch.cuda.is_available():
print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing " "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
"(compute capabilities 6.0, 6.1, 6.2)") "(compute capabilities 6.0, 6.1, 6.2)")
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities() os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities()
...@@ -179,16 +180,13 @@ print(f'Install Ops={install_ops}') ...@@ -179,16 +180,13 @@ print(f'Install Ops={install_ops}')
# Write out version/git info. # Write out version/git info.
git_hash_cmd = "git rev-parse --short HEAD" git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD" git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: #if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
if command_exists('git'):
try: try:
result = subprocess.check_output(git_hash_cmd, shell=True) result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip() git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True) result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip() git_branch = result.decode('utf-8').strip()
#add dtk version
if os.getenv('DS_BUILD_VERSION'):
version_dtk = os.getenv('DS_BUILD_VERSION', "")
git_hash += "." + version_dtk
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
git_hash = "unknown" git_hash = "unknown"
...@@ -216,24 +214,22 @@ if sys.platform == "win32": ...@@ -216,24 +214,22 @@ if sys.platform == "win32":
# Parse the DeepSpeed version string from version.txt. # Parse the DeepSpeed version string from version.txt.
version_str = open('version.txt', 'r').read().strip() version_str = open('version.txt', 'r').read().strip()
dcu_version_str = version_str
abi_version = subprocess.getoutput("echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI >tmp.txt")
abi_version = 'abi' + subprocess.getoutput("awk '{print $3}' tmp.txt")
# print(abi_version)
# Build specifiers like .devX can be added at install time. Otherwise, add the git hash. dtk_version_path = subprocess.getoutput("echo $DTKROOT")
# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.
with open("{0}/.info/rocm_version".format(dtk_version_path), 'r') as fd:
# Building wheel for distribution, update version file. dtk_version_str = fd.read().split('.')[:2]
if 'DS_BUILD_STRING' in os.environ: dtk_version_str ='dtk' + "".join(dtk_version_str)
# Build string env specified, probably building for distribution.
with open('build.txt', 'w') as fd: torch_version = "torch"+ subprocess.getoutput("python3 -c \"import torch;print(torch.__version__)\"")
fd.write(os.environ.get('DS_BUILD_STRING')) # print(torch_version)
version_str += os.environ.get('DS_BUILD_STRING')
elif os.path.isfile('build.txt'):
# build.txt exists, probably installing from distribution.
with open('build.txt', 'r') as fd:
version_str += fd.read().strip()
else:
# None of the above, probably installing from source.
version_str += f'+{git_hash}'
dcu_version_str += f'+git{git_hash}' + '.' + abi_version + '.'+ dtk_version_str + '.' + torch_version
print("dcu_version_str=", dcu_version_str)
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
bf16_support = False bf16_support = False
# Set cuda_version to 0.0 if cpu-only. # Set cuda_version to 0.0 if cpu-only.
...@@ -261,9 +257,10 @@ torch_info = { ...@@ -261,9 +257,10 @@ torch_info = {
"hip_version": hip_version "hip_version": hip_version
} }
print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") print(f"version={version_str}, dcu_version={dcu_version_str}, git_hash={git_hash}, git_branch={git_branch}")
with open('deepspeed/git_version_info_installed.py', 'w') as fd: with open('deepspeed/git_version_info_installed.py', 'w') as fd:
fd.write(f"version='{version_str}'\n") fd.write(f"version='{version_str}'\n")
fd.write(f"dcu_version='{dcu_version_str}'\n")
fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_hash='{git_hash}'\n")
fd.write(f"git_branch='{git_branch}'\n") fd.write(f"git_branch='{git_branch}'\n")
fd.write(f"installed_ops={install_ops}\n") fd.write(f"installed_ops={install_ops}\n")
...@@ -282,7 +279,7 @@ with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin: ...@@ -282,7 +279,7 @@ with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin:
start_time = time.time() start_time = time.time()
setup(name='deepspeed', setup(name='deepspeed',
version=version_str, version=dcu_version_str,
description='DeepSpeed library', description='DeepSpeed library',
long_description=readme_text, long_description=readme_text,
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment